Mirror of https://github.com/neondatabase/neon.git (synced 2026-01-24 13:50:37 +00:00)

Compare commits: al/support ... neon_basic (53 commits)
Commit SHA1s:
ec672723fa, b9aa38358f, d4b64b9ef7, b6f5c395cb, b9a7a661d0, 48ce95533c, 874c31976e, 231d7a7616,
5705413d90, 35370f967f, b98419ee56, 86a61b318b, 5f8fd640bf, 916a5871a6, 700d929529, 520046f5bd,
2ebd2ce2b6, bcc2aee704, 6d023484ed, 062159ac17, f2e2b8a7f4, f9214771b4, 77a68326c5, a25504deae,
294b8a8fde, 407a20ceae, e5b7ddfeee, 7feb0d1a80, 457e3a3ebc, 25d2f4b669, 1685593f38, 8d0f4a7857,
3fc3666df7, 89746a48c6, 8d27a9c54e, d98cb39978, 27c73c8740, 9e871318a0, e1061879aa, f09e82270e,
d4a5fd5258, 921bb86909, 1e7db5458f, b4d36f572d, 762a8a7bb5, 2e8a3afab1, 4580f5085a, e074ccf170,
196943c78f, 149dd36b6b, be271e3edf, 7c85c7ea91, 1066bca5e3
@@ -18,8 +18,8 @@
!trace/
!vendor/postgres-v14/
!vendor/postgres-v15/
!vendor/postgres-v16/
!workspace_hack/
!neon_local/
!scripts/ninstall.sh
!scripts/combine_control_files.py
!vm-cgconfig.conf
.github/actions/run-python-test-set/action.yml (vendored, 10 lines changed)

@@ -150,6 +150,14 @@ runs:
EXTRA_PARAMS="--flaky-tests-json $TEST_OUTPUT/flaky.json $EXTRA_PARAMS"
fi

# We use pytest-split plugin to run benchmarks in parallel on different CI runners
if [ "${TEST_SELECTION}" = "test_runner/performance" ] && [ "${{ inputs.build_type }}" != "remote" ]; then
mkdir -p $TEST_OUTPUT
poetry run ./scripts/benchmark_durations.py "${TEST_RESULT_CONNSTR}" --days 10 --output "$TEST_OUTPUT/benchmark_durations.json"

EXTRA_PARAMS="--durations-path $TEST_OUTPUT/benchmark_durations.json $EXTRA_PARAMS"
fi

if [[ "${{ inputs.build_type }}" == "debug" ]]; then
cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run)
elif [[ "${{ inputs.build_type }}" == "release" ]]; then

@@ -201,4 +209,4 @@ runs:
uses: ./.github/actions/allure-report-store
with:
report-dir: /tmp/test_output/allure/results
unique-key: ${{ inputs.build_type }}
unique-key: ${{ inputs.build_type }}-${{ inputs.pg_version }}
.github/workflows/build_and_test.yml (vendored, 50 lines changed)

@@ -396,13 +396,11 @@ jobs:
strategy:
fail-fast: false
matrix:
pytest_split_group: [ 1, 2, 3, 4 ]
build_type: [ release ]
steps:
- name: Checkout
uses: actions/checkout@v3
with:
submodules: true
fetch-depth: 1

- name: Pytest benchmarks
uses: ./.github/actions/run-python-test-set

@@ -411,9 +409,11 @@ jobs:
test_selection: performance
run_in_parallel: false
save_perf_report: ${{ github.ref_name == 'main' }}
extra_params: --splits ${{ strategy.job-total }} --group ${{ matrix.pytest_split_group }}
env:
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR }}"
# XXX: no coverage data handling here, since benchmarks are run on release builds,
# while coverage is currently collected for the debug ones

@@ -955,22 +955,15 @@ jobs:
version: [ v14, v15 ]

env:
# While on transition period we extract public extensions from compute-node image and custom extensions from extensions image.
# Later all the extensions will be moved to extensions image.
EXTENSIONS_IMAGE: ${{ github.ref_name == 'release' && '093970136003' || '369495373322'}}.dkr.ecr.eu-central-1.amazonaws.com/extensions-${{ matrix.version }}:latest
COMPUTE_NODE_IMAGE: ${{ github.ref_name == 'release' && '093970136003' || '369495373322'}}.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:latest
EXTENSIONS_IMAGE: ${{ github.ref_name == 'release' && '093970136003' || '369495373322'}}.dkr.ecr.eu-central-1.amazonaws.com/extensions-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}
AWS_ACCESS_KEY_ID: ${{ github.ref_name == 'release' && secrets.AWS_ACCESS_KEY_PROD || secrets.AWS_ACCESS_KEY_DEV }}
AWS_SECRET_ACCESS_KEY: ${{ github.ref_name == 'release' && secrets.AWS_SECRET_KEY_PROD || secrets.AWS_SECRET_KEY_DEV }}
S3_BUCKETS: |
${{ github.ref_name == 'release' &&
'neon-prod-extensions-ap-southeast-1 neon-prod-extensions-eu-central-1 neon-prod-extensions-us-east-1 neon-prod-extensions-us-east-2 neon-prod-extensions-us-west-2' ||
'neon-dev-extensions-eu-central-1 neon-dev-extensions-eu-west-1 neon-dev-extensions-us-east-2' }}
S3_BUCKETS: ${{ github.ref_name == 'release' && vars.S3_EXTENSIONS_BUCKETS_PROD || vars.S3_EXTENSIONS_BUCKETS_DEV }}

steps:
- name: Pull postgres-extensions image
run: |
docker pull ${EXTENSIONS_IMAGE}
docker pull ${COMPUTE_NODE_IMAGE}

- name: Create postgres-extensions container
id: create-container

@@ -978,44 +971,23 @@ jobs:
EID=$(docker create ${EXTENSIONS_IMAGE} true)
echo "EID=${EID}" >> $GITHUB_OUTPUT

CID=$(docker create ${COMPUTE_NODE_IMAGE} true)
echo "CID=${CID}" >> $GITHUB_OUTPUT

- name: Extract postgres-extensions from container
run: |
rm -rf ./extensions-to-upload ./custom-extensions # Just in case
rm -rf ./extensions-to-upload # Just in case
mkdir -p extensions-to-upload

# In compute image we have a bit different directory layout
mkdir -p extensions-to-upload/share
docker cp ${{ steps.create-container.outputs.CID }}:/usr/local/share/extension ./extensions-to-upload/share/extension
docker cp ${{ steps.create-container.outputs.CID }}:/usr/local/lib ./extensions-to-upload/lib

# Delete Neon extensitons (they always present on compute-node image)
rm -rf ./extensions-to-upload/share/extension/neon*
rm -rf ./extensions-to-upload/lib/neon*

# Delete leftovers from the extension build step
rm -rf ./extensions-to-upload/lib/pgxs
rm -rf ./extensions-to-upload/lib/pkgconfig

docker cp ${{ steps.create-container.outputs.EID }}:/extensions ./custom-extensions
for EXT_NAME in $(ls ./custom-extensions); do
mkdir -p ./extensions-to-upload/${EXT_NAME}/share

mv ./custom-extensions/${EXT_NAME}/share/extension ./extensions-to-upload/${EXT_NAME}/share/extension
mv ./custom-extensions/${EXT_NAME}/lib ./extensions-to-upload/${EXT_NAME}/lib
done
docker cp ${{ steps.create-container.outputs.EID }}:/extensions/ ./extensions-to-upload/
docker cp ${{ steps.create-container.outputs.EID }}:/ext_index.json ./extensions-to-upload/

- name: Upload postgres-extensions to S3
run: |
for BUCKET in $(echo ${S3_BUCKETS}); do
for BUCKET in $(echo ${S3_BUCKETS:-[]} | jq --raw-output '.[]'); do
aws s3 cp --recursive --only-show-errors ./extensions-to-upload s3://${BUCKET}/${{ needs.tag.outputs.build-tag }}/${{ matrix.version }}
done

- name: Cleanup
if: ${{ always() && (steps.create-container.outputs.CID || steps.create-container.outputs.EID) }}
if: ${{ always() && steps.create-container.outputs.EID }}
run: |
docker rm ${{ steps.create-container.outputs.CID }} || true
docker rm ${{ steps.create-container.outputs.EID }} || true

deploy:
.gitmodules (vendored, 4 lines changed)

@@ -6,7 +6,3 @@
path = vendor/postgres-v15
url = https://github.com/neondatabase/postgres.git
branch = REL_15_STABLE_neon
[submodule "vendor/postgres-v16"]
path = vendor/postgres-v16
url = https://github.com/neondatabase/postgres.git
branch = REL_16_STABLE_neon
Cargo.lock (generated, 36 lines changed)
@@ -2506,6 +2506,7 @@ dependencies = [
|
||||
"pageserver",
|
||||
"postgres_ffi",
|
||||
"svg_fmt",
|
||||
"tokio",
|
||||
"utils",
|
||||
"workspace_hack",
|
||||
]
|
||||
@@ -2544,6 +2545,7 @@ dependencies = [
|
||||
"metrics",
|
||||
"nix",
|
||||
"num-traits",
|
||||
"num_cpus",
|
||||
"once_cell",
|
||||
"pageserver_api",
|
||||
"pin-project-lite",
|
||||
@@ -2780,7 +2782,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "postgres"
|
||||
version = "0.19.4"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=1aaedab101b23f7612042850d8f2036810fa7c7f#1aaedab101b23f7612042850d8f2036810fa7c7f"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=9011f7110db12b5e15afaf98f8ac834501d50ddc#9011f7110db12b5e15afaf98f8ac834501d50ddc"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"fallible-iterator",
|
||||
@@ -2793,7 +2795,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "postgres-native-tls"
|
||||
version = "0.5.0"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=1aaedab101b23f7612042850d8f2036810fa7c7f#1aaedab101b23f7612042850d8f2036810fa7c7f"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=9011f7110db12b5e15afaf98f8ac834501d50ddc#9011f7110db12b5e15afaf98f8ac834501d50ddc"
|
||||
dependencies = [
|
||||
"native-tls",
|
||||
"tokio",
|
||||
@@ -2804,7 +2806,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "postgres-protocol"
|
||||
version = "0.6.4"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=1aaedab101b23f7612042850d8f2036810fa7c7f#1aaedab101b23f7612042850d8f2036810fa7c7f"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=9011f7110db12b5e15afaf98f8ac834501d50ddc#9011f7110db12b5e15afaf98f8ac834501d50ddc"
|
||||
dependencies = [
|
||||
"base64 0.20.0",
|
||||
"byteorder",
|
||||
@@ -2822,7 +2824,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "postgres-types"
|
||||
version = "0.2.4"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=1aaedab101b23f7612042850d8f2036810fa7c7f#1aaedab101b23f7612042850d8f2036810fa7c7f"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=9011f7110db12b5e15afaf98f8ac834501d50ddc#9011f7110db12b5e15afaf98f8ac834501d50ddc"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"fallible-iterator",
|
||||
@@ -3854,7 +3856,8 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "sharded-slab"
|
||||
version = "0.1.4"
|
||||
source = "git+https://github.com/neondatabase/sharded-slab.git?rev=98d16753ab01c61f0a028de44167307a00efea00#98d16753ab01c61f0a028de44167307a00efea00"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "900fba806f70c630b0a382d0d825e17a0f19fcd059a2ade1ff237bcddf446b31"
|
||||
dependencies = [
|
||||
"lazy_static",
|
||||
]
|
||||
@@ -4098,7 +4101,7 @@ checksum = "4b55807c0344e1e6c04d7c965f5289c39a8d94ae23ed5c0b57aabac549f871c6"
|
||||
dependencies = [
|
||||
"filetime",
|
||||
"libc",
|
||||
"xattr",
|
||||
"xattr 0.2.3",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -4311,7 +4314,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "tokio-postgres"
|
||||
version = "0.7.7"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=1aaedab101b23f7612042850d8f2036810fa7c7f#1aaedab101b23f7612042850d8f2036810fa7c7f"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=9011f7110db12b5e15afaf98f8ac834501d50ddc#9011f7110db12b5e15afaf98f8ac834501d50ddc"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"byteorder",
|
||||
@@ -4379,16 +4382,17 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "tokio-tar"
|
||||
version = "0.3.0"
|
||||
source = "git+https://github.com/neondatabase/tokio-tar.git?rev=404df61437de0feef49ba2ccdbdd94eb8ad6e142#404df61437de0feef49ba2ccdbdd94eb8ad6e142"
|
||||
version = "0.3.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9d5714c010ca3e5c27114c1cdeb9d14641ace49874aa5626d7149e47aedace75"
|
||||
dependencies = [
|
||||
"filetime",
|
||||
"futures-core",
|
||||
"libc",
|
||||
"redox_syscall 0.2.16",
|
||||
"redox_syscall 0.3.5",
|
||||
"tokio",
|
||||
"tokio-stream",
|
||||
"xattr",
|
||||
"xattr 1.0.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -4865,6 +4869,7 @@ dependencies = [
|
||||
"tempfile",
|
||||
"thiserror",
|
||||
"tokio",
|
||||
"tokio-stream",
|
||||
"tracing",
|
||||
"tracing-error",
|
||||
"tracing-subscriber",
|
||||
@@ -5362,6 +5367,15 @@ dependencies = [
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "xattr"
|
||||
version = "1.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ea263437ca03c1522846a4ddafbca2542d0ad5ed9b784909d4b27b76f62bc34a"
|
||||
dependencies = [
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "xmlparser"
|
||||
version = "0.13.5"
|
||||
|
||||
Cargo.toml (19 lines changed)

@@ -124,6 +124,7 @@ tokio-io-timeout = "1.2.0"
tokio-postgres-rustls = "0.9.0"
tokio-rustls = "0.23"
tokio-stream = "0.1"
tokio-tar = "0.3"
tokio-util = { version = "0.7", features = ["io"] }
toml = "0.7"
toml_edit = "0.19"

@@ -143,12 +144,11 @@ env_logger = "0.10"
log = "0.4"

## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
tokio-tar = { git = "https://github.com/neondatabase/tokio-tar.git", rev="404df61437de0feef49ba2ccdbdd94eb8ad6e142" }
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }

## Other git libraries
heapless = { default-features=false, features=[], git = "https://github.com/japaric/heapless.git", rev = "644653bf3b831c6bb4963be2de24804acf5e5001" } # upstream release pending

@@ -183,12 +183,7 @@ tonic-build = "0.9"

# This is only needed for proxy's tests.
# TODO: we should probably fork `tokio-postgres-rustls` instead.
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }

# Changes the MAX_THREADS limit from 4096 to 32768.
# This is a temporary workaround for using tracing from many threads in safekeepers code,
# until async safekeepers patch is merged to the main.
sharded-slab = { git = "https://github.com/neondatabase/sharded-slab.git", rev="98d16753ab01c61f0a028de44167307a00efea00" }
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }

################# Binary contents sections
|
||||
|
||||
@@ -12,7 +12,6 @@ WORKDIR /home/nonroot

COPY --chown=nonroot vendor/postgres-v14 vendor/postgres-v14
COPY --chown=nonroot vendor/postgres-v15 vendor/postgres-v15
COPY --chown=nonroot vendor/postgres-v16 vendor/postgres-v16
COPY --chown=nonroot pgxn pgxn
COPY --chown=nonroot Makefile Makefile
COPY --chown=nonroot scripts/ninstall.sh scripts/ninstall.sh

@@ -40,7 +39,6 @@ ARG CACHEPOT_BUCKET=neon-github-dev

COPY --from=pg-build /home/nonroot/pg_install/v14/include/postgresql/server pg_install/v14/include/postgresql/server
COPY --from=pg-build /home/nonroot/pg_install/v15/include/postgresql/server pg_install/v15/include/postgresql/server
COPY --from=pg-build /home/nonroot/pg_install/v16/include/postgresql/server pg_install/v16/include/postgresql/server
COPY --chown=nonroot . .

# Show build caching stats to check if it was used in the end.

@@ -81,7 +79,6 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy

COPY --from=pg-build /home/nonroot/pg_install/v14 /usr/local/v14/
COPY --from=pg-build /home/nonroot/pg_install/v15 /usr/local/v15/
COPY --from=pg-build /home/nonroot/pg_install/v16 /usr/local/v16/
COPY --from=pg-build /home/nonroot/postgres_install.tar.gz /data/

# By default, pageserver uses `.neon/` working directory in WORKDIR, so create one and fill it with the dummy config.
||||
@@ -13,7 +13,7 @@ FROM debian:bullseye-slim AS build-deps
|
||||
RUN apt update && \
|
||||
apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev \
|
||||
zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libssl-dev \
|
||||
libicu-dev libxslt1-dev liblz4-dev libzstd-dev
|
||||
libicu-dev libxslt1-dev liblz4-dev libzstd-dev zstd
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
@@ -77,6 +77,7 @@ ENV PATH "/usr/local/pgsql/bin:$PATH"
|
||||
RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.2.tar.gz -O postgis.tar.gz && \
|
||||
echo "9a2a219da005a1730a39d1959a1c7cec619b1efb009b65be80ffc25bad299068 postgis.tar.gz" | sha256sum --check && \
|
||||
mkdir postgis-src && cd postgis-src && tar xvzf ../postgis.tar.gz --strip-components=1 -C . && \
|
||||
find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
|
||||
./autogen.sh && \
|
||||
./configure --with-sfcgal=/usr/local/bin/sfcgal-config && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
@@ -89,17 +90,28 @@ RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.2.tar.gz -O postg
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_tiger_geocoder.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_topology.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/address_standardizer.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/address_standardizer_data_us.control
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/address_standardizer_data_us.control && \
|
||||
mkdir -p /extensions/postgis && \
|
||||
cp /usr/local/pgsql/share/extension/postgis.control /extensions/postgis && \
|
||||
cp /usr/local/pgsql/share/extension/postgis_raster.control /extensions/postgis && \
|
||||
cp /usr/local/pgsql/share/extension/postgis_sfcgal.control /extensions/postgis && \
|
||||
cp /usr/local/pgsql/share/extension/postgis_tiger_geocoder.control /extensions/postgis && \
|
||||
cp /usr/local/pgsql/share/extension/postgis_topology.control /extensions/postgis && \
|
||||
cp /usr/local/pgsql/share/extension/address_standardizer.control /extensions/postgis && \
|
||||
cp /usr/local/pgsql/share/extension/address_standardizer_data_us.control /extensions/postgis
|
||||
|
||||
RUN wget https://github.com/pgRouting/pgrouting/archive/v3.4.2.tar.gz -O pgrouting.tar.gz && \
|
||||
echo "cac297c07d34460887c4f3b522b35c470138760fe358e351ad1db4edb6ee306e pgrouting.tar.gz" | sha256sum --check && \
|
||||
mkdir pgrouting-src && cd pgrouting-src && tar xvzf ../pgrouting.tar.gz --strip-components=1 -C . && \
|
||||
mkdir build && \
|
||||
cd build && \
|
||||
mkdir build && cd build && \
|
||||
cmake -DCMAKE_BUILD_TYPE=Release .. && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrouting.control
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrouting.control && \
|
||||
find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /after.txt &&\
|
||||
cp /usr/local/pgsql/share/extension/pgrouting.control /extensions/postgis && \
|
||||
sort -o /before.txt /before.txt && sort -o /after.txt /after.txt && \
|
||||
comm -13 /before.txt /after.txt | tar --directory=/usr/local/pgsql --zstd -cf /extensions/postgis.tar.zst -T -
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
@@ -419,12 +431,16 @@ RUN apt-get update && \
|
||||
wget https://github.com/ketteq-neon/postgres-exts/archive/e0bd1a9d9313d7120c1b9c7bb15c48c0dede4c4e.tar.gz -O kq_imcx.tar.gz && \
|
||||
echo "dc93a97ff32d152d32737ba7e196d9687041cda15e58ab31344c2f2de8855336 kq_imcx.tar.gz" | sha256sum --check && \
|
||||
mkdir kq_imcx-src && cd kq_imcx-src && tar xvzf ../kq_imcx.tar.gz --strip-components=1 -C . && \
|
||||
mkdir build && \
|
||||
cd build && \
|
||||
find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
|
||||
mkdir build && cd build && \
|
||||
cmake -DCMAKE_BUILD_TYPE=Release .. && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/kq_imcx.control
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/kq_imcx.control && \
|
||||
find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /after.txt &&\
|
||||
mkdir -p /extensions/kq_imcx && cp /usr/local/pgsql/share/extension/kq_imcx.control /extensions/kq_imcx && \
|
||||
sort -o /before.txt /before.txt && sort -o /after.txt /after.txt && \
|
||||
comm -13 /before.txt /after.txt | tar --directory=/usr/local/pgsql --zstd -cf /extensions/kq_imcx.tar.zst -T -
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
@@ -535,10 +551,10 @@ FROM build-deps AS pg-embedding-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
ENV PATH "/usr/local/pgsql/bin/:$PATH"
|
||||
# 2465f831ea1f8d49c1d74f8959adb7fc277d70cd made on 05/07/2023
|
||||
# eeb3ba7c3a60c95b2604dd543c64b2f1bb4a3703 made on 15/07/2023
|
||||
# There is no release tag yet
|
||||
RUN wget https://github.com/neondatabase/pg_embedding/archive/2465f831ea1f8d49c1d74f8959adb7fc277d70cd.tar.gz -O pg_embedding.tar.gz && \
|
||||
echo "047af2b1f664a1e6e37867bd4eeaf5934fa27d6ba3d6c4461efa388ddf7cd1d5 pg_embedding.tar.gz" | sha256sum --check && \
|
||||
RUN wget https://github.com/neondatabase/pg_embedding/archive/eeb3ba7c3a60c95b2604dd543c64b2f1bb4a3703.tar.gz -O pg_embedding.tar.gz && \
|
||||
echo "030846df723652f99a8689ce63b66fa0c23477a7fd723533ab8a6b28ab70730f pg_embedding.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_embedding-src && cd pg_embedding-src && tar xvzf ../pg_embedding.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
@@ -553,16 +569,17 @@ RUN wget https://github.com/neondatabase/pg_embedding/archive/2465f831ea1f8d49c1
|
||||
FROM build-deps AS pg-anon-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
# Kaniko doesn't allow to do `${from#/usr/local/pgsql/}`, so we use `${from:17}` instead
|
||||
ENV PATH "/usr/local/pgsql/bin/:$PATH"
|
||||
RUN wget https://gitlab.com/dalibo/postgresql_anonymizer/-/archive/1.1.0/postgresql_anonymizer-1.1.0.tar.gz -O pg_anon.tar.gz && \
|
||||
echo "08b09d2ff9b962f96c60db7e6f8e79cf7253eb8772516998fc35ece08633d3ad pg_anon.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_anon-src && cd pg_anon-src && tar xvzf ../pg_anon.tar.gz --strip-components=1 -C . && \
|
||||
find /usr/local/pgsql -type f | sort > /before.txt && \
|
||||
find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/anon.control && \
|
||||
find /usr/local/pgsql -type f | sort > /after.txt && \
|
||||
/bin/bash -c 'for from in $(comm -13 /before.txt /after.txt); do to=/extensions/anon/${from:17} && mkdir -p $(dirname ${to}) && cp -a ${from} ${to}; done'
|
||||
find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /after.txt &&\
|
||||
mkdir -p /extensions/anon && cp /usr/local/pgsql/share/extension/anon.control /extensions/anon && \
|
||||
sort -o /before.txt /before.txt && sort -o /after.txt /after.txt && \
|
||||
comm -13 /before.txt /after.txt | tar --directory=/usr/local/pgsql --zstd -cf /extensions/anon.tar.zst -T -
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
@@ -754,16 +771,23 @@ RUN rm /usr/local/pgsql/lib/lib*.a
|
||||
# Extenstion only
|
||||
#
|
||||
#########################################################################################
|
||||
FROM python:3.9-slim-bullseye AS generate-ext-index
|
||||
ARG PG_VERSION
|
||||
ARG BUILD_TAG
|
||||
RUN apt update && apt install -y zstd
|
||||
|
||||
# copy the control files here
|
||||
COPY --from=kq-imcx-pg-build /extensions/ /extensions/
|
||||
COPY --from=pg-anon-pg-build /extensions/ /extensions/
|
||||
COPY --from=postgis-build /extensions/ /extensions/
|
||||
COPY scripts/combine_control_files.py ./combine_control_files.py
|
||||
RUN python3 ./combine_control_files.py ${PG_VERSION} ${BUILD_TAG} --public_extensions="anon,postgis"
|
||||
|
||||
FROM scratch AS postgres-extensions
|
||||
# After the transition this layer will include all extensitons.
|
||||
# As for now, it's only for new custom ones
|
||||
#
|
||||
# # Default extensions
|
||||
# COPY --from=postgres-cleanup-layer /usr/local/pgsql/share/extension /usr/local/pgsql/share/extension
|
||||
# COPY --from=postgres-cleanup-layer /usr/local/pgsql/lib /usr/local/pgsql/lib
|
||||
# Custom extensions
|
||||
COPY --from=pg-anon-pg-build /extensions/anon/lib/ /extensions/anon/lib
|
||||
COPY --from=pg-anon-pg-build /extensions/anon/share/extension /extensions/anon/share/extension
|
||||
# As for now, it's only a couple for testing purposses
|
||||
COPY --from=generate-ext-index /extensions/*.tar.zst /extensions/
|
||||
COPY --from=generate-ext-index /ext_index.json /ext_index.json
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
|
||||
Makefile (17 lines changed)

@@ -83,8 +83,6 @@ $(POSTGRES_INSTALL_DIR)/build/%/config.status:
# I'm not sure why it wouldn't work, but this is the only place (apart from
# the "build-all-versions" entry points) where direct mention of PostgreSQL
# versions is used.
.PHONY: postgres-configure-v16
postgres-configure-v16: $(POSTGRES_INSTALL_DIR)/build/v16/config.status
.PHONY: postgres-configure-v15
postgres-configure-v15: $(POSTGRES_INSTALL_DIR)/build/v15/config.status
.PHONY: postgres-configure-v14

@@ -167,33 +165,28 @@ neon-pg-ext-clean-%:
.PHONY: neon-pg-ext
neon-pg-ext: \
neon-pg-ext-v14 \
neon-pg-ext-v15 \
neon-pg-ext-v16
neon-pg-ext-v15

.PHONY: neon-pg-ext-clean
neon-pg-ext-clean: \
neon-pg-ext-clean-v14 \
neon-pg-ext-clean-v15 \
neon-pg-ext-clean-v16
neon-pg-ext-clean-v15

# shorthand to build all Postgres versions
.PHONY: postgres
postgres: \
postgres-v14 \
postgres-v15 \
postgres-v16
postgres-v15

.PHONY: postgres-headers
postgres-headers: \
postgres-headers-v14 \
postgres-headers-v15 \
postgres-headers-v16
postgres-headers-v15

.PHONY: postgres-clean
postgres-clean: \
postgres-clean-v14 \
postgres-clean-v15 \
postgres-clean-v16
postgres-clean-v15

# This doesn't remove the effects of 'configure'.
.PHONY: clean
@@ -223,9 +223,8 @@ fn main() -> Result<()> {
drop(state);

// Launch remaining service threads
let _monitor_handle = launch_monitor(&compute).expect("cannot launch compute monitor thread");
let _configurator_handle =
launch_configurator(&compute).expect("cannot launch configurator thread");
let _monitor_handle = launch_monitor(&compute);
let _configurator_handle = launch_configurator(&compute);

// Start Postgres
let mut delay_exit = false;

@@ -8,9 +8,11 @@ use std::sync::{Condvar, Mutex};

use anyhow::{Context, Result};
use chrono::{DateTime, Utc};
use futures::stream::FuturesUnordered;
use futures::StreamExt;
use postgres::{Client, NoTls};
use tokio_postgres;
use tracing::{info, instrument, warn};
use tracing::{error, info, instrument, warn};
use utils::id::{TenantId, TimelineId};
use utils::lsn::Lsn;

@@ -21,6 +23,7 @@ use utils::measured_stream::MeasuredReader;
use crate::config;
use crate::pg_helpers::*;
use crate::spec::*;
use crate::sync_sk::{check_if_synced, ping_safekeeper};

/// Compute node info shared across several `compute_ctl` threads.
pub struct ComputeNode {

@@ -86,6 +89,7 @@ pub struct ParsedSpec {
pub tenant_id: TenantId,
pub timeline_id: TimelineId,
pub pageserver_connstr: String,
pub safekeeper_connstrings: Vec<String>,
pub storage_auth_token: Option<String>,
}

@@ -103,6 +107,21 @@ impl TryFrom<ComputeSpec> for ParsedSpec {
.clone()
.or_else(|| spec.cluster.settings.find("neon.pageserver_connstring"))
.ok_or("pageserver connstr should be provided")?;
let safekeeper_connstrings = if spec.safekeeper_connstrings.is_empty() {
if matches!(spec.mode, ComputeMode::Primary) {
spec.cluster
.settings
.find("neon.safekeepers")
.ok_or("safekeeper connstrings should be provided")?
.split(',')
.map(|str| str.to_string())
.collect()
} else {
vec![]
}
} else {
spec.safekeeper_connstrings.clone()
};
let storage_auth_token = spec.storage_auth_token.clone();
let tenant_id: TenantId = if let Some(tenant_id) = spec.tenant_id {
tenant_id

@@ -128,6 +147,7 @@ impl TryFrom<ComputeSpec> for ParsedSpec {
Ok(ParsedSpec {
spec,
pageserver_connstr,
safekeeper_connstrings,
storage_auth_token,
tenant_id,
timeline_id,

@@ -309,6 +329,102 @@ impl ComputeNode {
Ok(())
}

pub async fn check_safekeepers_synced_async(
&self,
compute_state: &ComputeState,
) -> Result<Option<Lsn>> {
// Construct a connection config for each safekeeper
let pspec: ParsedSpec = compute_state
.pspec
.as_ref()
.expect("spec must be set")
.clone();
let sk_connstrs: Vec<String> = pspec.safekeeper_connstrings.clone();
let sk_configs = sk_connstrs.into_iter().map(|connstr| {
// Format connstr
let id = connstr.clone();
let connstr = format!("postgresql://no_user@{}", connstr);
let options = format!(
"-c timeline_id={} tenant_id={}",
pspec.timeline_id, pspec.tenant_id
);

// Construct client
let mut config = tokio_postgres::Config::from_str(&connstr).unwrap();
config.options(&options);
if let Some(storage_auth_token) = pspec.storage_auth_token.clone() {
config.password(storage_auth_token);
}

(id, config)
});

// Create task set to query all safekeepers
let mut tasks = FuturesUnordered::new();
let quorum = sk_configs.len() / 2 + 1;
for (id, config) in sk_configs {
let timeout = tokio::time::Duration::from_millis(100);
let task = tokio::time::timeout(timeout, ping_safekeeper(id, config));
tasks.push(tokio::spawn(task));
}

// Get a quorum of responses or errors
let mut responses = Vec::new();
let mut join_errors = Vec::new();
let mut task_errors = Vec::new();
let mut timeout_errors = Vec::new();
while let Some(response) = tasks.next().await {
match response {
Ok(Ok(Ok(r))) => responses.push(r),
Ok(Ok(Err(e))) => task_errors.push(e),
Ok(Err(e)) => timeout_errors.push(e),
Err(e) => join_errors.push(e),
};
if responses.len() >= quorum {
break;
}
if join_errors.len() + task_errors.len() + timeout_errors.len() >= quorum {
break;
}
}

// In case of error, log and fail the check, but don't crash.
// We're playing it safe because these errors could be transient
// and we don't yet retry. Also being careful here allows us to
// be backwards compatible with safekeepers that don't have the
// TIMELINE_STATUS API yet.
if responses.len() < quorum {
error!(
"failed sync safekeepers check {:?} {:?} {:?}",
join_errors, task_errors, timeout_errors
);
return Ok(None);
}

Ok(check_if_synced(responses))
}

// Fast path for sync_safekeepers. If they're already synced we get the lsn
// in one roundtrip. If not, we should do a full sync_safekeepers.
pub fn check_safekeepers_synced(&self, compute_state: &ComputeState) -> Result<Option<Lsn>> {
let start_time = Utc::now();

// Run actual work with new tokio runtime
let rt = tokio::runtime::Builder::new_current_thread()
.enable_all()
.build()
.expect("failed to create rt");
let result = rt.block_on(self.check_safekeepers_synced_async(compute_state));

// Record runtime
self.state.lock().unwrap().metrics.sync_sk_check_ms = Utc::now()
.signed_duration_since(start_time)
.to_std()
.unwrap()
.as_millis() as u64;
result
}

// Run `postgres` in a special mode with `--sync-safekeepers` argument
// and return the reported LSN back to the caller.
#[instrument(skip_all)]

@@ -371,10 +487,14 @@ impl ComputeNode {
// cannot sync safekeepers.
let lsn = match spec.mode {
ComputeMode::Primary => {
info!("starting safekeepers syncing");
let lsn = self
.sync_safekeepers(pspec.storage_auth_token.clone())
.with_context(|| "failed to sync safekeepers")?;
info!("checking if safekeepers are synced");
let lsn = if let Ok(Some(lsn)) = self.check_safekeepers_synced(compute_state) {
lsn
} else {
info!("starting safekeepers syncing");
self.sync_safekeepers(pspec.storage_auth_token.clone())
.with_context(|| "failed to sync safekeepers")?
};
info!("safekeepers synced at LSN {}", lsn);
lsn
}
@@ -1,7 +1,6 @@
use std::sync::Arc;
use std::thread;

use anyhow::Result;
use tracing::{error, info, instrument};

use compute_api::responses::ComputeStatus;

@@ -42,13 +41,14 @@ fn configurator_main_loop(compute: &Arc<ComputeNode>) {
}
}

pub fn launch_configurator(compute: &Arc<ComputeNode>) -> Result<thread::JoinHandle<()>> {
pub fn launch_configurator(compute: &Arc<ComputeNode>) -> thread::JoinHandle<()> {
let compute = Arc::clone(compute);

Ok(thread::Builder::new()
thread::Builder::new()
.name("compute-configurator".into())
.spawn(move || {
configurator_main_loop(&compute);
info!("configurator thread is exited");
})?)
})
.expect("cannot launch configurator thread")
}

@@ -13,3 +13,4 @@ pub mod monitor;
pub mod params;
pub mod pg_helpers;
pub mod spec;
pub mod sync_sk;

@@ -1,7 +1,6 @@
use std::sync::Arc;
use std::{thread, time};

use anyhow::Result;
use chrono::{DateTime, Utc};
use postgres::{Client, NoTls};
use tracing::{debug, info};

@@ -105,10 +104,11 @@ fn watch_compute_activity(compute: &ComputeNode) {
}

/// Launch a separate compute monitor thread and return its `JoinHandle`.
pub fn launch_monitor(state: &Arc<ComputeNode>) -> Result<thread::JoinHandle<()>> {
pub fn launch_monitor(state: &Arc<ComputeNode>) -> thread::JoinHandle<()> {
let state = Arc::clone(state);

Ok(thread::Builder::new()
thread::Builder::new()
.name("compute-monitor".into())
.spawn(move || watch_compute_activity(&state))?)
.spawn(move || watch_compute_activity(&state))
.expect("cannot launch compute monitor thread")
}
||||
|
||||
compute_tools/src/sync_sk.rs (new file, 98 lines)

@@ -0,0 +1,98 @@
// Utils for running sync_safekeepers
use anyhow::Result;
use tracing::info;
use utils::lsn::Lsn;

#[derive(Copy, Clone, Debug)]
pub enum TimelineStatusResponse {
NotFound,
Ok(TimelineStatusOkResponse),
}

#[derive(Copy, Clone, Debug)]
pub struct TimelineStatusOkResponse {
flush_lsn: Lsn,
commit_lsn: Lsn,
}

/// Get a safekeeper's metadata for our timeline. The id is only used for logging
pub async fn ping_safekeeper(
id: String,
config: tokio_postgres::Config,
) -> Result<TimelineStatusResponse> {
// TODO add retries

// Connect
info!("connecting to {}", id);
let (client, conn) = config.connect(tokio_postgres::NoTls).await?;
tokio::spawn(async move {
if let Err(e) = conn.await {
eprintln!("connection error: {}", e);
}
});

// Query
info!("querying {}", id);
let result = client.simple_query("TIMELINE_STATUS").await?;

// Parse result
info!("done with {}", id);
if let postgres::SimpleQueryMessage::Row(row) = &result[0] {
use std::str::FromStr;
let response = TimelineStatusResponse::Ok(TimelineStatusOkResponse {
flush_lsn: Lsn::from_str(row.get("flush_lsn").unwrap())?,
commit_lsn: Lsn::from_str(row.get("commit_lsn").unwrap())?,
});
Ok(response)
} else {
// Timeline doesn't exist
Ok(TimelineStatusResponse::NotFound)
}
}

/// Given a quorum of responses, check if safekeepers are synced at some Lsn
pub fn check_if_synced(responses: Vec<TimelineStatusResponse>) -> Option<Lsn> {
// Check if all responses are ok
let ok_responses: Vec<TimelineStatusOkResponse> = responses
.iter()
.filter_map(|r| match r {
TimelineStatusResponse::Ok(ok_response) => Some(ok_response),
_ => None,
})
.cloned()
.collect();
if ok_responses.len() < responses.len() {
info!(
"not synced. Only {} out of {} know about this timeline",
ok_responses.len(),
responses.len()
);
return None;
}

// Get the min and the max of everything
let commit: Vec<Lsn> = ok_responses.iter().map(|r| r.commit_lsn).collect();
let flush: Vec<Lsn> = ok_responses.iter().map(|r| r.flush_lsn).collect();
let commit_max = commit.iter().max().unwrap();
let commit_min = commit.iter().min().unwrap();
let flush_max = flush.iter().max().unwrap();
let flush_min = flush.iter().min().unwrap();

// Check that all values are equal
if commit_min != commit_max {
info!("not synced. {:?} {:?}", commit_min, commit_max);
return None;
}
if flush_min != flush_max {
info!("not synced. {:?} {:?}", flush_min, flush_max);
return None;
}

// Check that commit == flush
if commit_max != flush_max {
info!("not synced. {:?} {:?}", commit_max, flush_max);
return None;
}

Some(*commit_max)
}
||||
@@ -652,8 +652,8 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
|
||||
)?;
|
||||
}
|
||||
"start" => {
|
||||
let pg_port: Option<u16> = sub_args.get_one::<u16>("pg-port").copied();
|
||||
let http_port: Option<u16> = sub_args.get_one::<u16>("http-port").copied();
|
||||
// let pg_port: Option<u16> = sub_args.get_one::<u16>("pg-port").copied();
|
||||
// let http_port: Option<u16> = sub_args.get_one::<u16>("http-port").copied();
|
||||
let endpoint_id = sub_args
|
||||
.get_one::<String>("endpoint_id")
|
||||
.ok_or_else(|| anyhow!("No endpoint ID was provided to start"))?;
|
||||
@@ -673,7 +673,10 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
|
||||
env.safekeepers.iter().map(|sk| sk.id).collect()
|
||||
};
|
||||
|
||||
let endpoint = cplane.endpoints.get(endpoint_id.as_str());
|
||||
let endpoint = cplane
|
||||
.endpoints
|
||||
.get(endpoint_id.as_str())
|
||||
.ok_or_else(|| anyhow::anyhow!("endpoint {endpoint_id} not found"))?;
|
||||
|
||||
let auth_token = if matches!(env.pageserver.pg_auth_type, AuthType::NeonJWT) {
|
||||
let claims = Claims::new(Some(tenant_id), Scope::Tenant);
|
||||
@@ -688,63 +691,17 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
|
||||
.copied()
|
||||
.unwrap_or(false);
|
||||
|
||||
if let Some(endpoint) = endpoint {
|
||||
match (&endpoint.mode, hot_standby) {
|
||||
(ComputeMode::Static(_), true) => {
|
||||
bail!("Cannot start a node in hot standby mode when it is already configured as a static replica")
|
||||
}
|
||||
(ComputeMode::Primary, true) => {
|
||||
bail!("Cannot start a node as a hot standby replica, it is already configured as primary node")
|
||||
}
|
||||
_ => {}
|
||||
match (&endpoint.mode, hot_standby) {
|
||||
(ComputeMode::Static(_), true) => {
|
||||
bail!("Cannot start a node in hot standby mode when it is already configured as a static replica")
|
||||
}
|
||||
println!("Starting existing endpoint {endpoint_id}...");
|
||||
endpoint.start(&auth_token, safekeepers)?;
|
||||
} else {
|
||||
let branch_name = sub_args
|
||||
.get_one::<String>("branch-name")
|
||||
.map(|s| s.as_str())
|
||||
.unwrap_or(DEFAULT_BRANCH_NAME);
|
||||
let timeline_id = env
|
||||
.get_branch_timeline_id(branch_name, tenant_id)
|
||||
.ok_or_else(|| {
|
||||
anyhow!("Found no timeline id for branch name '{branch_name}'")
|
||||
})?;
|
||||
let lsn = sub_args
|
||||
.get_one::<String>("lsn")
|
||||
.map(|lsn_str| Lsn::from_str(lsn_str))
|
||||
.transpose()
|
||||
.context("Failed to parse Lsn from the request")?;
|
||||
let pg_version = sub_args
|
||||
.get_one::<u32>("pg-version")
|
||||
.copied()
|
||||
.context("Failed to `pg-version` from the argument string")?;
|
||||
|
||||
let mode = match (lsn, hot_standby) {
|
||||
(Some(lsn), false) => ComputeMode::Static(lsn),
|
||||
(None, true) => ComputeMode::Replica,
|
||||
(None, false) => ComputeMode::Primary,
|
||||
(Some(_), true) => anyhow::bail!("cannot specify both lsn and hot-standby"),
|
||||
};
|
||||
|
||||
// when used with custom port this results in non obvious behaviour
|
||||
// port is remembered from first start command, i e
|
||||
// start --port X
|
||||
// stop
|
||||
// start <-- will also use port X even without explicit port argument
|
||||
println!("Starting new endpoint {endpoint_id} (PostgreSQL v{pg_version}) on timeline {timeline_id} ...");
|
||||
|
||||
let ep = cplane.new_endpoint(
|
||||
endpoint_id,
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
pg_port,
|
||||
http_port,
|
||||
pg_version,
|
||||
mode,
|
||||
)?;
|
||||
ep.start(&auth_token, safekeepers)?;
|
||||
(ComputeMode::Primary, true) => {
|
||||
bail!("Cannot start a node as a hot standby replica, it is already configured as primary node")
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
println!("Starting existing endpoint {endpoint_id}...");
|
||||
endpoint.start(&auth_token, safekeepers)?;
|
||||
}
|
||||
"stop" => {
|
||||
let endpoint_id = sub_args
|
||||
|
||||
@@ -128,6 +128,20 @@ impl ComputeControlPlane {
) -> Result<Arc<Endpoint>> {
let pg_port = pg_port.unwrap_or_else(|| self.get_port());
let http_port = http_port.unwrap_or_else(|| self.get_port() + 1);

if matches!(mode, ComputeMode::Primary) {
// this check is not complete, as you could have a concurrent attempt at
// creating another primary, both reading the state before checking it here,
// but it's better than nothing.
let mut duplicates = self.endpoints.iter().filter(|(_k, v)| {
v.tenant_id == tenant_id && v.timeline_id == timeline_id && v.mode == mode
});

if let Some((key, _)) = duplicates.next() {
bail!("attempting to create a duplicate primary endpoint on tenant {tenant_id}, timeline {timeline_id}: endpoint {key:?} exists already. please don't do this, it is not supported.");
}
}

let ep = Arc::new(Endpoint {
endpoint_id: endpoint_id.to_owned(),
pg_address: SocketAddr::new("127.0.0.1".parse().unwrap(), pg_port),

@@ -289,7 +303,7 @@
.env
.safekeepers
.iter()
.map(|sk| format!("localhost:{}", sk.pg_port))
.map(|sk| format!("localhost:{}", sk.get_compute_port()))
.collect::<Vec<String>>()
.join(",");
conf.append("neon.safekeepers", &safekeepers);

@@ -318,7 +332,7 @@
.env
.safekeepers
.iter()
.map(|x| x.pg_port.to_string())
.map(|x| x.get_compute_port().to_string())
.collect::<Vec<_>>()
.join(",");
let sk_hosts = vec!["localhost"; self.env.safekeepers.len()].join(",");

@@ -463,7 +477,7 @@
.iter()
.find(|node| node.id == sk_id)
.ok_or_else(|| anyhow!("safekeeper {sk_id} does not exist"))?;
safekeeper_connstrings.push(format!("127.0.0.1:{}", sk.pg_port));
safekeeper_connstrings.push(format!("127.0.0.1:{}", sk.get_compute_port()));
}
}

@@ -137,6 +137,7 @@ impl Default for PageServerConf {
pub struct SafekeeperConf {
pub id: NodeId,
pub pg_port: u16,
pub pg_tenant_only_port: Option<u16>,
pub http_port: u16,
pub sync: bool,
pub remote_storage: Option<String>,

@@ -149,6 +150,7 @@ impl Default for SafekeeperConf {
Self {
id: NodeId(0),
pg_port: 0,
pg_tenant_only_port: None,
http_port: 0,
sync: true,
remote_storage: None,

@@ -158,6 +160,14 @@ impl Default for SafekeeperConf {
}
}

impl SafekeeperConf {
/// Compute is served by port on which only tenant scoped tokens allowed, if
/// it is configured.
pub fn get_compute_port(&self) -> u16 {
self.pg_tenant_only_port.unwrap_or(self.pg_port)
}
}

impl LocalEnv {
pub fn pg_distrib_dir_raw(&self) -> PathBuf {
self.pg_distrib_dir.clone()

@@ -169,7 +179,6 @@
match pg_version {
14 => Ok(path.join(format!("v{pg_version}"))),
15 => Ok(path.join(format!("v{pg_version}"))),
16 => Ok(path.join(format!("v{pg_version}"))),
_ => bail!("Unsupported postgres version: {}", pg_version),
}
}

@@ -178,7 +187,6 @@
match pg_version {
14 => Ok(self.pg_distrib_dir(pg_version)?.join("bin")),
15 => Ok(self.pg_distrib_dir(pg_version)?.join("bin")),
16 => Ok(self.pg_distrib_dir(pg_version)?.join("bin")),
_ => bail!("Unsupported postgres version: {}", pg_version),
}
}

@@ -186,7 +194,6 @@
match pg_version {
14 => Ok(self.pg_distrib_dir(pg_version)?.join("lib")),
15 => Ok(self.pg_distrib_dir(pg_version)?.join("lib")),
16 => Ok(self.pg_distrib_dir(pg_version)?.join("lib")),
_ => bail!("Unsupported postgres version: {}", pg_version),
}
}
||||
|
||||
@@ -120,45 +120,55 @@ impl SafekeeperNode {
|
||||
let availability_zone = format!("sk-{}", id_string);
|
||||
|
||||
let mut args = vec![
|
||||
"-D",
|
||||
datadir.to_str().with_context(|| {
|
||||
format!("Datadir path {datadir:?} cannot be represented as a unicode string")
|
||||
})?,
|
||||
"--id",
|
||||
&id_string,
|
||||
"--listen-pg",
|
||||
&listen_pg,
|
||||
"--listen-http",
|
||||
&listen_http,
|
||||
"--availability-zone",
|
||||
&availability_zone,
|
||||
"-D".to_owned(),
|
||||
datadir
|
||||
.to_str()
|
||||
.with_context(|| {
|
||||
format!("Datadir path {datadir:?} cannot be represented as a unicode string")
|
||||
})?
|
||||
.to_owned(),
|
||||
"--id".to_owned(),
|
||||
id_string,
|
||||
"--listen-pg".to_owned(),
|
||||
listen_pg,
|
||||
"--listen-http".to_owned(),
|
||||
listen_http,
|
||||
"--availability-zone".to_owned(),
|
||||
availability_zone,
|
||||
];
|
||||
if let Some(pg_tenant_only_port) = self.conf.pg_tenant_only_port {
|
||||
let listen_pg_tenant_only = format!("127.0.0.1:{}", pg_tenant_only_port);
|
||||
args.extend(["--listen-pg-tenant-only".to_owned(), listen_pg_tenant_only]);
|
||||
}
|
||||
if !self.conf.sync {
|
||||
args.push("--no-sync");
|
||||
args.push("--no-sync".to_owned());
|
||||
}
|
||||
|
||||
let broker_endpoint = format!("{}", self.env.broker.client_url());
|
||||
args.extend(["--broker-endpoint", &broker_endpoint]);
|
||||
args.extend(["--broker-endpoint".to_owned(), broker_endpoint]);
|
||||
|
||||
let mut backup_threads = String::new();
|
||||
if let Some(threads) = self.conf.backup_threads {
|
||||
backup_threads = threads.to_string();
|
||||
args.extend(["--backup-threads", &backup_threads]);
|
||||
args.extend(["--backup-threads".to_owned(), backup_threads]);
|
||||
} else {
|
||||
drop(backup_threads);
|
||||
}
|
||||
|
||||
if let Some(ref remote_storage) = self.conf.remote_storage {
|
||||
args.extend(["--remote-storage", remote_storage]);
|
||||
args.extend(["--remote-storage".to_owned(), remote_storage.clone()]);
|
||||
}
|
||||
|
||||
let key_path = self.env.base_data_dir.join("auth_public_key.pem");
|
||||
if self.conf.auth_enabled {
|
||||
args.extend([
|
||||
"--auth-validation-public-key-path",
|
||||
key_path.to_str().with_context(|| {
|
||||
format!("Key path {key_path:?} cannot be represented as a unicode string")
|
||||
})?,
|
||||
"--auth-validation-public-key-path".to_owned(),
|
||||
key_path
|
||||
.to_str()
|
||||
.with_context(|| {
|
||||
format!("Key path {key_path:?} cannot be represented as a unicode string")
|
||||
})?
|
||||
.to_owned(),
|
||||
]);
|
||||
}
|
||||
|
||||
|
||||
@@ -30,8 +30,8 @@ or similar, to wake up on shutdown.

In async Rust, futures can be "cancelled" at any await point, by
dropping the Future. For example, `tokio::select!` returns as soon as
one of the Futures returns, and drops the others. `tokio::timeout!` is
another example. In the Rust ecosystem, some functions are
one of the Futures returns, and drops the others. `tokio::time::timeout`
is another example. In the Rust ecosystem, some functions are
cancellation-safe, meaning they can be safely dropped without
side-effects, while others are not. See documentation of
`tokio::select!` for examples.

@@ -42,9 +42,9 @@ function that you call cannot be assumed to be async
cancellation-safe, and must be polled to completion.

The downside of non-cancellation safe code is that you have to be very
careful when using `tokio::select!`, `tokio::timeout!`, and other such
functions that can cause a Future to be dropped. They can only be used
with functions that are explicitly documented to be cancellation-safe,
careful when using `tokio::select!`, `tokio::time::timeout`, and other
such functions that can cause a Future to be dropped. They can only be
used with functions that are explicitly documented to be cancellation-safe,
or you need to spawn a separate task to shield from the cancellation.

At the entry points to the code, we also take care to poll futures to
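To make the cancellation-safety point above concrete, here is a minimal, hypothetical sketch (not code from this repository; it only assumes the `tokio` crate with its `macros`, `rt` and `time` features) of the two situations the document describes: a future silently dropped by `tokio::select!`, and a task spawned to shield non-cancellation-safe work from that drop.

```rust
use std::time::Duration;

// Stand-in for non-cancellation-safe work that must be polled to completion.
async fn copy_job() {
    tokio::time::sleep(Duration::from_secs(5)).await;
    println!("copy finished");
}

#[tokio::main]
async fn main() {
    // `tokio::select!` drops the losing future: if the 1-second sleep wins,
    // `copy_job` is cancelled at its await point and never completes.
    tokio::select! {
        _ = copy_job() => {}
        _ = tokio::time::sleep(Duration::from_secs(1)) => println!("copy_job was dropped"),
    }

    // Shielding: spawn the work on its own task. Giving up on the wait below
    // only drops the JoinHandle future; the spawned task keeps running to
    // completion in the background.
    let handle = tokio::spawn(copy_job());
    let _ = tokio::time::timeout(Duration::from_secs(1), handle).await;
    tokio::time::sleep(Duration::from_secs(5)).await; // give the detached task time to finish
}
```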
@@ -70,6 +70,7 @@ where
pub struct ComputeMetrics {
pub wait_for_spec_ms: u64,
pub sync_safekeepers_ms: u64,
pub sync_sk_check_ms: u64,
pub basebackup_ms: u64,
pub basebackup_bytes: u64,
pub start_postgres_ms: u64,

@@ -6,6 +6,7 @@ use once_cell::sync::Lazy;
use prometheus::core::{AtomicU64, Collector, GenericGauge, GenericGaugeVec};
pub use prometheus::opts;
pub use prometheus::register;
pub use prometheus::Error;
pub use prometheus::{core, default_registry, proto};
pub use prometheus::{exponential_buckets, linear_buckets};
pub use prometheus::{register_counter_vec, Counter, CounterVec};

@@ -9,6 +9,7 @@ use serde::{Deserialize, Serialize};
use serde_with::{serde_as, DisplayFromStr};
use strum_macros;
use utils::{
completion,
history_buffer::HistoryBufferWithDropCounter,
id::{NodeId, TenantId, TimelineId},
lsn::Lsn,

@@ -76,7 +77,12 @@ pub enum TenantState {
/// system is being shut down.
///
/// Transitions out of this state are possible through `set_broken()`.
Stopping,
Stopping {
// Because of https://github.com/serde-rs/serde/issues/2105 this has to be a named field,
// otherwise it will not be skipped during deserialization
#[serde(skip)]
progress: completion::Barrier,
},
/// The tenant is recognized by the pageserver, but can no longer be used for
/// any operations.
///

@@ -118,7 +124,7 @@ impl TenantState {
// Why is Stopping a Maybe case? Because, during pageserver shutdown,
// we set the Stopping state irrespective of whether the tenant
// has finished attaching or not.
Self::Stopping => Maybe,
Self::Stopping { .. } => Maybe,
}
}

@@ -928,7 +934,13 @@ mod tests {
"Activating",
),
(line!(), TenantState::Active, "Active"),
(line!(), TenantState::Stopping, "Stopping"),
(
line!(),
TenantState::Stopping {
progress: utils::completion::Barrier::default(),
},
"Stopping",
),
(
line!(),
TenantState::Broken {
||||
@@ -56,7 +56,7 @@ fn main() -> anyhow::Result<()> {
|
||||
PathBuf::from("pg_install")
|
||||
};
|
||||
|
||||
for pg_version in &["v14", "v15", "v16"] {
|
||||
for pg_version in &["v14", "v15"] {
|
||||
let mut pg_install_dir_versioned = pg_install_dir.join(pg_version);
|
||||
if pg_install_dir_versioned.is_relative() {
|
||||
let cwd = env::current_dir().context("Failed to get current_dir")?;
|
||||
|
||||
@@ -51,7 +51,6 @@ macro_rules! for_all_postgres_versions {
|
||||
($macro:tt) => {
|
||||
$macro!(v14);
|
||||
$macro!(v15);
|
||||
$macro!(v16);
|
||||
};
|
||||
}
|
||||
|
||||
@@ -93,10 +92,9 @@ pub use v14::bindings::DBState_DB_SHUTDOWNED;
|
||||
pub fn bkpimage_is_compressed(bimg_info: u8, version: u32) -> anyhow::Result<bool> {
|
||||
match version {
|
||||
14 => Ok(bimg_info & v14::bindings::BKPIMAGE_IS_COMPRESSED != 0),
|
||||
15 | 16 => Ok(bimg_info & v15::bindings::BKPIMAGE_COMPRESS_PGLZ != 0
|
||||
15 => Ok(bimg_info & v15::bindings::BKPIMAGE_COMPRESS_PGLZ != 0
|
||||
|| bimg_info & v15::bindings::BKPIMAGE_COMPRESS_LZ4 != 0
|
||||
|| bimg_info & v15::bindings::BKPIMAGE_COMPRESS_ZSTD != 0),
|
||||
|
||||
_ => anyhow::bail!("Unknown version {}", version),
|
||||
}
|
||||
}
|
||||
@@ -112,7 +110,6 @@ pub fn generate_wal_segment(
|
||||
match pg_version {
|
||||
14 => v14::xlog_utils::generate_wal_segment(segno, system_id, lsn),
|
||||
15 => v15::xlog_utils::generate_wal_segment(segno, system_id, lsn),
|
||||
16 => v16::xlog_utils::generate_wal_segment(segno, system_id, lsn),
|
||||
_ => Err(SerializeError::BadInput),
|
||||
}
|
||||
}
|
||||
@@ -126,7 +123,6 @@ pub fn generate_pg_control(
|
||||
match pg_version {
|
||||
14 => v14::xlog_utils::generate_pg_control(pg_control_bytes, checkpoint_bytes, lsn),
|
||||
15 => v15::xlog_utils::generate_pg_control(pg_control_bytes, checkpoint_bytes, lsn),
|
||||
16 => v16::xlog_utils::generate_pg_control(pg_control_bytes, checkpoint_bytes, lsn),
|
||||
_ => anyhow::bail!("Unknown version {}", pg_version),
|
||||
}
|
||||
}
|
||||
@@ -201,7 +197,7 @@ pub fn fsm_logical_to_physical(addr: BlockNumber) -> BlockNumber {
|
||||
|
||||
pub mod waldecoder {
|
||||
|
||||
use crate::{v14, v15, v16};
|
||||
use crate::{v14, v15};
|
||||
use bytes::{Buf, Bytes, BytesMut};
|
||||
use std::num::NonZeroU32;
|
||||
use thiserror::Error;
|
||||
@@ -263,10 +259,6 @@ pub mod waldecoder {
|
||||
use self::v15::waldecoder_handler::WalStreamDecoderHandler;
|
||||
self.poll_decode_internal()
|
||||
}
|
||||
16 => {
|
||||
use self::v16::waldecoder_handler::WalStreamDecoderHandler;
|
||||
self.poll_decode_internal()
|
||||
}
|
||||
_ => Err(WalDecodeError {
|
||||
msg: format!("Unknown version {}", self.pg_version),
|
||||
lsn: self.lsn,
|
||||
|
||||
@@ -57,9 +57,9 @@ pub fn slru_may_delete_clogsegment(segpage: u32, cutoff_page: u32) -> bool {
|
||||
// Multixact utils
|
||||
|
||||
pub fn mx_offset_to_flags_offset(xid: MultiXactId) -> usize {
|
||||
((xid / pg_constants::MULTIXACT_MEMBERS_PER_MEMBERGROUP as u32) as u16
|
||||
% pg_constants::MULTIXACT_MEMBERGROUPS_PER_PAGE
|
||||
* pg_constants::MULTIXACT_MEMBERGROUP_SIZE) as usize
|
||||
((xid / pg_constants::MULTIXACT_MEMBERS_PER_MEMBERGROUP as u32)
|
||||
% pg_constants::MULTIXACT_MEMBERGROUPS_PER_PAGE as u32
|
||||
* pg_constants::MULTIXACT_MEMBERGROUP_SIZE as u32) as usize
|
||||
}
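A worked check of why the widened arithmetic matters, using the stock PostgreSQL
constants for an 8 KB block; the constant values below are assumptions restated for
the example, not taken from the patch, and the expected value 4780 matches the
regression test further down:

    // Assumed PostgreSQL defaults: 4 members per group, 20-byte groups,
    // 8192 / 20 = 409 member groups per page.
    const MULTIXACT_MEMBERS_PER_MEMBERGROUP: u32 = 4;
    const MULTIXACT_MEMBERGROUPS_PER_PAGE: u32 = 409;
    const MULTIXACT_MEMBERGROUP_SIZE: u32 = 20;

    fn mx_offset_to_flags_offset(xid: u32) -> usize {
        ((xid / MULTIXACT_MEMBERS_PER_MEMBERGROUP)
            % MULTIXACT_MEMBERGROUPS_PER_PAGE
            * MULTIXACT_MEMBERGROUP_SIZE) as usize
    }

    fn main() {
        // 123456789 / 4 = 30864197, which no longer fits in a u16, so casting
        // early truncates it. Keeping the value in u32 gives
        // 30864197 % 409 = 239 and 239 * 20 = 4780.
        assert_eq!(mx_offset_to_flags_offset(123_456_789), 4780);
        println!("flags offset ok");
    }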
|
||||
|
||||
pub fn mx_offset_to_flags_bitshift(xid: MultiXactId) -> u16 {
|
||||
@@ -81,3 +81,41 @@ fn mx_offset_to_member_page(xid: u32) -> u32 {
|
||||
pub fn mx_offset_to_member_segment(xid: u32) -> i32 {
|
||||
(mx_offset_to_member_page(xid) / pg_constants::SLRU_PAGES_PER_SEGMENT) as i32
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_multixid_calc() {
|
||||
// Check that the mx_offset_* functions produce the same values as the
|
||||
// corresponding PostgreSQL C macros (MXOffsetTo*). These test values
|
||||
// were generated by calling the PostgreSQL macros with a little C
|
||||
// program.
|
||||
assert_eq!(mx_offset_to_member_segment(0), 0);
|
||||
assert_eq!(mx_offset_to_member_page(0), 0);
|
||||
assert_eq!(mx_offset_to_flags_offset(0), 0);
|
||||
assert_eq!(mx_offset_to_flags_bitshift(0), 0);
|
||||
assert_eq!(mx_offset_to_member_offset(0), 4);
|
||||
assert_eq!(mx_offset_to_member_segment(1), 0);
|
||||
assert_eq!(mx_offset_to_member_page(1), 0);
|
||||
assert_eq!(mx_offset_to_flags_offset(1), 0);
|
||||
assert_eq!(mx_offset_to_flags_bitshift(1), 8);
|
||||
assert_eq!(mx_offset_to_member_offset(1), 8);
|
||||
assert_eq!(mx_offset_to_member_segment(123456789), 2358);
|
||||
assert_eq!(mx_offset_to_member_page(123456789), 75462);
|
||||
assert_eq!(mx_offset_to_flags_offset(123456789), 4780);
|
||||
assert_eq!(mx_offset_to_flags_bitshift(123456789), 8);
|
||||
assert_eq!(mx_offset_to_member_offset(123456789), 4788);
|
||||
assert_eq!(mx_offset_to_member_segment(u32::MAX - 1), 82040);
|
||||
assert_eq!(mx_offset_to_member_page(u32::MAX - 1), 2625285);
|
||||
assert_eq!(mx_offset_to_flags_offset(u32::MAX - 1), 5160);
|
||||
assert_eq!(mx_offset_to_flags_bitshift(u32::MAX - 1), 16);
|
||||
assert_eq!(mx_offset_to_member_offset(u32::MAX - 1), 5172);
|
||||
assert_eq!(mx_offset_to_member_segment(u32::MAX), 82040);
|
||||
assert_eq!(mx_offset_to_member_page(u32::MAX), 2625285);
|
||||
assert_eq!(mx_offset_to_flags_offset(u32::MAX), 5160);
|
||||
assert_eq!(mx_offset_to_flags_bitshift(u32::MAX), 24);
|
||||
assert_eq!(mx_offset_to_member_offset(u32::MAX), 5176);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1 +0,0 @@
|
||||
|
||||
@@ -52,7 +52,6 @@ impl Conf {
|
||||
match self.pg_version {
|
||||
14 => Ok(path.join(format!("v{}", self.pg_version))),
|
||||
15 => Ok(path.join(format!("v{}", self.pg_version))),
|
||||
16 => Ok(path.join(format!("v{}", self.pg_version))),
|
||||
_ => bail!("Unsupported postgres version: {}", self.pg_version),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -179,7 +179,7 @@ pub struct FeExecuteMessage {
|
||||
#[derive(Debug)]
|
||||
pub struct FeCloseMessage;
|
||||
|
||||
/// An error occured while parsing or serializing raw stream into Postgres
|
||||
/// An error occurred while parsing or serializing raw stream into Postgres
|
||||
/// messages.
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
pub enum ProtocolError {
|
||||
|
||||
@@ -42,6 +42,10 @@ workspace_hack.workspace = true
|
||||
|
||||
const_format.workspace = true
|
||||
|
||||
# to use tokio channels as streams, this is faster to compile than async_stream
|
||||
# why is it only here? no other crate should use it, streams are rarely needed.
|
||||
tokio-stream = { version = "0.1.14" }
|
||||
|
||||
[dev-dependencies]
|
||||
byteorder.workspace = true
|
||||
bytes.workspace = true
|
||||
|
||||
@@ -16,7 +16,7 @@ use crate::id::TenantId;
|
||||
/// Algorithm to use. We require EdDSA.
|
||||
const STORAGE_TOKEN_ALGORITHM: Algorithm = Algorithm::EdDSA;
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize, Clone, PartialEq)]
|
||||
#[derive(Debug, Serialize, Deserialize, Clone, Copy, PartialEq)]
|
||||
#[serde(rename_all = "lowercase")]
|
||||
pub enum Scope {
|
||||
// Provides access to all data for a specific tenant (specified in `struct Claims` below)
|
||||
|
||||
@@ -12,6 +12,13 @@ pub struct Completion(mpsc::Sender<()>);
|
||||
#[derive(Clone)]
|
||||
pub struct Barrier(Arc<Mutex<mpsc::Receiver<()>>>);
|
||||
|
||||
impl Default for Barrier {
|
||||
fn default() -> Self {
|
||||
let (_, rx) = channel();
|
||||
rx
|
||||
}
|
||||
}
|
||||
|
||||
impl Barrier {
|
||||
pub async fn wait(self) {
|
||||
self.0.lock().await.recv().await;
|
||||
@@ -24,6 +31,15 @@ impl Barrier {
|
||||
}
|
||||
}
|
||||
|
||||
impl PartialEq for Barrier {
|
||||
fn eq(&self, other: &Self) -> bool {
|
||||
// we don't use dyn so this is good
|
||||
Arc::ptr_eq(&self.0, &other.0)
|
||||
}
|
||||
}
|
||||
|
||||
impl Eq for Barrier {}
|
||||
|
||||
/// Create new Guard and Barrier pair.
|
||||
pub fn channel() -> (Completion, Barrier) {
|
||||
let (tx, rx) = mpsc::channel::<()>(1);
|
||||
|
||||
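To see how the pair is meant to be consumed (compare the `Stopping { progress }` state
above), here is a stand-alone approximation of the guard/barrier idea using a plain
tokio mpsc channel; it only sketches the `utils::completion` API rather than
reproducing it:

    use std::sync::Arc;
    use std::time::Duration;
    use tokio::sync::{mpsc, Mutex};

    // Rough stand-ins for utils::completion::{Completion, Barrier}.
    struct Completion(#[allow(dead_code)] mpsc::Sender<()>);

    #[derive(Clone)]
    struct Barrier(Arc<Mutex<mpsc::Receiver<()>>>);

    impl Barrier {
        async fn wait(self) {
            // recv() returns None once every Completion (sender) is dropped.
            self.0.lock().await.recv().await;
        }
    }

    fn channel() -> (Completion, Barrier) {
        let (tx, rx) = mpsc::channel::<()>(1);
        (Completion(tx), Barrier(Arc::new(Mutex::new(rx))))
    }

    #[tokio::main]
    async fn main() {
        let (guard, barrier) = channel();

        // A background job holds the guard until its work is done.
        tokio::spawn(async move {
            let _guard = guard;
            tokio::time::sleep(Duration::from_millis(20)).await;
            // _guard dropped here, which wakes the barrier below.
        });

        barrier.wait().await;
        println!("background work finished");
    }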
libs/utils/src/error.rs (new file, 111 lines)
@@ -0,0 +1,111 @@
/// Create a reporter for an error that outputs similarly to [`anyhow::Error`]'s `Display` with the alternate (`{:#}`) setting.
///
/// It can be used with `anyhow::Error` as well.
///
/// Why would one use this instead of converting to `anyhow::Error` on the spot? Because
/// anyhow::Error would also capture a stacktrace on the spot, which you would later discard after
/// formatting.
///
/// ## Usage
///
/// ```rust
/// #[derive(Debug, thiserror::Error)]
/// enum MyCoolError {
///     #[error("should never happen")]
///     Bad(#[source] std::io::Error),
/// }
///
/// # fn failing_call() -> Result<(), MyCoolError> { Err(MyCoolError::Bad(std::io::ErrorKind::PermissionDenied.into())) }
///
/// # fn main() {
/// use utils::error::report_compact_sources;
///
/// if let Err(e) = failing_call() {
///     let e = report_compact_sources(&e);
///     assert_eq!(format!("{e}"), "should never happen: permission denied");
/// }
/// # }
/// ```
///
/// ## TODO
///
/// When we are able to describe return position impl trait in traits, this should of course be an
/// extension trait. Until then avoid boxing with this more awkward interface.
pub fn report_compact_sources<E: std::error::Error>(e: &E) -> impl std::fmt::Display + '_ {
|
||||
struct AnyhowDisplayAlternateAlike<'a, E>(&'a E);
|
||||
|
||||
impl<E: std::error::Error> std::fmt::Display for AnyhowDisplayAlternateAlike<'_, E> {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "{}", self.0)?;
|
||||
|
||||
// why is E a generic parameter here? hope that rustc will see through a default
|
||||
// Error::source implementation and leave the following out if there cannot be any
|
||||
// sources:
|
||||
Sources(self.0.source()).try_for_each(|src| write!(f, ": {}", src))
|
||||
}
|
||||
}
|
||||
|
||||
struct Sources<'a>(Option<&'a (dyn std::error::Error + 'static)>);
|
||||
|
||||
impl<'a> Iterator for Sources<'a> {
|
||||
type Item = &'a (dyn std::error::Error + 'static);
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
let rem = self.0;
|
||||
|
||||
let next = self.0.and_then(|x| x.source());
|
||||
self.0 = next;
|
||||
rem
|
||||
}
|
||||
}
|
||||
|
||||
AnyhowDisplayAlternateAlike(e)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::report_compact_sources;
|
||||
|
||||
#[test]
|
||||
fn report_compact_sources_examples() {
|
||||
use std::fmt::Write;
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
enum EvictionError {
|
||||
#[error("cannot evict a remote layer")]
|
||||
CannotEvictRemoteLayer,
|
||||
#[error("stat failed")]
|
||||
StatFailed(#[source] std::io::Error),
|
||||
#[error("layer was no longer part of LayerMap")]
|
||||
LayerNotFound(#[source] anyhow::Error),
|
||||
}
|
||||
|
||||
let examples = [
|
||||
(
|
||||
line!(),
|
||||
EvictionError::CannotEvictRemoteLayer,
|
||||
"cannot evict a remote layer",
|
||||
),
|
||||
(
|
||||
line!(),
|
||||
EvictionError::StatFailed(std::io::ErrorKind::PermissionDenied.into()),
|
||||
"stat failed: permission denied",
|
||||
),
|
||||
(
|
||||
line!(),
|
||||
EvictionError::LayerNotFound(anyhow::anyhow!("foobar")),
|
||||
"layer was no longer part of LayerMap: foobar",
|
||||
),
|
||||
];
|
||||
|
||||
let mut s = String::new();
|
||||
|
||||
for (line, example, expected) in examples {
|
||||
s.clear();
|
||||
|
||||
write!(s, "{}", report_compact_sources(&example)).expect("string grows");
|
||||
|
||||
assert_eq!(s, expected, "example on line {line}");
|
||||
}
|
||||
}
|
||||
}
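Following the TODO in the doc comment, an extension-trait flavour could eventually
replace the free function; a rough sketch that sidesteps the return-position question
by building an owned String (trait and method names are invented, and thiserror is
assumed as a dependency):

    /// Hypothetical extension trait mirroring `report_compact_sources`.
    pub trait ReportCompact: std::error::Error {
        /// Render "outer: source: source: ..." eagerly into a String.
        fn report_compact(&self) -> String {
            use std::fmt::Write as _;
            let mut out = String::new();
            write!(out, "{}", self).expect("writing to a String cannot fail");
            let mut src = self.source();
            while let Some(cause) = src {
                write!(out, ": {}", cause).expect("writing to a String cannot fail");
                src = cause.source();
            }
            out
        }
    }

    // Blanket impl: every std error type picks the method up automatically.
    impl<E: std::error::Error> ReportCompact for E {}

    fn main() {
        #[derive(Debug, thiserror::Error)]
        #[error("stat failed")]
        struct StatFailed(#[source] std::io::Error);

        let err = StatFailed(std::io::ErrorKind::PermissionDenied.into());
        // Prints the same shape as the doctest above: "stat failed: permission denied".
        println!("{}", err.report_compact());
    }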
|
||||
@@ -24,12 +24,29 @@ pub async fn is_directory_empty(path: impl AsRef<Path>) -> anyhow::Result<bool>
    Ok(dir.next_entry().await?.is_none())
}

pub fn ignore_not_found(e: io::Error) -> io::Result<()> {
    if e.kind() == io::ErrorKind::NotFound {
        Ok(())
    } else {
        Err(e)
    }
}

pub fn ignore_absent_files<F>(fs_operation: F) -> io::Result<()>
where
    F: Fn() -> io::Result<()>,
{
    fs_operation().or_else(ignore_not_found)
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use std::path::PathBuf;
|
||||
|
||||
use crate::fs_ext::is_directory_empty;
|
||||
|
||||
use super::ignore_absent_files;
|
||||
|
||||
#[test]
|
||||
fn is_empty_dir() {
|
||||
use super::PathExt;
|
||||
@@ -75,4 +92,21 @@ mod test {
|
||||
std::fs::remove_file(&file_path).unwrap();
|
||||
assert!(is_directory_empty(file_path).await.is_err());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn ignore_absent_files_works() {
|
||||
let dir = tempfile::tempdir().unwrap();
|
||||
let dir_path = dir.path();
|
||||
|
||||
let file_path: PathBuf = dir_path.join("testfile");
|
||||
|
||||
ignore_absent_files(|| std::fs::remove_file(&file_path)).expect("should execute normally");
|
||||
|
||||
let f = std::fs::File::create(&file_path).unwrap();
|
||||
drop(f);
|
||||
|
||||
ignore_absent_files(|| std::fs::remove_file(&file_path)).expect("should execute normally");
|
||||
|
||||
assert!(!file_path.exists());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -9,7 +9,6 @@ use metrics::{register_int_counter, Encoder, IntCounter, TextEncoder};
|
||||
use once_cell::sync::Lazy;
|
||||
use routerify::ext::RequestExt;
|
||||
use routerify::{Middleware, RequestInfo, Router, RouterBuilder};
|
||||
use tokio::task::JoinError;
|
||||
use tracing::{self, debug, info, info_span, warn, Instrument};
|
||||
|
||||
use std::future::Future;
|
||||
@@ -148,26 +147,140 @@ impl Drop for RequestCancelled {
|
||||
}
|
||||
|
||||
async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
use bytes::{Bytes, BytesMut};
|
||||
use std::io::Write as _;
|
||||
use tokio::sync::mpsc;
|
||||
use tokio_stream::wrappers::ReceiverStream;
|
||||
|
||||
SERVE_METRICS_COUNT.inc();
|
||||
|
||||
let mut buffer = vec![];
|
||||
let encoder = TextEncoder::new();
|
||||
/// An [`std::io::Write`] implementation on top of a channel sending [`bytes::Bytes`] chunks.
|
||||
struct ChannelWriter {
|
||||
buffer: BytesMut,
|
||||
tx: mpsc::Sender<std::io::Result<Bytes>>,
|
||||
written: usize,
|
||||
}
|
||||
|
||||
let metrics = tokio::task::spawn_blocking(move || {
|
||||
// Currently we take a lot of mutexes while collecting metrics, so it's
|
||||
// better to spawn a blocking task to avoid blocking the event loop.
|
||||
metrics::gather()
|
||||
})
|
||||
.await
|
||||
.map_err(|e: JoinError| ApiError::InternalServerError(e.into()))?;
|
||||
encoder.encode(&metrics, &mut buffer).unwrap();
|
||||
impl ChannelWriter {
|
||||
fn new(buf_len: usize, tx: mpsc::Sender<std::io::Result<Bytes>>) -> Self {
|
||||
assert_ne!(buf_len, 0);
|
||||
ChannelWriter {
|
||||
// split about half off the buffer from the start, because we flush depending on
|
||||
// capacity. first flush will come sooner than without this, but now resizes will
|
||||
// have better chance of picking up the "other" half. not guaranteed of course.
|
||||
buffer: BytesMut::with_capacity(buf_len).split_off(buf_len / 2),
|
||||
tx,
|
||||
written: 0,
|
||||
}
|
||||
}
|
||||
|
||||
fn flush0(&mut self) -> std::io::Result<usize> {
|
||||
let n = self.buffer.len();
|
||||
if n == 0 {
|
||||
return Ok(0);
|
||||
}
|
||||
|
||||
tracing::trace!(n, "flushing");
|
||||
let ready = self.buffer.split().freeze();
|
||||
|
||||
// not ideal to call from blocking code to block_on, but we are sure that this
|
||||
// operation does not spawn_blocking other tasks
|
||||
let res: Result<(), ()> = tokio::runtime::Handle::current().block_on(async {
|
||||
self.tx.send(Ok(ready)).await.map_err(|_| ())?;
|
||||
|
||||
// throttle sending to allow reuse of our buffer in `write`.
|
||||
self.tx.reserve().await.map_err(|_| ())?;
|
||||
|
||||
// now the response task has picked up the buffer and hopefully started
|
||||
// sending it to the client.
|
||||
Ok(())
|
||||
});
|
||||
if res.is_err() {
|
||||
return Err(std::io::ErrorKind::BrokenPipe.into());
|
||||
}
|
||||
self.written += n;
|
||||
Ok(n)
|
||||
}
|
||||
|
||||
fn flushed_bytes(&self) -> usize {
|
||||
self.written
|
||||
}
|
||||
}
|
||||
|
||||
impl std::io::Write for ChannelWriter {
|
||||
fn write(&mut self, mut buf: &[u8]) -> std::io::Result<usize> {
|
||||
let remaining = self.buffer.capacity() - self.buffer.len();
|
||||
|
||||
let out_of_space = remaining < buf.len();
|
||||
|
||||
let original_len = buf.len();
|
||||
|
||||
if out_of_space {
|
||||
let can_still_fit = buf.len() - remaining;
|
||||
self.buffer.extend_from_slice(&buf[..can_still_fit]);
|
||||
buf = &buf[can_still_fit..];
|
||||
self.flush0()?;
|
||||
}
|
||||
|
||||
// assume that this will often under normal operation just move the pointer back to the
|
||||
// beginning of allocation, because previous split off parts are already sent and
|
||||
// dropped.
|
||||
self.buffer.extend_from_slice(buf);
|
||||
Ok(original_len)
|
||||
}
|
||||
|
||||
fn flush(&mut self) -> std::io::Result<()> {
|
||||
self.flush0().map(|_| ())
|
||||
}
|
||||
}
|
||||
|
||||
let started_at = std::time::Instant::now();
|
||||
|
||||
let (tx, rx) = mpsc::channel(1);
|
||||
|
||||
let body = Body::wrap_stream(ReceiverStream::new(rx));
|
||||
|
||||
let mut writer = ChannelWriter::new(128 * 1024, tx);
|
||||
|
||||
let encoder = TextEncoder::new();
|
||||
|
||||
let response = Response::builder()
|
||||
.status(200)
|
||||
.header(CONTENT_TYPE, encoder.format_type())
|
||||
.body(Body::from(buffer))
|
||||
.body(body)
|
||||
.unwrap();
|
||||
|
||||
let span = info_span!("blocking");
|
||||
tokio::task::spawn_blocking(move || {
|
||||
let _span = span.entered();
|
||||
let metrics = metrics::gather();
|
||||
let res = encoder
|
||||
.encode(&metrics, &mut writer)
|
||||
.and_then(|_| writer.flush().map_err(|e| e.into()));
|
||||
|
||||
match res {
|
||||
Ok(()) => {
|
||||
tracing::info!(
|
||||
bytes = writer.flushed_bytes(),
|
||||
elapsed_ms = started_at.elapsed().as_millis(),
|
||||
"responded /metrics"
|
||||
);
|
||||
}
|
||||
Err(e) => {
|
||||
tracing::warn!("failed to write out /metrics response: {e:#}");
|
||||
// semantics of this error are quite... unclear. we want to error the stream out to
|
||||
// abort the response to somehow notify the client that we failed.
|
||||
//
|
||||
// though, most likely the reason for failure is that the receiver is already gone.
|
||||
drop(
|
||||
writer
|
||||
.tx
|
||||
.blocking_send(Err(std::io::ErrorKind::BrokenPipe.into())),
|
||||
);
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
Ok(response)
|
||||
}
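The writer-over-a-channel idea above, stripped of the prometheus specifics, looks
roughly like this; a sketch assuming tokio and bytes as dependencies, with
illustrative names:

    use bytes::{Bytes, BytesMut};
    use tokio::sync::mpsc;

    // Minimal std::io::Write impl that ships every flush as one Bytes chunk.
    struct ChunkWriter {
        buf: BytesMut,
        tx: mpsc::Sender<Bytes>,
    }

    impl std::io::Write for ChunkWriter {
        fn write(&mut self, data: &[u8]) -> std::io::Result<usize> {
            self.buf.extend_from_slice(data);
            Ok(data.len())
        }
        fn flush(&mut self) -> std::io::Result<()> {
            if !self.buf.is_empty() {
                let chunk = self.buf.split().freeze();
                self.tx
                    .blocking_send(chunk)
                    .map_err(|_| std::io::Error::from(std::io::ErrorKind::BrokenPipe))?;
            }
            Ok(())
        }
    }

    #[tokio::main]
    async fn main() {
        let (tx, mut rx) = mpsc::channel::<Bytes>(4);

        // Producer runs on the blocking pool, like the encoder in the handler above.
        let producer = tokio::task::spawn_blocking(move || {
            use std::io::Write;
            let mut w = ChunkWriter { buf: BytesMut::new(), tx };
            for i in 0..3 {
                writeln!(w, "metric_line {i}").unwrap();
                w.flush().unwrap();
            }
        });

        // Consumer side: in the handler this role is played by Body::wrap_stream.
        while let Some(chunk) = rx.recv().await {
            print!("{}", String::from_utf8_lossy(&chunk));
        }
        producer.await.unwrap();
    }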
|
||||
|
||||
|
||||
@@ -1,5 +1,7 @@
|
||||
use std::ffi::OsStr;
|
||||
use std::{fmt, str::FromStr};
|
||||
|
||||
use anyhow::Context;
|
||||
use hex::FromHex;
|
||||
use rand::Rng;
|
||||
use serde::{Deserialize, Serialize};
|
||||
@@ -213,6 +215,18 @@ pub struct TimelineId(Id);

id_newtype!(TimelineId);

impl TryFrom<Option<&OsStr>> for TimelineId {
    type Error = anyhow::Error;

    fn try_from(value: Option<&OsStr>) -> Result<Self, Self::Error> {
        value
            .and_then(OsStr::to_str)
            .unwrap_or_default()
            .parse::<TimelineId>()
            .with_context(|| format!("Could not parse timeline id from {:?}", value))
    }
}
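In a directory-walking caller this reads roughly as follows; the path literal is made
up, and the snippet assumes the workspace's utils crate is available:

    use std::path::Path;
    use utils::id::TimelineId;

    fn main() {
        let entry = Path::new("timelines/11223344556677881122334455667788");
        // file_name() yields Option<&OsStr>, which is exactly what the impl accepts.
        match TimelineId::try_from(entry.file_name()) {
            Ok(id) => println!("parsed timeline id {id}"),
            Err(e) => eprintln!("skipping entry: {e:#}"),
        }
    }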
|
||||
|
||||
/// Neon Tenant Id represents identifiar of a particular tenant.
|
||||
/// Is used for distinguishing requests and data belonging to different users.
|
||||
///
|
||||
|
||||
@@ -63,6 +63,9 @@ pub mod rate_limit;
|
||||
/// Simple once-barrier and a guard which keeps barrier awaiting.
|
||||
pub mod completion;
|
||||
|
||||
/// Reporting utilities
|
||||
pub mod error;
|
||||
|
||||
mod failpoint_macro_helpers {
|
||||
|
||||
/// use with fail::cfg("$name", "return(2000)")
|
||||
|
||||
@@ -164,9 +164,7 @@ fn tracing_subscriber_configured() -> bool {
|
||||
tracing::dispatcher::get_default(|d| {
|
||||
// it is possible that this closure will not be invoked, but the current implementation
|
||||
// always invokes it
|
||||
noop_configured = d
|
||||
.downcast_ref::<tracing::subscriber::NoSubscriber>()
|
||||
.is_some();
|
||||
noop_configured = d.is::<tracing::subscriber::NoSubscriber>();
|
||||
});
|
||||
|
||||
!noop_configured
|
||||
|
||||
@@ -35,6 +35,8 @@ humantime-serde.workspace = true
|
||||
hyper.workspace = true
|
||||
itertools.workspace = true
|
||||
nix.workspace = true
|
||||
# hack to get the number of worker threads tokio uses
|
||||
num_cpus = { version = "1.15" }
|
||||
num-traits.workspace = true
|
||||
once_cell.workspace = true
|
||||
pin-project-lite.workspace = true
|
||||
@@ -82,6 +84,7 @@ strum_macros.workspace = true
|
||||
criterion.workspace = true
|
||||
hex-literal.workspace = true
|
||||
tempfile.workspace = true
|
||||
tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time", "test-util"] }
|
||||
|
||||
[[bench]]
|
||||
name = "bench_layer_map"
|
||||
|
||||
@@ -13,6 +13,7 @@ clap = { workspace = true, features = ["string"] }
|
||||
git-version.workspace = true
|
||||
pageserver = { path = ".." }
|
||||
postgres_ffi.workspace = true
|
||||
tokio.workspace = true
|
||||
utils.workspace = true
|
||||
svg_fmt.workspace = true
|
||||
workspace_hack.workspace = true
|
||||
|
||||
@@ -95,7 +95,7 @@ pub(crate) fn parse_filename(name: &str) -> Option<LayerFile> {
|
||||
}
|
||||
|
||||
// Finds the max_holes largest holes, ignoring any that are smaller than MIN_HOLE_LENGTH"
|
||||
fn get_holes(path: &Path, max_holes: usize) -> Result<Vec<Hole>> {
|
||||
async fn get_holes(path: &Path, max_holes: usize) -> Result<Vec<Hole>> {
|
||||
let file = FileBlockReader::new(VirtualFile::open(path)?);
|
||||
let summary_blk = file.read_blk(0)?;
|
||||
let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
|
||||
@@ -129,7 +129,7 @@ fn get_holes(path: &Path, max_holes: usize) -> Result<Vec<Hole>> {
|
||||
Ok(holes)
|
||||
}
|
||||
|
||||
pub(crate) fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> {
|
||||
pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> {
|
||||
let storage_path = &cmd.path;
|
||||
let max_holes = cmd.max_holes.unwrap_or(DEFAULT_MAX_HOLES);
|
||||
|
||||
@@ -160,7 +160,7 @@ pub(crate) fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> {
|
||||
parse_filename(&layer.file_name().into_string().unwrap())
|
||||
{
|
||||
if layer_file.is_delta {
|
||||
layer_file.holes = get_holes(&layer.path(), max_holes)?;
|
||||
layer_file.holes = get_holes(&layer.path(), max_holes).await?;
|
||||
n_deltas += 1;
|
||||
}
|
||||
layers.push(layer_file);
|
||||
|
||||
@@ -43,8 +43,7 @@ pub(crate) enum LayerCmd {
|
||||
},
|
||||
}
|
||||
|
||||
fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
|
||||
use pageserver::tenant::blob_io::BlobCursor;
|
||||
async fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
|
||||
use pageserver::tenant::block_io::BlockReader;
|
||||
|
||||
let path = path.as_ref();
|
||||
@@ -78,7 +77,7 @@ fn read_delta_file(path: impl AsRef<Path>) -> Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) fn main(cmd: &LayerCmd) -> Result<()> {
|
||||
pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
|
||||
match cmd {
|
||||
LayerCmd::List { path } => {
|
||||
for tenant in fs::read_dir(path.join("tenants"))? {
|
||||
@@ -153,7 +152,7 @@ pub(crate) fn main(cmd: &LayerCmd) -> Result<()> {
|
||||
);
|
||||
|
||||
if layer_file.is_delta {
|
||||
read_delta_file(layer.path())?;
|
||||
read_delta_file(layer.path()).await?;
|
||||
} else {
|
||||
anyhow::bail!("not supported yet :(");
|
||||
}
|
||||
|
||||
@@ -72,12 +72,13 @@ struct AnalyzeLayerMapCmd {
|
||||
max_holes: Option<usize>,
|
||||
}
|
||||
|
||||
fn main() -> anyhow::Result<()> {
|
||||
#[tokio::main]
|
||||
async fn main() -> anyhow::Result<()> {
|
||||
let cli = CliOpts::parse();
|
||||
|
||||
match cli.command {
|
||||
Commands::Layer(cmd) => {
|
||||
layers::main(&cmd)?;
|
||||
layers::main(&cmd).await?;
|
||||
}
|
||||
Commands::Metadata(cmd) => {
|
||||
handle_metadata(&cmd)?;
|
||||
@@ -86,7 +87,7 @@ fn main() -> anyhow::Result<()> {
|
||||
draw_timeline_dir::main()?;
|
||||
}
|
||||
Commands::AnalyzeLayerMap(cmd) => {
|
||||
layer_map_analyzer::main(&cmd)?;
|
||||
layer_map_analyzer::main(&cmd).await?;
|
||||
}
|
||||
Commands::PrintLayerFile(cmd) => {
|
||||
if let Err(e) = read_pg_control_file(&cmd.path) {
|
||||
@@ -94,7 +95,7 @@ fn main() -> anyhow::Result<()> {
|
||||
"Failed to read input file as a pg control one: {e:#}\n\
|
||||
Attempting to read it as layer file"
|
||||
);
|
||||
print_layerfile(&cmd.path)?;
|
||||
print_layerfile(&cmd.path).await?;
|
||||
}
|
||||
}
|
||||
};
|
||||
@@ -113,12 +114,12 @@ fn read_pg_control_file(control_file_path: &Path) -> anyhow::Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn print_layerfile(path: &Path) -> anyhow::Result<()> {
|
||||
async fn print_layerfile(path: &Path) -> anyhow::Result<()> {
|
||||
// Basic initialization of things that don't change after startup
|
||||
virtual_file::init(10);
|
||||
page_cache::init(100);
|
||||
let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
|
||||
dump_layerfile_from_path(path, true, &ctx)
|
||||
dump_layerfile_from_path(path, true, &ctx).await
|
||||
}
|
||||
|
||||
fn handle_metadata(
|
||||
|
||||
@@ -19,12 +19,6 @@ use tokio::io;
|
||||
use tokio::io::AsyncWrite;
|
||||
use tracing::*;
|
||||
|
||||
/// NB: This relies on a modified version of tokio_tar that does *not* write the
|
||||
/// end-of-archive marker (1024 zero bytes), when the Builder struct is dropped
|
||||
/// without explicitly calling 'finish' or 'into_inner'!
|
||||
///
|
||||
/// See https://github.com/neondatabase/tokio-tar/pull/1
|
||||
///
|
||||
use tokio_tar::{Builder, EntryType, Header};
|
||||
|
||||
use crate::context::RequestContext;
|
||||
|
||||
@@ -396,8 +396,8 @@ fn start_pageserver(
|
||||
|
||||
let guard = scopeguard::guard_on_success((), |_| tracing::info!("Cancelled before initial logical sizes completed"));
|
||||
|
||||
let init_sizes_done = tokio::select! {
|
||||
_ = &mut init_sizes_done => {
|
||||
let init_sizes_done = match tokio::time::timeout(timeout, &mut init_sizes_done).await {
|
||||
Ok(_) => {
|
||||
let now = std::time::Instant::now();
|
||||
tracing::info!(
|
||||
from_init_done_millis = (now - init_done).as_millis(),
|
||||
@@ -406,7 +406,7 @@ fn start_pageserver(
|
||||
);
|
||||
None
|
||||
}
|
||||
_ = tokio::time::sleep(timeout) => {
|
||||
Err(_) => {
|
||||
tracing::info!(
|
||||
timeout_millis = timeout.as_millis(),
|
||||
"Initial logical size timeout elapsed; starting background jobs"
|
||||
|
||||
@@ -33,7 +33,8 @@ use crate::tenant::config::TenantConf;
|
||||
use crate::tenant::config::TenantConfOpt;
|
||||
use crate::tenant::{TENANT_ATTACHING_MARKER_FILENAME, TIMELINES_SEGMENT_NAME};
|
||||
use crate::{
|
||||
IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TIMELINE_UNINIT_MARK_SUFFIX,
|
||||
IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX,
|
||||
TIMELINE_UNINIT_MARK_SUFFIX,
|
||||
};
|
||||
|
||||
pub mod defaults {
|
||||
@@ -601,6 +602,17 @@ impl PageServerConf {
|
||||
)
|
||||
}
|
||||
|
||||
pub fn timeline_delete_mark_file_path(
|
||||
&self,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
) -> PathBuf {
|
||||
path_with_suffix_extension(
|
||||
self.timeline_path(&tenant_id, &timeline_id),
|
||||
TIMELINE_DELETE_MARK_SUFFIX,
|
||||
)
|
||||
}
|
||||
|
||||
pub fn traces_path(&self) -> PathBuf {
|
||||
self.workdir.join("traces")
|
||||
}
|
||||
@@ -655,7 +667,6 @@ impl PageServerConf {
|
||||
match pg_version {
|
||||
14 => Ok(path.join(format!("v{pg_version}"))),
|
||||
15 => Ok(path.join(format!("v{pg_version}"))),
|
||||
16 => Ok(path.join(format!("v{pg_version}"))),
|
||||
_ => bail!("Unsupported postgres version: {}", pg_version),
|
||||
}
|
||||
}
|
||||
@@ -664,7 +675,6 @@ impl PageServerConf {
|
||||
match pg_version {
|
||||
14 => Ok(self.pg_distrib_dir(pg_version)?.join("bin")),
|
||||
15 => Ok(self.pg_distrib_dir(pg_version)?.join("bin")),
|
||||
16 => Ok(self.pg_distrib_dir(pg_version)?.join("bin")),
|
||||
_ => bail!("Unsupported postgres version: {}", pg_version),
|
||||
}
|
||||
}
|
||||
@@ -672,7 +682,6 @@ impl PageServerConf {
|
||||
match pg_version {
|
||||
14 => Ok(self.pg_distrib_dir(pg_version)?.join("lib")),
|
||||
15 => Ok(self.pg_distrib_dir(pg_version)?.join("lib")),
|
||||
16 => Ok(self.pg_distrib_dir(pg_version)?.join("lib")),
|
||||
_ => bail!("Unsupported postgres version: {}", pg_version),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -60,7 +60,7 @@ use utils::serde_percent::Percent;
|
||||
use crate::{
|
||||
config::PageServerConf,
|
||||
task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
|
||||
tenant::{self, storage_layer::PersistentLayer, Timeline},
|
||||
tenant::{self, storage_layer::PersistentLayer, timeline::EvictionError, Timeline},
|
||||
};
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
|
||||
@@ -166,11 +166,11 @@ async fn disk_usage_eviction_task(
        .await;

    let sleep_until = start + task_config.period;
    tokio::select! {
        _ = tokio::time::sleep_until(sleep_until) => {},
        _ = cancel.cancelled() => {
            break
        }
    if tokio::time::timeout_at(sleep_until, cancel.cancelled())
        .await
        .is_ok()
    {
        break;
    }
    }
}
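The same idiom in isolation: `timeout_at` resolves to Ok when the wrapped cancellation
future wins and to Err(Elapsed) when the deadline does, so "sleep for the period unless
cancelled" collapses into one call. A sketch assuming tokio with the time feature,
using a oneshot channel as the cancellation signal:

    use std::time::Duration;
    use tokio::time::{timeout_at, Instant};

    #[tokio::main]
    async fn main() {
        let (cancel_tx, mut cancel_rx) = tokio::sync::oneshot::channel::<()>();

        // Pretend something requests shutdown shortly after startup.
        tokio::spawn(async move {
            tokio::time::sleep(Duration::from_millis(30)).await;
            let _ = cancel_tx.send(());
        });

        let period = Duration::from_millis(100);
        loop {
            let sleep_until = Instant::now() + period;
            // ... one iteration of the periodic work would run here ...

            // Ok(_)  => cancellation fired before the deadline: stop.
            // Err(_) => the full period elapsed: go around again.
            if timeout_at(sleep_until, &mut cancel_rx).await.is_ok() {
                println!("cancelled, leaving the loop");
                break;
            }
            println!("tick");
        }
    }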
|
||||
@@ -390,13 +390,22 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
|
||||
assert_eq!(results.len(), batch.len());
|
||||
for (result, layer) in results.into_iter().zip(batch.iter()) {
|
||||
match result {
|
||||
Some(Ok(true)) => {
|
||||
Some(Ok(())) => {
|
||||
usage_assumed.add_available_bytes(layer.file_size());
|
||||
}
|
||||
Some(Ok(false)) => {
|
||||
// this is:
|
||||
// - Replacement::{NotFound, Unexpected}
|
||||
// - it cannot be is_remote_layer, filtered already
|
||||
Some(Err(EvictionError::CannotEvictRemoteLayer)) => {
|
||||
unreachable!("get_local_layers_for_disk_usage_eviction finds only local layers")
|
||||
}
|
||||
Some(Err(EvictionError::FileNotFound)) => {
|
||||
evictions_failed.file_sizes += layer.file_size();
|
||||
evictions_failed.count += 1;
|
||||
}
|
||||
Some(Err(
|
||||
e @ EvictionError::LayerNotFound(_)
|
||||
| e @ EvictionError::StatFailed(_),
|
||||
)) => {
|
||||
let e = utils::error::report_compact_sources(&e);
|
||||
warn!(%layer, "failed to evict layer: {e}");
|
||||
evictions_failed.file_sizes += layer.file_size();
|
||||
evictions_failed.count += 1;
|
||||
}
|
||||
@@ -404,10 +413,6 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
|
||||
assert!(cancel.is_cancelled());
|
||||
return;
|
||||
}
|
||||
Some(Err(e)) => {
|
||||
// we really shouldn't be getting this, precondition failure
|
||||
error!("failed to evict layer: {:#}", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -540,12 +545,12 @@ async fn collect_eviction_candidates(
|
||||
// We could be better here, e.g., sum of all L0 layers + most recent L1 layer.
|
||||
// That's what's typically used by the various background loops.
|
||||
//
|
||||
// The default can be overriden with a fixed value in the tenant conf.
|
||||
// The default can be overridden with a fixed value in the tenant conf.
|
||||
// A default override can be put in the default tenant conf in the pageserver.toml.
|
||||
let min_resident_size = if let Some(s) = tenant.get_min_resident_size_override() {
|
||||
debug!(
|
||||
tenant_id=%tenant.tenant_id(),
|
||||
overriden_size=s,
|
||||
overridden_size=s,
|
||||
"using overridden min resident size for tenant"
|
||||
);
|
||||
s
|
||||
|
||||
@@ -994,31 +994,29 @@ async fn timeline_gc_handler(
|
||||
// Run compaction immediately on given timeline.
|
||||
async fn timeline_compact_handler(
|
||||
request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
let result_receiver = mgr::immediate_compact(tenant_id, timeline_id, &ctx)
|
||||
.await
|
||||
.context("spawn compaction task")
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
let result: anyhow::Result<()> = result_receiver
|
||||
.await
|
||||
.context("receive compaction result")
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
result.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
async {
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
|
||||
timeline
|
||||
.compact(&cancel, &ctx)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
.instrument(info_span!("manual_compaction", %tenant_id, %timeline_id))
|
||||
.await
|
||||
}
|
||||
|
||||
// Run checkpoint immediately on given timeline.
|
||||
async fn timeline_checkpoint_handler(
|
||||
request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
@@ -1031,13 +1029,13 @@ async fn timeline_checkpoint_handler(
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
timeline
|
||||
.compact(&ctx)
|
||||
.compact(&cancel, &ctx)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
.instrument(info_span!("manual_checkpoint", tenant_id = %tenant_id, timeline_id = %timeline_id))
|
||||
.instrument(info_span!("manual_checkpoint", %tenant_id, %timeline_id))
|
||||
.await
|
||||
}
|
||||
|
||||
|
||||
@@ -109,6 +109,8 @@ pub const TEMP_FILE_SUFFIX: &str = "___temp";
|
||||
/// Full path: `tenants/<tenant_id>/timelines/<timeline_id>___uninit`.
|
||||
pub const TIMELINE_UNINIT_MARK_SUFFIX: &str = "___uninit";
|
||||
|
||||
pub const TIMELINE_DELETE_MARK_SUFFIX: &str = "___delete";
|
||||
|
||||
/// A marker file to prevent pageserver from loading a certain tenant on restart.
|
||||
/// Different from [`TIMELINE_UNINIT_MARK_SUFFIX`] due to semantics of the corresponding
|
||||
/// `ignore` management API command, that expects the ignored tenant to be properly loaded
|
||||
@@ -123,15 +125,30 @@ pub fn is_temporary(path: &Path) -> bool {
    }
}

pub fn is_uninit_mark(path: &Path) -> bool {
fn ends_with_suffix(path: &Path, suffix: &str) -> bool {
    match path.file_name() {
        Some(name) => name
            .to_string_lossy()
            .ends_with(TIMELINE_UNINIT_MARK_SUFFIX),
        Some(name) => name.to_string_lossy().ends_with(suffix),
        None => false,
    }
}

pub fn is_uninit_mark(path: &Path) -> bool {
    ends_with_suffix(path, TIMELINE_UNINIT_MARK_SUFFIX)
}

pub fn is_delete_mark(path: &Path) -> bool {
    ends_with_suffix(path, TIMELINE_DELETE_MARK_SUFFIX)
}
|
||||
|
||||
fn is_walkdir_io_not_found(e: &walkdir::Error) -> bool {
|
||||
if let Some(e) = e.io_error() {
|
||||
if e.kind() == std::io::ErrorKind::NotFound {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
false
|
||||
}
|
||||
|
||||
/// During pageserver startup, we need to order operations not to exhaust tokio worker threads by
|
||||
/// blocking.
|
||||
///
|
||||
|
||||
@@ -6,7 +6,6 @@ use metrics::{
|
||||
IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
|
||||
};
|
||||
use once_cell::sync::Lazy;
|
||||
use pageserver_api::models::TenantState;
|
||||
use strum::VariantNames;
|
||||
use strum_macros::{EnumVariantNames, IntoStaticStr};
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
@@ -74,7 +73,7 @@ pub static STORAGE_TIME_COUNT_PER_TIMELINE: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
// Buckets for background operations like compaction, GC, size calculation
|
||||
const STORAGE_OP_BUCKETS: &[f64] = &[0.010, 0.100, 1.0, 10.0, 100.0, 1000.0];
|
||||
|
||||
pub static STORAGE_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
pub(crate) static STORAGE_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
register_histogram_vec!(
|
||||
"pageserver_storage_operations_seconds_global",
|
||||
"Time spent on storage operations",
|
||||
@@ -84,18 +83,17 @@ pub static STORAGE_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
static READ_NUM_FS_LAYERS: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
register_histogram_vec!(
|
||||
pub(crate) static READ_NUM_FS_LAYERS: Lazy<Histogram> = Lazy::new(|| {
|
||||
register_histogram!(
|
||||
"pageserver_read_num_fs_layers",
|
||||
"Number of persistent layers accessed for processing a read request, including those in the cache",
|
||||
&["tenant_id", "timeline_id"],
|
||||
vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 10.0, 20.0, 50.0, 100.0],
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
// Metrics collected on operations on the storage repository.
|
||||
pub static RECONSTRUCT_TIME: Lazy<Histogram> = Lazy::new(|| {
|
||||
pub(crate) static RECONSTRUCT_TIME: Lazy<Histogram> = Lazy::new(|| {
|
||||
register_histogram!(
|
||||
"pageserver_getpage_reconstruct_seconds",
|
||||
"Time spent in reconstruct_value (reconstruct a page from deltas)",
|
||||
@@ -104,7 +102,7 @@ pub static RECONSTRUCT_TIME: Lazy<Histogram> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub static MATERIALIZED_PAGE_CACHE_HIT_DIRECT: Lazy<IntCounter> = Lazy::new(|| {
|
||||
pub(crate) static MATERIALIZED_PAGE_CACHE_HIT_DIRECT: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"pageserver_materialized_cache_hits_direct_total",
|
||||
"Number of cache hits from materialized page cache without redo",
|
||||
@@ -112,17 +110,16 @@ pub static MATERIALIZED_PAGE_CACHE_HIT_DIRECT: Lazy<IntCounter> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
static GET_RECONSTRUCT_DATA_TIME: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
register_histogram_vec!(
|
||||
pub(crate) static GET_RECONSTRUCT_DATA_TIME: Lazy<Histogram> = Lazy::new(|| {
|
||||
register_histogram!(
|
||||
"pageserver_getpage_get_reconstruct_data_seconds",
|
||||
"Time spent in get_reconstruct_value_data",
|
||||
&["tenant_id", "timeline_id"],
|
||||
CRITICAL_OP_BUCKETS.into(),
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub static MATERIALIZED_PAGE_CACHE_HIT: Lazy<IntCounter> = Lazy::new(|| {
|
||||
pub(crate) static MATERIALIZED_PAGE_CACHE_HIT: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"pageserver_materialized_cache_hits_total",
|
||||
"Number of cache hits from materialized page cache",
|
||||
@@ -246,11 +243,10 @@ pub static PAGE_CACHE_SIZE: Lazy<PageCacheSizeMetrics> = Lazy::new(|| PageCacheS
|
||||
},
|
||||
});
|
||||
|
||||
static WAIT_LSN_TIME: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
register_histogram_vec!(
|
||||
pub(crate) static WAIT_LSN_TIME: Lazy<Histogram> = Lazy::new(|| {
|
||||
register_histogram!(
|
||||
"pageserver_wait_lsn_seconds",
|
||||
"Time spent waiting for WAL to arrive",
|
||||
&["tenant_id", "timeline_id"],
|
||||
CRITICAL_OP_BUCKETS.into(),
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
@@ -284,7 +280,7 @@ static REMOTE_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub static REMOTE_ONDEMAND_DOWNLOADED_LAYERS: Lazy<IntCounter> = Lazy::new(|| {
|
||||
pub(crate) static REMOTE_ONDEMAND_DOWNLOADED_LAYERS: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"pageserver_remote_ondemand_downloaded_layers_total",
|
||||
"Total on-demand downloaded layers"
|
||||
@@ -292,7 +288,7 @@ pub static REMOTE_ONDEMAND_DOWNLOADED_LAYERS: Lazy<IntCounter> = Lazy::new(|| {
|
||||
.unwrap()
|
||||
});
|
||||
|
||||
pub static REMOTE_ONDEMAND_DOWNLOADED_BYTES: Lazy<IntCounter> = Lazy::new(|| {
|
||||
pub(crate) static REMOTE_ONDEMAND_DOWNLOADED_BYTES: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"pageserver_remote_ondemand_downloaded_bytes_total",
|
||||
"Total bytes of layers on-demand downloaded",
|
||||
@@ -309,16 +305,29 @@ static CURRENT_LOGICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
.expect("failed to define current logical size metric")
|
||||
});
|
||||
|
||||
pub static TENANT_STATE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
pub(crate) static TENANT_STATE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
register_uint_gauge_vec!(
|
||||
"pageserver_tenant_states_count",
|
||||
"Count of tenants per state",
|
||||
&["tenant_id", "state"]
|
||||
&["state"]
|
||||
)
|
||||
.expect("Failed to register pageserver_tenant_states_count metric")
|
||||
});
|
||||
|
||||
pub static TENANT_SYNTHETIC_SIZE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
/// A set of broken tenants.
|
||||
///
|
||||
/// These are expected to be so rare that a set is fine. Set as in a new timeseries per each broken
|
||||
/// tenant.
|
||||
pub(crate) static BROKEN_TENANTS_SET: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
register_uint_gauge_vec!(
|
||||
"pageserver_broken_tenants_count",
|
||||
"Set of broken tenants",
|
||||
&["tenant_id"]
|
||||
)
|
||||
.expect("Failed to register pageserver_tenant_states_count metric")
|
||||
});
|
||||
|
||||
pub(crate) static TENANT_SYNTHETIC_SIZE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
register_uint_gauge_vec!(
|
||||
"pageserver_tenant_synthetic_cached_size_bytes",
|
||||
"Synthetic size of each tenant in bytes",
|
||||
@@ -376,7 +385,7 @@ static EVICTIONS_WITH_LOW_RESIDENCE_DURATION: Lazy<IntCounterVec> = Lazy::new(||
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub static UNEXPECTED_ONDEMAND_DOWNLOADS: Lazy<IntCounter> = Lazy::new(|| {
|
||||
pub(crate) static UNEXPECTED_ONDEMAND_DOWNLOADS: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"pageserver_unexpected_ondemand_downloads_count",
|
||||
"Number of unexpected on-demand downloads. \
|
||||
@@ -499,23 +508,31 @@ const STORAGE_IO_TIME_BUCKETS: &[f64] = &[
|
||||
30.000, // 30000 ms
|
||||
];
|
||||
|
||||
const STORAGE_IO_TIME_OPERATIONS: &[&str] = &[
|
||||
"open", "close", "read", "write", "seek", "fsync", "gc", "metadata",
|
||||
];
|
||||
|
||||
const STORAGE_IO_SIZE_OPERATIONS: &[&str] = &["read", "write"];
|
||||
|
||||
pub static STORAGE_IO_TIME: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
/// Tracks time taken by fs operations near VirtualFile.
|
||||
///
|
||||
/// Operations:
|
||||
/// - open ([`std::fs::OpenOptions::open`])
|
||||
/// - close (dropping [`std::fs::File`])
|
||||
/// - close-by-replace (close by replacement algorithm)
|
||||
/// - read (`read_at`)
|
||||
/// - write (`write_at`)
|
||||
/// - seek (modify internal position or file length query)
|
||||
/// - fsync ([`std::fs::File::sync_all`])
|
||||
/// - metadata ([`std::fs::File::metadata`])
|
||||
pub(crate) static STORAGE_IO_TIME: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
register_histogram_vec!(
|
||||
"pageserver_io_operations_seconds",
|
||||
"Time spent in IO operations",
|
||||
&["operation", "tenant_id", "timeline_id"],
|
||||
&["operation"],
|
||||
STORAGE_IO_TIME_BUCKETS.into()
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub static STORAGE_IO_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
|
||||
const STORAGE_IO_SIZE_OPERATIONS: &[&str] = &["read", "write"];
|
||||
|
||||
// Needed for the https://neonprod.grafana.net/d/5uK9tHL4k/picking-tenant-for-relocation?orgId=1
|
||||
pub(crate) static STORAGE_IO_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
|
||||
register_int_gauge_vec!(
|
||||
"pageserver_io_operations_bytes_total",
|
||||
"Total amount of bytes read/written in IO operations",
|
||||
@@ -605,7 +622,7 @@ static REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST: Lazy<HistogramVec> = Lazy::new
|
||||
at a given instant. It gives you a better idea of the queue depth \
|
||||
than plotting the gauge directly, since operations may complete faster \
|
||||
than the sampling interval.",
|
||||
&["tenant_id", "timeline_id", "file_kind", "op_kind"],
|
||||
&["file_kind", "op_kind"],
|
||||
// The calls_unfinished gauge is an integer gauge, hence we have integer buckets.
|
||||
vec![0.0, 1.0, 2.0, 4.0, 6.0, 8.0, 10.0, 15.0, 20.0, 40.0, 60.0, 80.0, 100.0, 500.0],
|
||||
)
|
||||
@@ -662,18 +679,18 @@ impl RemoteOpFileKind {
|
||||
}
|
||||
}
|
||||
|
||||
pub static REMOTE_OPERATION_TIME: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
pub(crate) static REMOTE_OPERATION_TIME: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
register_histogram_vec!(
|
||||
"pageserver_remote_operation_seconds",
|
||||
"Time spent on remote storage operations. \
|
||||
Grouped by tenant, timeline, operation_kind and status. \
|
||||
Does not account for time spent waiting in remote timeline client's queues.",
|
||||
&["tenant_id", "timeline_id", "file_kind", "op_kind", "status"]
|
||||
&["file_kind", "op_kind", "status"]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub static TENANT_TASK_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
pub(crate) static TENANT_TASK_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"pageserver_tenant_task_events",
|
||||
"Number of task start/stop/fail events.",
|
||||
@@ -682,7 +699,7 @@ pub static TENANT_TASK_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
.expect("Failed to register tenant_task_events metric")
|
||||
});
|
||||
|
||||
pub static BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
pub(crate) static BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"pageserver_background_loop_period_overrun_count",
|
||||
"Incremented whenever warn_when_period_overrun() logs a warning.",
|
||||
@@ -693,7 +710,7 @@ pub static BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT: Lazy<IntCounterVec> = Lazy::new
|
||||
|
||||
// walreceiver metrics
|
||||
|
||||
pub static WALRECEIVER_STARTED_CONNECTIONS: Lazy<IntCounter> = Lazy::new(|| {
|
||||
pub(crate) static WALRECEIVER_STARTED_CONNECTIONS: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"pageserver_walreceiver_started_connections_total",
|
||||
"Number of started walreceiver connections"
|
||||
@@ -701,7 +718,7 @@ pub static WALRECEIVER_STARTED_CONNECTIONS: Lazy<IntCounter> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub static WALRECEIVER_ACTIVE_MANAGERS: Lazy<IntGauge> = Lazy::new(|| {
|
||||
pub(crate) static WALRECEIVER_ACTIVE_MANAGERS: Lazy<IntGauge> = Lazy::new(|| {
|
||||
register_int_gauge!(
|
||||
"pageserver_walreceiver_active_managers",
|
||||
"Number of active walreceiver managers"
|
||||
@@ -709,7 +726,7 @@ pub static WALRECEIVER_ACTIVE_MANAGERS: Lazy<IntGauge> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub static WALRECEIVER_SWITCHES: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
pub(crate) static WALRECEIVER_SWITCHES: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"pageserver_walreceiver_switches_total",
|
||||
"Number of walreceiver manager change_connection calls",
|
||||
@@ -718,7 +735,7 @@ pub static WALRECEIVER_SWITCHES: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub static WALRECEIVER_BROKER_UPDATES: Lazy<IntCounter> = Lazy::new(|| {
|
||||
pub(crate) static WALRECEIVER_BROKER_UPDATES: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"pageserver_walreceiver_broker_updates_total",
|
||||
"Number of received broker updates in walreceiver"
|
||||
@@ -726,7 +743,7 @@ pub static WALRECEIVER_BROKER_UPDATES: Lazy<IntCounter> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub static WALRECEIVER_CANDIDATES_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
pub(crate) static WALRECEIVER_CANDIDATES_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"pageserver_walreceiver_candidates_events_total",
|
||||
"Number of walreceiver candidate events",
|
||||
@@ -735,10 +752,10 @@ pub static WALRECEIVER_CANDIDATES_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub static WALRECEIVER_CANDIDATES_ADDED: Lazy<IntCounter> =
|
||||
pub(crate) static WALRECEIVER_CANDIDATES_ADDED: Lazy<IntCounter> =
|
||||
Lazy::new(|| WALRECEIVER_CANDIDATES_EVENTS.with_label_values(&["add"]));
|
||||
|
||||
pub static WALRECEIVER_CANDIDATES_REMOVED: Lazy<IntCounter> =
|
||||
pub(crate) static WALRECEIVER_CANDIDATES_REMOVED: Lazy<IntCounter> =
|
||||
Lazy::new(|| WALRECEIVER_CANDIDATES_EVENTS.with_label_values(&["remove"]));
|
||||
|
||||
// Metrics collected on WAL redo operations
|
||||
@@ -785,7 +802,7 @@ macro_rules! redo_bytes_histogram_count_buckets {
|
||||
};
|
||||
}
|
||||
|
||||
pub static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
|
||||
pub(crate) static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
|
||||
register_histogram!(
|
||||
"pageserver_wal_redo_seconds",
|
||||
"Time spent on WAL redo",
|
||||
@@ -794,7 +811,7 @@ pub static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub static WAL_REDO_WAIT_TIME: Lazy<Histogram> = Lazy::new(|| {
|
||||
pub(crate) static WAL_REDO_WAIT_TIME: Lazy<Histogram> = Lazy::new(|| {
|
||||
register_histogram!(
|
||||
"pageserver_wal_redo_wait_seconds",
|
||||
"Time spent waiting for access to the Postgres WAL redo process",
|
||||
@@ -803,7 +820,7 @@ pub static WAL_REDO_WAIT_TIME: Lazy<Histogram> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub static WAL_REDO_RECORDS_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
|
||||
pub(crate) static WAL_REDO_RECORDS_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
|
||||
register_histogram!(
|
||||
"pageserver_wal_redo_records_histogram",
|
||||
"Histogram of number of records replayed per redo in the Postgres WAL redo process",
|
||||
@@ -812,7 +829,7 @@ pub static WAL_REDO_RECORDS_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub static WAL_REDO_BYTES_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
|
||||
pub(crate) static WAL_REDO_BYTES_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
|
||||
register_histogram!(
|
||||
"pageserver_wal_redo_bytes_histogram",
|
||||
"Histogram of number of records replayed per redo sent to Postgres",
|
||||
@@ -821,7 +838,8 @@ pub static WAL_REDO_BYTES_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub static WAL_REDO_RECORD_COUNTER: Lazy<IntCounter> = Lazy::new(|| {
|
||||
// FIXME: isn't this already included by WAL_REDO_RECORDS_HISTOGRAM which has _count?
|
||||
pub(crate) static WAL_REDO_RECORD_COUNTER: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"pageserver_replayed_wal_records_total",
|
||||
"Number of WAL records replayed in WAL redo process"
|
||||
@@ -897,7 +915,6 @@ impl StorageTimeMetrics {
|
||||
pub struct TimelineMetrics {
|
||||
tenant_id: String,
|
||||
timeline_id: String,
|
||||
pub get_reconstruct_data_time_histo: Histogram,
|
||||
pub flush_time_histo: StorageTimeMetrics,
|
||||
pub compact_time_histo: StorageTimeMetrics,
|
||||
pub create_images_time_histo: StorageTimeMetrics,
|
||||
@@ -906,9 +923,7 @@ pub struct TimelineMetrics {
|
||||
pub load_layer_map_histo: StorageTimeMetrics,
|
||||
pub garbage_collect_histo: StorageTimeMetrics,
|
||||
pub last_record_gauge: IntGauge,
|
||||
pub wait_lsn_time_histo: Histogram,
|
||||
pub resident_physical_size_gauge: UIntGauge,
|
||||
pub read_num_fs_layers: Histogram,
|
||||
/// copy of LayeredTimeline.current_logical_size
|
||||
pub current_logical_size_gauge: UIntGauge,
|
||||
pub num_persistent_files_created: IntCounter,
|
||||
@@ -925,9 +940,6 @@ impl TimelineMetrics {
|
||||
) -> Self {
|
||||
let tenant_id = tenant_id.to_string();
|
||||
let timeline_id = timeline_id.to_string();
|
||||
let get_reconstruct_data_time_histo = GET_RECONSTRUCT_DATA_TIME
|
||||
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
|
||||
.unwrap();
|
||||
let flush_time_histo =
|
||||
StorageTimeMetrics::new(StorageTimeOperation::LayerFlush, &tenant_id, &timeline_id);
|
||||
let compact_time_histo =
|
||||
@@ -948,9 +960,6 @@ impl TimelineMetrics {
|
||||
let last_record_gauge = LAST_RECORD_LSN
|
||||
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
|
||||
.unwrap();
|
||||
let wait_lsn_time_histo = WAIT_LSN_TIME
|
||||
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
|
||||
.unwrap();
|
||||
let resident_physical_size_gauge = RESIDENT_PHYSICAL_SIZE
|
||||
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
|
||||
.unwrap();
|
||||
@@ -966,16 +975,12 @@ impl TimelineMetrics {
|
||||
let evictions = EVICTIONS
|
||||
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
|
||||
.unwrap();
|
||||
let read_num_fs_layers = READ_NUM_FS_LAYERS
|
||||
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
|
||||
.unwrap();
|
||||
let evictions_with_low_residence_duration =
|
||||
evictions_with_low_residence_duration_builder.build(&tenant_id, &timeline_id);
|
||||
|
||||
TimelineMetrics {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
get_reconstruct_data_time_histo,
|
||||
flush_time_histo,
|
||||
compact_time_histo,
|
||||
create_images_time_histo,
|
||||
@@ -984,7 +989,6 @@ impl TimelineMetrics {
|
||||
garbage_collect_histo,
|
||||
load_layer_map_histo,
|
||||
last_record_gauge,
|
||||
wait_lsn_time_histo,
|
||||
resident_physical_size_gauge,
|
||||
current_logical_size_gauge,
|
||||
num_persistent_files_created,
|
||||
@@ -993,7 +997,6 @@ impl TimelineMetrics {
|
||||
evictions_with_low_residence_duration: std::sync::RwLock::new(
|
||||
evictions_with_low_residence_duration,
|
||||
),
|
||||
read_num_fs_layers,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1002,15 +1005,12 @@ impl Drop for TimelineMetrics {
|
||||
fn drop(&mut self) {
|
||||
let tenant_id = &self.tenant_id;
|
||||
let timeline_id = &self.timeline_id;
|
||||
let _ = GET_RECONSTRUCT_DATA_TIME.remove_label_values(&[tenant_id, timeline_id]);
|
||||
let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, timeline_id]);
|
||||
let _ = WAIT_LSN_TIME.remove_label_values(&[tenant_id, timeline_id]);
|
||||
let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
|
||||
let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
|
||||
let _ = NUM_PERSISTENT_FILES_CREATED.remove_label_values(&[tenant_id, timeline_id]);
|
||||
let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, timeline_id]);
|
||||
let _ = EVICTIONS.remove_label_values(&[tenant_id, timeline_id]);
|
||||
let _ = READ_NUM_FS_LAYERS.remove_label_values(&[tenant_id, timeline_id]);
|
||||
|
||||
self.evictions_with_low_residence_duration
|
||||
.write()
|
||||
@@ -1022,9 +1022,6 @@ impl Drop for TimelineMetrics {
|
||||
let _ =
|
||||
STORAGE_TIME_COUNT_PER_TIMELINE.remove_label_values(&[op, tenant_id, timeline_id]);
|
||||
}
|
||||
for op in STORAGE_IO_TIME_OPERATIONS {
|
||||
let _ = STORAGE_IO_TIME.remove_label_values(&[op, tenant_id, timeline_id]);
|
||||
}
|
||||
|
||||
for op in STORAGE_IO_SIZE_OPERATIONS {
|
||||
let _ = STORAGE_IO_SIZE.remove_label_values(&[op, tenant_id, timeline_id]);
|
||||
@@ -1039,9 +1036,7 @@ impl Drop for TimelineMetrics {
|
||||
pub fn remove_tenant_metrics(tenant_id: &TenantId) {
|
||||
let tid = tenant_id.to_string();
|
||||
let _ = TENANT_SYNTHETIC_SIZE_METRIC.remove_label_values(&[&tid]);
|
||||
for state in TenantState::VARIANTS {
|
||||
let _ = TENANT_STATE_METRIC.remove_label_values(&[&tid, state]);
|
||||
}
|
||||
// we leave the BROKEN_TENANTS_SET entry if any
|
||||
}
|
||||
|
||||
use futures::Future;
|
||||
@@ -1056,9 +1051,7 @@ pub struct RemoteTimelineClientMetrics {
|
||||
tenant_id: String,
|
||||
timeline_id: String,
|
||||
remote_physical_size_gauge: Mutex<Option<UIntGauge>>,
|
||||
remote_operation_time: Mutex<HashMap<(&'static str, &'static str, &'static str), Histogram>>,
|
||||
calls_unfinished_gauge: Mutex<HashMap<(&'static str, &'static str), IntGauge>>,
|
||||
calls_started_hist: Mutex<HashMap<(&'static str, &'static str), Histogram>>,
|
||||
bytes_started_counter: Mutex<HashMap<(&'static str, &'static str), IntCounter>>,
|
||||
bytes_finished_counter: Mutex<HashMap<(&'static str, &'static str), IntCounter>>,
|
||||
}
|
||||
@@ -1068,14 +1061,13 @@ impl RemoteTimelineClientMetrics {
|
||||
RemoteTimelineClientMetrics {
|
||||
tenant_id: tenant_id.to_string(),
|
||||
timeline_id: timeline_id.to_string(),
|
||||
remote_operation_time: Mutex::new(HashMap::default()),
|
||||
calls_unfinished_gauge: Mutex::new(HashMap::default()),
|
||||
calls_started_hist: Mutex::new(HashMap::default()),
|
||||
bytes_started_counter: Mutex::new(HashMap::default()),
|
||||
bytes_finished_counter: Mutex::new(HashMap::default()),
|
||||
remote_physical_size_gauge: Mutex::new(None),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn remote_physical_size_gauge(&self) -> UIntGauge {
|
||||
let mut guard = self.remote_physical_size_gauge.lock().unwrap();
|
||||
guard
|
||||
@@ -1089,26 +1081,17 @@ impl RemoteTimelineClientMetrics {
|
||||
})
|
||||
.clone()
|
||||
}
|
||||
|
||||
pub fn remote_operation_time(
|
||||
&self,
|
||||
file_kind: &RemoteOpFileKind,
|
||||
op_kind: &RemoteOpKind,
|
||||
status: &'static str,
|
||||
) -> Histogram {
|
||||
let mut guard = self.remote_operation_time.lock().unwrap();
|
||||
let key = (file_kind.as_str(), op_kind.as_str(), status);
|
||||
let metric = guard.entry(key).or_insert_with(move || {
|
||||
REMOTE_OPERATION_TIME
|
||||
.get_metric_with_label_values(&[
|
||||
&self.tenant_id.to_string(),
|
||||
&self.timeline_id.to_string(),
|
||||
key.0,
|
||||
key.1,
|
||||
key.2,
|
||||
])
|
||||
.unwrap()
|
||||
});
|
||||
metric.clone()
|
||||
REMOTE_OPERATION_TIME
|
||||
.get_metric_with_label_values(&[key.0, key.1, key.2])
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
fn calls_unfinished_gauge(
|
||||
@@ -1136,19 +1119,10 @@ impl RemoteTimelineClientMetrics {
|
||||
file_kind: &RemoteOpFileKind,
|
||||
op_kind: &RemoteOpKind,
|
||||
) -> Histogram {
|
||||
let mut guard = self.calls_started_hist.lock().unwrap();
|
||||
let key = (file_kind.as_str(), op_kind.as_str());
|
||||
let metric = guard.entry(key).or_insert_with(move || {
|
||||
REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST
|
||||
.get_metric_with_label_values(&[
|
||||
&self.tenant_id.to_string(),
|
||||
&self.timeline_id.to_string(),
|
||||
key.0,
|
||||
key.1,
|
||||
])
|
||||
.unwrap()
|
||||
});
|
||||
metric.clone()
|
||||
REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST
|
||||
.get_metric_with_label_values(&[key.0, key.1])
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
fn bytes_started_counter(
|
||||
@@ -1328,15 +1302,10 @@ impl Drop for RemoteTimelineClientMetrics {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
remote_physical_size_gauge,
|
||||
remote_operation_time,
|
||||
calls_unfinished_gauge,
|
||||
calls_started_hist,
|
||||
bytes_started_counter,
|
||||
bytes_finished_counter,
|
||||
} = self;
|
||||
for ((a, b, c), _) in remote_operation_time.get_mut().unwrap().drain() {
|
||||
let _ = REMOTE_OPERATION_TIME.remove_label_values(&[tenant_id, timeline_id, a, b, c]);
|
||||
}
|
||||
for ((a, b), _) in calls_unfinished_gauge.get_mut().unwrap().drain() {
|
||||
let _ = REMOTE_TIMELINE_CLIENT_CALLS_UNFINISHED_GAUGE.remove_label_values(&[
|
||||
tenant_id,
|
||||
@@ -1345,14 +1314,6 @@ impl Drop for RemoteTimelineClientMetrics {
|
||||
b,
|
||||
]);
|
||||
}
|
||||
for ((a, b), _) in calls_started_hist.get_mut().unwrap().drain() {
|
||||
let _ = REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST.remove_label_values(&[
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
a,
|
||||
b,
|
||||
]);
|
||||
}
|
||||
for ((a, b), _) in bytes_started_counter.get_mut().unwrap().drain() {
|
||||
let _ = REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER.remove_label_values(&[
|
||||
tenant_id,
|
||||
@@ -1434,15 +1395,51 @@ impl<F: Future<Output = Result<O, E>>, O, E> Future for MeasuredRemoteOp<F> {
|
||||
}
|
||||
|
||||
pub fn preinitialize_metrics() {
|
||||
// We want to alert on this metric increasing.
|
||||
// Initialize it eagerly, so that our alert rule can distinguish absence of the metric from metric value 0.
|
||||
assert_eq!(UNEXPECTED_ONDEMAND_DOWNLOADS.get(), 0);
|
||||
UNEXPECTED_ONDEMAND_DOWNLOADS.reset();
|
||||
// Python tests need these and on some we do alerting.
|
||||
//
|
||||
// FIXME(4813): make it so that we have no top level metrics as this fn will easily fall out of
|
||||
// order:
|
||||
// - global metrics reside in a Lazy<PageserverMetrics>
|
||||
// - access via crate::metrics::PS_METRICS.materialized_page_cache_hit.inc()
|
||||
// - could move the statics into TimelineMetrics::new()?
|
||||
|
||||
// Same as above for this metric, but, it's a Vec-type metric for which we don't know all the labels.
|
||||
BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT.reset();
|
||||
// counters
|
||||
[
|
||||
&MATERIALIZED_PAGE_CACHE_HIT,
|
||||
&MATERIALIZED_PAGE_CACHE_HIT_DIRECT,
|
||||
&UNEXPECTED_ONDEMAND_DOWNLOADS,
|
||||
&WALRECEIVER_STARTED_CONNECTIONS,
|
||||
&WALRECEIVER_BROKER_UPDATES,
|
||||
&WALRECEIVER_CANDIDATES_ADDED,
|
||||
&WALRECEIVER_CANDIDATES_REMOVED,
|
||||
]
|
||||
.into_iter()
|
||||
.for_each(|c| {
|
||||
Lazy::force(c);
|
||||
});
|
||||
|
||||
// Python tests need these.
|
||||
MATERIALIZED_PAGE_CACHE_HIT_DIRECT.get();
|
||||
MATERIALIZED_PAGE_CACHE_HIT.get();
|
||||
// countervecs
|
||||
[&BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT]
|
||||
.into_iter()
|
||||
.for_each(|c| {
|
||||
Lazy::force(c);
|
||||
});
|
||||
|
||||
// gauges
|
||||
WALRECEIVER_ACTIVE_MANAGERS.get();
|
||||
|
||||
// histograms
|
||||
[
|
||||
&READ_NUM_FS_LAYERS,
|
||||
&RECONSTRUCT_TIME,
|
||||
&WAIT_LSN_TIME,
|
||||
&WAL_REDO_TIME,
|
||||
&WAL_REDO_WAIT_TIME,
|
||||
&WAL_REDO_RECORDS_HISTOGRAM,
|
||||
&WAL_REDO_BYTES_HISTOGRAM,
|
||||
]
|
||||
.into_iter()
|
||||
.for_each(|h| {
|
||||
Lazy::force(h);
|
||||
});
|
||||
}
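The pre-initialization above works because metrics behind `Lazy` only appear in the exporter once they are touched; forcing them at startup lets an alert rule distinguish "metric absent" from "metric value 0". A standalone sketch of the same idea, using an illustrative metric name rather than one of the pageserver's:

use once_cell::sync::Lazy;
use prometheus::{register_int_counter, IntCounter};

// The metric name below is illustrative only.
static EXAMPLE_DOWNLOADS: Lazy<IntCounter> = Lazy::new(|| {
    register_int_counter!("example_unexpected_downloads_total", "example counter").unwrap()
});

pub fn preinitialize_example_metrics() {
    Lazy::force(&EXAMPLE_DOWNLOADS); // registered and exported as 0 from now on
}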
|
||||
|
||||
@@ -130,11 +130,25 @@ pub static WALRECEIVER_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
|
||||
pub static BACKGROUND_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
|
||||
tokio::runtime::Builder::new_multi_thread()
|
||||
.thread_name("background op worker")
|
||||
// if you change the number of worker threads please change the constant below
|
||||
.enable_all()
|
||||
.build()
|
||||
.expect("Failed to create background op runtime")
|
||||
});
|
||||
|
||||
pub(crate) static BACKGROUND_RUNTIME_WORKER_THREADS: Lazy<usize> = Lazy::new(|| {
|
||||
// force init and thus panics
|
||||
let _ = BACKGROUND_RUNTIME.handle();
|
||||
// replicates tokio-1.28.1::loom::sys::num_cpus which is not available publicly
|
||||
// tokio would have already panicked for parsing errors or NotUnicode
|
||||
//
|
||||
// this will be wrong if any of the runtimes gets their worker threads configured to something
|
||||
// else, but that has not been needed in a long time.
|
||||
std::env::var("TOKIO_WORKER_THREADS")
|
||||
.map(|s| s.parse::<usize>().unwrap())
|
||||
.unwrap_or_else(|_e| usize::max(1, num_cpus::get()))
|
||||
});
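A standalone sketch of the fallback logic above, assuming the same `num_cpus` crate; useful when other code (for example a concurrency limit) needs to match tokio's default worker pool size:

fn tokio_style_worker_threads() -> usize {
    // Prefer the same env var tokio honours; otherwise fall back to the CPU
    // count, never going below one worker.
    match std::env::var("TOKIO_WORKER_THREADS") {
        Ok(s) => s
            .parse::<usize>()
            .expect("TOKIO_WORKER_THREADS must be a positive integer"),
        Err(_) => usize::max(1, num_cpus::get()),
    }
}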
|
||||
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub struct PageserverTaskId(u64);
|
||||
|
||||
@@ -511,17 +525,13 @@ pub async fn shutdown_tasks(
|
||||
warn!(name = task.name, tenant_id = ?tenant_id, timeline_id = ?timeline_id, kind = ?task_kind, "stopping left-over");
|
||||
}
|
||||
}
|
||||
let join_handle = tokio::select! {
|
||||
biased;
|
||||
_ = &mut join_handle => { None },
|
||||
_ = tokio::time::sleep(std::time::Duration::from_secs(1)) => {
|
||||
// allow some time to elapse before logging to cut down the number of log
|
||||
// lines.
|
||||
info!("waiting for {} to shut down", task.name);
|
||||
Some(join_handle)
|
||||
}
|
||||
};
|
||||
if let Some(join_handle) = join_handle {
|
||||
if tokio::time::timeout(std::time::Duration::from_secs(1), &mut join_handle)
|
||||
.await
|
||||
.is_err()
|
||||
{
|
||||
// allow some time to elapse before logging to cut down the number of log
|
||||
// lines.
|
||||
info!("waiting for {} to shut down", task.name);
|
||||
// we never handled this return value, but:
|
||||
// - we don't deschedule which would lead to is_cancelled
|
||||
// - panics are already logged (is_panicked)
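The pattern above (give the task one second, then log a single line and keep waiting) can be sketched on its own; names here are illustrative, not the pageserver's:

use std::time::Duration;
use tokio::task::JoinHandle;

// Race the join handle against a short timeout; if it does not finish in
// time, log once and then await it to completion anyway.
async fn join_with_notice(name: &str, mut handle: JoinHandle<()>) {
    if tokio::time::timeout(Duration::from_secs(1), &mut handle)
        .await
        .is_err()
    {
        println!("waiting for {name} to shut down");
        let _ = handle.await;
    }
}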
|
||||
@@ -549,7 +559,7 @@ pub fn current_task_id() -> Option<PageserverTaskId> {
|
||||
pub async fn shutdown_watcher() {
|
||||
let token = SHUTDOWN_TOKEN
|
||||
.try_with(|t| t.clone())
|
||||
.expect("shutdown_requested() called in an unexpected task or thread");
|
||||
.expect("shutdown_watcher() called in an unexpected task or thread");
|
||||
|
||||
token.cancelled().await;
|
||||
}
|
||||
|
||||
File diff suppressed because it is too large
@@ -16,29 +16,19 @@ use crate::tenant::block_io::{BlockCursor, BlockReader};
|
||||
use std::cmp::min;
|
||||
use std::io::{Error, ErrorKind};
|
||||
|
||||
/// For reading
|
||||
pub trait BlobCursor {
|
||||
impl<R> BlockCursor<R>
|
||||
where
|
||||
R: BlockReader,
|
||||
{
|
||||
/// Read a blob into a new buffer.
|
||||
fn read_blob(&mut self, offset: u64) -> Result<Vec<u8>, std::io::Error> {
|
||||
pub fn read_blob(&mut self, offset: u64) -> Result<Vec<u8>, std::io::Error> {
|
||||
let mut buf = Vec::new();
|
||||
self.read_blob_into_buf(offset, &mut buf)?;
|
||||
Ok(buf)
|
||||
}
|
||||
|
||||
/// Read blob into the given buffer. Any previous contents in the buffer
|
||||
/// are overwritten.
|
||||
fn read_blob_into_buf(
|
||||
&mut self,
|
||||
offset: u64,
|
||||
dstbuf: &mut Vec<u8>,
|
||||
) -> Result<(), std::io::Error>;
|
||||
}
|
||||
|
||||
impl<R> BlobCursor for BlockCursor<R>
|
||||
where
|
||||
R: BlockReader,
|
||||
{
|
||||
fn read_blob_into_buf(
|
||||
pub fn read_blob_into_buf(
|
||||
&mut self,
|
||||
offset: u64,
|
||||
dstbuf: &mut Vec<u8>,
|
||||
|
||||
pageserver/src/tenant/delete.rs (new file, 574 lines)
@@ -0,0 +1,574 @@
|
||||
use std::{
|
||||
ops::{Deref, DerefMut},
|
||||
sync::Arc,
|
||||
};
|
||||
|
||||
use anyhow::Context;
|
||||
use pageserver_api::models::TimelineState;
|
||||
use tokio::sync::OwnedMutexGuard;
|
||||
use tracing::{debug, error, info, instrument, warn, Instrument, Span};
|
||||
use utils::{
|
||||
crashsafe, fs_ext,
|
||||
id::{TenantId, TimelineId},
|
||||
};
|
||||
|
||||
use crate::{
|
||||
config::PageServerConf,
|
||||
task_mgr::{self, TaskKind},
|
||||
tenant::{remote_timeline_client, DeleteTimelineError},
|
||||
InitializationOrder,
|
||||
};
|
||||
|
||||
use super::{
|
||||
metadata::TimelineMetadata,
|
||||
remote_timeline_client::{PersistIndexPartWithDeletedFlagError, RemoteTimelineClient},
|
||||
CreateTimelineCause, Tenant, Timeline,
|
||||
};
|
||||
|
||||
/// Now that the Timeline is in Stopping state, request all the related tasks to shut down.
|
||||
async fn stop_tasks(timeline: &Timeline) -> Result<(), DeleteTimelineError> {
|
||||
// Stop the walreceiver first.
|
||||
debug!("waiting for wal receiver to shutdown");
|
||||
let maybe_started_walreceiver = { timeline.walreceiver.lock().unwrap().take() };
|
||||
if let Some(walreceiver) = maybe_started_walreceiver {
|
||||
walreceiver.stop().await;
|
||||
}
|
||||
debug!("wal receiver shutdown confirmed");
|
||||
|
||||
// Prevent new uploads from starting.
|
||||
if let Some(remote_client) = timeline.remote_client.as_ref() {
|
||||
let res = remote_client.stop();
|
||||
match res {
|
||||
Ok(()) => {}
|
||||
Err(e) => match e {
|
||||
remote_timeline_client::StopError::QueueUninitialized => {
|
||||
// This case shouldn't happen currently because the
|
||||
// load and attach code bails out if _any_ of the timelines fails to fetch its IndexPart.
|
||||
// That is, before we declare the Tenant as Active.
|
||||
// But we only allow calls to delete_timeline on Active tenants.
|
||||
return Err(DeleteTimelineError::Other(anyhow::anyhow!("upload queue is uninitialized, likely the timeline was in Broken state prior to this call because it failed to fetch IndexPart during load or attach, check the logs")));
|
||||
}
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
// Stop & wait for the remaining timeline tasks, including upload tasks.
|
||||
// NB: This and other delete_timeline calls do not run as a task_mgr task,
|
||||
// so, they are not affected by this shutdown_tasks() call.
|
||||
info!("waiting for timeline tasks to shutdown");
|
||||
task_mgr::shutdown_tasks(None, Some(timeline.tenant_id), Some(timeline.timeline_id)).await;
|
||||
|
||||
fail::fail_point!("timeline-delete-before-index-deleted-at", |_| {
|
||||
Err(anyhow::anyhow!(
|
||||
"failpoint: timeline-delete-before-index-deleted-at"
|
||||
))?
|
||||
});
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Mark timeline as deleted in S3 so we won't pick it up next time
|
||||
/// during attach or pageserver restart.
|
||||
/// See comment in persist_index_part_with_deleted_flag.
|
||||
async fn set_deleted_in_remote_index(timeline: &Timeline) -> Result<(), DeleteTimelineError> {
|
||||
if let Some(remote_client) = timeline.remote_client.as_ref() {
|
||||
match remote_client.persist_index_part_with_deleted_flag().await {
|
||||
// If we (now, or already) marked it successfully as deleted, we can proceed
|
||||
Ok(()) | Err(PersistIndexPartWithDeletedFlagError::AlreadyDeleted(_)) => (),
|
||||
// Bail out otherwise
|
||||
//
|
||||
// AlreadyInProgress shouldn't happen, because the 'delete_lock' prevents
|
||||
// two tasks from performing the deletion at the same time. The first task
|
||||
// that starts deletion should run it to completion.
|
||||
Err(e @ PersistIndexPartWithDeletedFlagError::AlreadyInProgress(_))
|
||||
| Err(e @ PersistIndexPartWithDeletedFlagError::Other(_)) => {
|
||||
return Err(DeleteTimelineError::Other(anyhow::anyhow!(e)));
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// We delete local files first, so if pageserver restarts after local files deletion then remote deletion is not continued.
|
||||
// This can be solved with inversion of these steps. But even if these steps are inverted then, when index_part.json
|
||||
// gets deleted there is no way to distinguish between "this timeline is good, we just didn't upload it to remote"
|
||||
// and "this timeline is deleted we should continue with removal of local state". So to avoid the ambiguity we use a mark file.
|
||||
// After the index part is deleted, the presence of this mark file identifies a deletion intention.
|
||||
// So we can just remove the mark file.
|
||||
async fn create_delete_mark(
|
||||
conf: &PageServerConf,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
) -> Result<(), DeleteTimelineError> {
|
||||
fail::fail_point!("timeline-delete-before-delete-mark", |_| {
|
||||
Err(anyhow::anyhow!(
|
||||
"failpoint: timeline-delete-before-delete-mark"
|
||||
))?
|
||||
});
|
||||
let marker_path = conf.timeline_delete_mark_file_path(tenant_id, timeline_id);
|
||||
|
||||
// Note: we're ok to replace existing file.
|
||||
let _ = std::fs::OpenOptions::new()
|
||||
.write(true)
|
||||
.create(true)
|
||||
.open(&marker_path)
|
||||
.with_context(|| format!("could not create delete marker file {marker_path:?}"))?;
|
||||
|
||||
crashsafe::fsync_file_and_parent(&marker_path).context("sync_mark")?;
|
||||
Ok(())
|
||||
}
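A minimal, dependency-free sketch of the crash-safe mark-file idea used above: the mark only counts as durable once both the file and its parent directory have been fsynced. Paths and names are illustrative.

use std::fs::{File, OpenOptions};
use std::io;
use std::path::Path;

fn create_mark_file(marker_path: &Path) -> io::Result<()> {
    // Creating (or re-opening) an existing mark is fine; the call is idempotent.
    let file = OpenOptions::new().write(true).create(true).open(marker_path)?;
    file.sync_all()?; // flush the marker file itself
    if let Some(parent) = marker_path.parent() {
        // fsync the directory so the new directory entry survives a crash
        File::open(parent)?.sync_all()?;
    }
    Ok(())
}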
|
||||
|
||||
/// Grab the layer_removal_cs lock, and actually perform the deletion.
|
||||
///
|
||||
/// This lock prevents GC or compaction from running at the same time.
|
||||
/// The GC task doesn't register itself with the timeline it's operating on,
|
||||
/// so it might still be running even though we called `shutdown_tasks`.
|
||||
///
|
||||
/// Note that there are still other race conditions between
|
||||
/// GC, compaction and timeline deletion. See
|
||||
/// <https://github.com/neondatabase/neon/issues/2671>
|
||||
///
|
||||
/// No timeout here, GC & Compaction should be responsive to the
|
||||
/// `TimelineState::Stopping` change.
|
||||
async fn delete_local_layer_files(
|
||||
conf: &PageServerConf,
|
||||
tenant_id: TenantId,
|
||||
timeline: &Timeline,
|
||||
) -> anyhow::Result<()> {
|
||||
info!("waiting for layer_removal_cs.lock()");
|
||||
let layer_removal_guard = timeline.layer_removal_cs.lock().await;
|
||||
info!("got layer_removal_cs.lock(), deleting layer files");
|
||||
|
||||
// NB: storage_sync upload tasks that reference these layers have been cancelled
|
||||
// by the caller.
|
||||
|
||||
let local_timeline_directory = conf.timeline_path(&tenant_id, &timeline.timeline_id);
|
||||
|
||||
fail::fail_point!("timeline-delete-before-rm", |_| {
|
||||
Err(anyhow::anyhow!("failpoint: timeline-delete-before-rm"))?
|
||||
});
|
||||
|
||||
// NB: This need not be atomic because the deleted flag in the IndexPart
|
||||
// will be observed during tenant/timeline load. The deletion will be resumed there.
|
||||
//
|
||||
// For configurations without remote storage, we guarantee crash-safety by persisting the delete mark file.
|
||||
//
|
||||
// Note that here we do not bail out on std::io::ErrorKind::NotFound.
|
||||
// This can happen if we're called a second time, e.g.,
|
||||
// because of a previous failure/cancellation at/after
|
||||
// failpoint timeline-delete-after-rm.
|
||||
//
|
||||
// It can also happen if we race with tenant detach, because,
|
||||
// it doesn't grab the layer_removal_cs lock.
|
||||
//
|
||||
// For now, log and continue.
|
||||
// warn! level is technically not appropriate for the
|
||||
// first case because we should expect retries to happen.
|
||||
// But the error is so rare, it seems better to get attention if it happens.
|
||||
//
|
||||
// Note that metadata removal is skipped, this is not technically needed,
|
||||
// but allows to reuse timeline loading code during resumed deletion.
|
||||
// (we always expect that metadata is in place when timeline is being loaded)
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
let mut counter = 0;
|
||||
|
||||
// Timeline directory may not exist if we failed to delete mark file and request was retried.
|
||||
if !local_timeline_directory.exists() {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let metadata_path = conf.metadata_path(&tenant_id, &timeline.timeline_id);
|
||||
|
||||
for entry in walkdir::WalkDir::new(&local_timeline_directory).contents_first(true) {
|
||||
#[cfg(feature = "testing")]
|
||||
{
|
||||
counter += 1;
|
||||
if counter == 2 {
|
||||
fail::fail_point!("timeline-delete-during-rm", |_| {
|
||||
Err(anyhow::anyhow!("failpoint: timeline-delete-during-rm"))?
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
let entry = entry?;
|
||||
if entry.path() == metadata_path {
|
||||
debug!("found metadata, skipping");
|
||||
continue;
|
||||
}
|
||||
|
||||
if entry.path() == local_timeline_directory {
|
||||
// Keeping the directory because the metadata file is still there
|
||||
debug!("found timeline dir itself, skipping");
|
||||
continue;
|
||||
}
|
||||
|
||||
let metadata = match entry.metadata() {
|
||||
Ok(metadata) => metadata,
|
||||
Err(e) => {
|
||||
if crate::is_walkdir_io_not_found(&e) {
|
||||
warn!(
|
||||
timeline_dir=?local_timeline_directory,
|
||||
path=?entry.path().display(),
|
||||
"got not found err while removing timeline dir, proceeding anyway"
|
||||
);
|
||||
continue;
|
||||
}
|
||||
anyhow::bail!(e);
|
||||
}
|
||||
};
|
||||
|
||||
let r = if metadata.is_dir() {
|
||||
// There shouldn't be any directories inside the timeline dir in the current layout.
|
||||
tokio::fs::remove_dir(entry.path()).await
|
||||
} else {
|
||||
tokio::fs::remove_file(entry.path()).await
|
||||
};
|
||||
|
||||
if let Err(e) = r {
|
||||
if e.kind() == std::io::ErrorKind::NotFound {
|
||||
warn!(
|
||||
timeline_dir=?local_timeline_directory,
|
||||
path=?entry.path().display(),
|
||||
"got not found err while removing timeline dir, proceeding anyway"
|
||||
);
|
||||
continue;
|
||||
}
|
||||
anyhow::bail!(anyhow::anyhow!(
|
||||
"Failed to remove: {}. Error: {e}",
|
||||
entry.path().display()
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
info!("finished deleting layer files, releasing layer_removal_cs.lock()");
|
||||
drop(layer_removal_guard);
|
||||
|
||||
fail::fail_point!("timeline-delete-after-rm", |_| {
|
||||
Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm"))?
|
||||
});
|
||||
|
||||
Ok(())
|
||||
}
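The traversal above relies on walkdir yielding children before their parent when `contents_first(true)` is set, so files are removed before the directories that contain them. A reduced sketch of just that mechanic, without the failpoints, metadata skip, and NotFound tolerance:

fn remove_tree_contents_first(root: &std::path::Path) -> anyhow::Result<()> {
    for entry in walkdir::WalkDir::new(root).contents_first(true) {
        let entry = entry?;
        if entry.path() == root {
            continue; // keep the top-level directory itself
        }
        if entry.file_type().is_dir() {
            // children were already yielded, so the directory is empty by now
            std::fs::remove_dir(entry.path())?;
        } else {
            std::fs::remove_file(entry.path())?;
        }
    }
    Ok(())
}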
|
||||
|
||||
/// Removes remote layers and an index file after them.
|
||||
async fn delete_remote_layers_and_index(timeline: &Timeline) -> anyhow::Result<()> {
|
||||
if let Some(remote_client) = &timeline.remote_client {
|
||||
remote_client.delete_all().await.context("delete_all")?
|
||||
};
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// This function removes the remaining traces of a timeline on disk.
|
||||
// Namely: metadata file, timeline directory, delete mark.
|
||||
// Note: io::ErrorKind::NotFound is ignored for the metadata file and the timeline dir.
|
||||
// delete mark should be present because it is the last step during deletion.
|
||||
// (nothing can fail after its deletion)
|
||||
async fn cleanup_remaining_timeline_fs_traces(
|
||||
conf: &PageServerConf,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
) -> anyhow::Result<()> {
|
||||
// Remove local metadata
|
||||
tokio::fs::remove_file(conf.metadata_path(&tenant_id, &timeline_id))
|
||||
.await
|
||||
.or_else(fs_ext::ignore_not_found)
|
||||
.context("remove metadata")?;
|
||||
|
||||
fail::fail_point!("timeline-delete-after-rm-metadata", |_| {
|
||||
Err(anyhow::anyhow!(
|
||||
"failpoint: timeline-delete-after-rm-metadata"
|
||||
))?
|
||||
});
|
||||
|
||||
// Remove timeline dir
|
||||
tokio::fs::remove_dir(conf.timeline_path(&tenant_id, &timeline_id))
|
||||
.await
|
||||
.or_else(fs_ext::ignore_not_found)
|
||||
.context("timeline dir")?;
|
||||
|
||||
fail::fail_point!("timeline-delete-after-rm-dir", |_| {
|
||||
Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm-dir"))?
|
||||
});
|
||||
|
||||
// Remove delete mark
|
||||
tokio::fs::remove_file(conf.timeline_delete_mark_file_path(tenant_id, timeline_id))
|
||||
.await
|
||||
.context("remove delete mark")
|
||||
}
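Each removal above tolerates `NotFound` so the whole sequence can be re-run after a crash. The helper referenced as `fs_ext::ignore_not_found` presumably behaves like this standalone sketch:

use std::io;
use std::path::Path;

fn ignore_not_found(e: io::Error) -> io::Result<()> {
    // A missing file means a previous attempt already removed it: treat as success.
    if e.kind() == io::ErrorKind::NotFound {
        Ok(())
    } else {
        Err(e)
    }
}

fn remove_file_idempotent(path: &Path) -> io::Result<()> {
    std::fs::remove_file(path).or_else(ignore_not_found)
}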
|
||||
|
||||
/// It is important that this gets called when DeletionGuard is being held.
|
||||
/// For more context see comments in [`DeleteTimelineFlow::prepare`]
|
||||
async fn remove_timeline_from_tenant(
|
||||
tenant: &Tenant,
|
||||
timeline_id: TimelineId,
|
||||
_: &DeletionGuard, // using it as a witness
|
||||
) -> anyhow::Result<()> {
|
||||
// Remove the timeline from the map.
|
||||
let mut timelines = tenant.timelines.lock().unwrap();
|
||||
let children_exist = timelines
|
||||
.iter()
|
||||
.any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline_id));
|
||||
// XXX this can happen because `branch_timeline` doesn't check `TimelineState::Stopping`.
|
||||
// We already deleted the layer files, so it's probably best to panic.
|
||||
// (Ideally, above remove_dir_all is atomic so we don't see this timeline after a restart)
|
||||
if children_exist {
|
||||
panic!("Timeline grew children while we removed layer files");
|
||||
}
|
||||
|
||||
timelines
|
||||
.remove(&timeline_id)
|
||||
.expect("timeline that we were deleting was concurrently removed from 'timelines' map");
|
||||
|
||||
drop(timelines);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Orchestrates timeline shut down of all timeline tasks, removes its in-memory structures,
|
||||
/// and deletes its data from both disk and s3.
|
||||
/// The sequence of steps:
|
||||
/// 1. Set deleted_at in remote index part.
|
||||
/// 2. Create local mark file.
|
||||
/// 3. Delete local files except metadata (it is simpler this way, to be able to reuse timeline initialization code that expects metadata)
|
||||
/// 4. Delete remote layers
|
||||
/// 5. Delete index part
|
||||
/// 6. Delete meta, timeline directory
|
||||
/// 7. Delete mark file
|
||||
/// It is resumable from any step in case a crash/restart occurs.
|
||||
/// There are three entrypoints to the process:
|
||||
/// 1. [`DeleteTimelineFlow::run`] this is the main one called by a management api handler.
|
||||
/// 2. [`DeleteTimelineFlow::resume_deletion`] is called during restarts when local metadata is still present
|
||||
/// and we possibly need to continue deletion of remote files.
|
||||
/// 3. [`DeleteTimelineFlow::cleanup_remaining_timeline_fs_traces`] is used when we deleted remote
|
||||
/// index but still have local metadata, timeline directory and delete mark.
|
||||
/// Note: the only other place that touches the timeline delete mark is the logic that scans the timelines directory during tenant load.
|
||||
#[derive(Default)]
|
||||
pub enum DeleteTimelineFlow {
|
||||
#[default]
|
||||
NotStarted,
|
||||
InProgress,
|
||||
Finished,
|
||||
}
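The single-deletion-at-a-time property comes from keeping this state behind an async mutex and insisting on `try_lock_owned`: the first caller wins and later callers get an error instead of queueing. A reduced sketch with illustrative names:

use std::sync::Arc;
use tokio::sync::{Mutex, OwnedMutexGuard};

#[derive(Default)]
enum Flow {
    #[default]
    NotStarted,
    InProgress,
    Finished,
}

fn try_start_deletion(progress: &Arc<Mutex<Flow>>) -> Result<OwnedMutexGuard<Flow>, &'static str> {
    let mut guard = Arc::clone(progress)
        .try_lock_owned() // never blocks; a concurrent deletion makes this fail
        .map_err(|_| "deletion already in progress")?;
    *guard = Flow::InProgress;
    Ok(guard)
}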
|
||||
|
||||
impl DeleteTimelineFlow {
|
||||
// These steps are run in the context of management api request handler.
|
||||
// Long running steps are continued to run in the background.
|
||||
// NB: If this fails half-way through, and is retried, the retry will go through
|
||||
// all the same steps again. Make sure the code here is idempotent, and don't
|
||||
// error out if some of the shutdown tasks have already been completed!
|
||||
#[instrument(skip_all, fields(tenant_id=%tenant.tenant_id, %timeline_id))]
|
||||
pub async fn run(
|
||||
tenant: &Arc<Tenant>,
|
||||
timeline_id: TimelineId,
|
||||
) -> Result<(), DeleteTimelineError> {
|
||||
let (timeline, mut guard) = Self::prepare(tenant, timeline_id)?;
|
||||
|
||||
guard.mark_in_progress()?;
|
||||
|
||||
stop_tasks(&timeline).await?;
|
||||
|
||||
set_deleted_in_remote_index(&timeline).await?;
|
||||
|
||||
create_delete_mark(tenant.conf, timeline.tenant_id, timeline.timeline_id).await?;
|
||||
|
||||
fail::fail_point!("timeline-delete-before-schedule", |_| {
|
||||
Err(anyhow::anyhow!(
|
||||
"failpoint: timeline-delete-before-schedule"
|
||||
))?
|
||||
});
|
||||
|
||||
Self::schedule_background(guard, tenant.conf, Arc::clone(tenant), timeline);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn mark_in_progress(&mut self) -> anyhow::Result<()> {
|
||||
match self {
|
||||
Self::Finished => anyhow::bail!("Bug. Is in finished state"),
|
||||
Self::InProgress { .. } => { /* We're in a retry */ }
|
||||
Self::NotStarted => { /* Fresh start */ }
|
||||
}
|
||||
|
||||
*self = Self::InProgress;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Shortcut to create Timeline in stopping state and spawn deletion task.
|
||||
pub async fn resume_deletion(
|
||||
tenant: Arc<Tenant>,
|
||||
timeline_id: TimelineId,
|
||||
local_metadata: &TimelineMetadata,
|
||||
remote_client: Option<RemoteTimelineClient>,
|
||||
init_order: Option<&InitializationOrder>,
|
||||
) -> anyhow::Result<()> {
|
||||
// Note: here we even skip populating layer map. Timeline is essentially uninitialized.
|
||||
// RemoteTimelineClient is the only functioning part.
|
||||
let timeline = tenant
|
||||
.create_timeline_struct(
|
||||
timeline_id,
|
||||
local_metadata,
|
||||
None, // Ancestor is not needed for deletion.
|
||||
remote_client,
|
||||
init_order,
|
||||
// Important: we don't pass the ancestor above because it can be missing.
|
||||
// Thus we need to skip the validation here.
|
||||
CreateTimelineCause::Delete,
|
||||
)
|
||||
.context("create_timeline_struct")?;
|
||||
|
||||
let mut guard = DeletionGuard(
|
||||
Arc::clone(&timeline.delete_progress)
|
||||
.try_lock_owned()
|
||||
.expect("cannot happen because we're the only owner"),
|
||||
);
|
||||
|
||||
// We need to do this because when the console retries the delete request we shouldn't answer with 404
|
||||
// because 404 means successful deletion.
|
||||
{
|
||||
let mut locked = tenant.timelines.lock().unwrap();
|
||||
locked.insert(timeline_id, Arc::clone(&timeline));
|
||||
}
|
||||
|
||||
guard.mark_in_progress()?;
|
||||
|
||||
// Note that delete mark can be missing on resume
|
||||
// because we create delete mark after we set deleted_at in the index part.
|
||||
create_delete_mark(tenant.conf, tenant.tenant_id, timeline_id).await?;
|
||||
|
||||
Self::schedule_background(guard, tenant.conf, tenant, timeline);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn cleanup_remaining_timeline_fs_traces(
|
||||
tenant: &Tenant,
|
||||
timeline_id: TimelineId,
|
||||
) -> anyhow::Result<()> {
|
||||
cleanup_remaining_timeline_fs_traces(tenant.conf, tenant.tenant_id, timeline_id).await
|
||||
}
|
||||
|
||||
fn prepare(
|
||||
tenant: &Tenant,
|
||||
timeline_id: TimelineId,
|
||||
) -> Result<(Arc<Timeline>, DeletionGuard), DeleteTimelineError> {
|
||||
// Note the interaction between this guard and deletion guard.
|
||||
// Here we attempt to lock deletion guard when we're holding a lock on timelines.
|
||||
// This is important because when you take into account `remove_timeline_from_tenant`
|
||||
// we remove timeline from memory when we still hold the deletion guard.
|
||||
// So here, when timeline deletion is finished, the timeline won't be present in the timelines map at all,
|
||||
// which makes the following sequence impossible:
|
||||
// T1: get preempted right before the try_lock on `Timeline::delete_progress`
|
||||
// T2: do a full deletion, acquire and drop `Timeline::delete_progress`
|
||||
// T1: acquire deletion lock, do another `DeleteTimelineFlow::run`
|
||||
// For more context see this discussion: `https://github.com/neondatabase/neon/pull/4552#discussion_r1253437346`
|
||||
let timelines = tenant.timelines.lock().unwrap();
|
||||
|
||||
let timeline = match timelines.get(&timeline_id) {
|
||||
Some(t) => t,
|
||||
None => return Err(DeleteTimelineError::NotFound),
|
||||
};
|
||||
|
||||
// Ensure that there are no child timelines **attached to that pageserver**,
|
||||
// because detach removes files, which will break child branches
|
||||
let children: Vec<TimelineId> = timelines
|
||||
.iter()
|
||||
.filter_map(|(id, entry)| {
|
||||
if entry.get_ancestor_timeline_id() == Some(timeline_id) {
|
||||
Some(*id)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
|
||||
if !children.is_empty() {
|
||||
return Err(DeleteTimelineError::HasChildren(children));
|
||||
}
|
||||
|
||||
// Note that using try_lock here is important to avoid a deadlock.
|
||||
// Here we take lock on timelines and then the deletion guard.
|
||||
// At the end of the operation we're holding the guard and need to lock timelines map
|
||||
// to remove the timeline from it.
|
||||
// Whenever two locks are taken in different orders, this can result in a deadlock.
|
||||
let delete_lock_guard = DeletionGuard(
|
||||
Arc::clone(&timeline.delete_progress)
|
||||
.try_lock_owned()
|
||||
.map_err(|_| DeleteTimelineError::AlreadyInProgress)?,
|
||||
);
|
||||
|
||||
timeline.set_state(TimelineState::Stopping);
|
||||
|
||||
Ok((Arc::clone(timeline), delete_lock_guard))
|
||||
}
|
||||
|
||||
fn schedule_background(
|
||||
guard: DeletionGuard,
|
||||
conf: &'static PageServerConf,
|
||||
tenant: Arc<Tenant>,
|
||||
timeline: Arc<Timeline>,
|
||||
) {
|
||||
let tenant_id = timeline.tenant_id;
|
||||
let timeline_id = timeline.timeline_id;
|
||||
|
||||
task_mgr::spawn(
|
||||
task_mgr::BACKGROUND_RUNTIME.handle(),
|
||||
TaskKind::TimelineDeletionWorker,
|
||||
Some(tenant_id),
|
||||
Some(timeline_id),
|
||||
"timeline_delete",
|
||||
false,
|
||||
async move {
|
||||
if let Err(err) = Self::background(guard, conf, &tenant, &timeline).await {
|
||||
error!("Error: {err:#}");
|
||||
timeline.set_broken(format!("{err:#}"))
|
||||
};
|
||||
Ok(())
|
||||
}
|
||||
.instrument({
|
||||
let span =
|
||||
tracing::info_span!(parent: None, "delete_timeline", tenant_id=%tenant_id, timeline_id=%timeline_id);
|
||||
span.follows_from(Span::current());
|
||||
span
|
||||
}),
|
||||
);
|
||||
}
|
||||
|
||||
async fn background(
|
||||
mut guard: DeletionGuard,
|
||||
conf: &PageServerConf,
|
||||
tenant: &Tenant,
|
||||
timeline: &Timeline,
|
||||
) -> Result<(), DeleteTimelineError> {
|
||||
delete_local_layer_files(conf, tenant.tenant_id, timeline).await?;
|
||||
|
||||
delete_remote_layers_and_index(timeline).await?;
|
||||
|
||||
pausable_failpoint!("in_progress_delete");
|
||||
|
||||
cleanup_remaining_timeline_fs_traces(conf, tenant.tenant_id, timeline.timeline_id).await?;
|
||||
|
||||
remove_timeline_from_tenant(tenant, timeline.timeline_id, &guard).await?;
|
||||
|
||||
*guard.0 = Self::Finished;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
struct DeletionGuard(OwnedMutexGuard<DeleteTimelineFlow>);
|
||||
|
||||
impl Deref for DeletionGuard {
|
||||
type Target = DeleteTimelineFlow;
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
&self.0
|
||||
}
|
||||
}
|
||||
|
||||
impl DerefMut for DeletionGuard {
|
||||
fn deref_mut(&mut self) -> &mut Self::Target {
|
||||
&mut self.0
|
||||
}
|
||||
}
|
||||
@@ -328,7 +328,7 @@ fn to_io_error(e: anyhow::Error, context: &str) -> io::Error {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::tenant::blob_io::{BlobCursor, BlobWriter};
|
||||
use crate::tenant::blob_io::BlobWriter;
|
||||
use crate::tenant::block_io::BlockCursor;
|
||||
use rand::{seq::SliceRandom, thread_rng, RngCore};
|
||||
use std::fs;
|
||||
|
||||
@@ -626,17 +626,17 @@ impl LayerMap {
|
||||
|
||||
/// debugging function to print out the contents of the layer map
|
||||
#[allow(unused)]
|
||||
pub fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
|
||||
pub async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
|
||||
println!("Begin dump LayerMap");
|
||||
|
||||
println!("open_layer:");
|
||||
if let Some(open_layer) = &self.open_layer {
|
||||
open_layer.dump(verbose, ctx)?;
|
||||
open_layer.dump(verbose, ctx).await?;
|
||||
}
|
||||
|
||||
println!("frozen_layers:");
|
||||
for frozen_layer in self.frozen_layers.iter() {
|
||||
frozen_layer.dump(verbose, ctx)?;
|
||||
frozen_layer.dump(verbose, ctx).await?;
|
||||
}
|
||||
|
||||
println!("historic_layers:");
|
||||
|
||||
@@ -9,10 +9,11 @@
|
||||
//! [`remote_timeline_client`]: super::remote_timeline_client
|
||||
|
||||
use std::fs::{File, OpenOptions};
|
||||
use std::io::Write;
|
||||
use std::io::{self, Write};
|
||||
|
||||
use anyhow::{bail, ensure, Context};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use thiserror::Error;
|
||||
use tracing::info_span;
|
||||
use utils::bin_ser::SerializeError;
|
||||
use utils::{
|
||||
@@ -267,24 +268,24 @@ pub fn save_metadata(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[derive(Error, Debug)]
|
||||
pub enum LoadMetadataError {
|
||||
#[error(transparent)]
|
||||
Read(#[from] io::Error),
|
||||
|
||||
#[error(transparent)]
|
||||
Decode(#[from] anyhow::Error),
|
||||
}
|
||||
|
||||
pub fn load_metadata(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_id: &TenantId,
|
||||
timeline_id: &TimelineId,
|
||||
) -> anyhow::Result<TimelineMetadata> {
|
||||
) -> Result<TimelineMetadata, LoadMetadataError> {
|
||||
let metadata_path = conf.metadata_path(tenant_id, timeline_id);
|
||||
let metadata_bytes = std::fs::read(&metadata_path).with_context(|| {
|
||||
format!(
|
||||
"Failed to read metadata bytes from path {}",
|
||||
metadata_path.display()
|
||||
)
|
||||
})?;
|
||||
TimelineMetadata::from_bytes(&metadata_bytes).with_context(|| {
|
||||
format!(
|
||||
"Failed to parse metadata bytes from path {}",
|
||||
metadata_path.display()
|
||||
)
|
||||
})
|
||||
let metadata_bytes = std::fs::read(metadata_path)?;
|
||||
|
||||
Ok(TimelineMetadata::from_bytes(&metadata_bytes)?)
|
||||
}
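With the error split into `Read` and `Decode`, a caller can detect a missing metadata file without string matching. A hypothetical caller-side check (function name is illustrative):

fn metadata_file_is_missing(err: &LoadMetadataError) -> bool {
    // A NotFound read error means "no metadata on disk", as opposed to a
    // Decode error, which means the file exists but is corrupt.
    matches!(err, LoadMetadataError::Read(e) if e.kind() == std::io::ErrorKind::NotFound)
}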
|
||||
|
||||
#[cfg(test)]
|
||||
|
||||
@@ -26,6 +26,8 @@ use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME};
|
||||
use utils::fs_ext::PathExt;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
use super::delete::DeleteTimelineFlow;
|
||||
|
||||
/// The tenants known to the pageserver.
|
||||
/// The enum variants are used to distinguish the different states that the pageserver can be in.
|
||||
enum TenantsMap {
|
||||
@@ -233,11 +235,17 @@ pub fn schedule_local_tenant_processing(
|
||||
/// That could be easily misinterpreted by control plane, the consumer of the
|
||||
/// management API. For example, it could attach the tenant on a different pageserver.
|
||||
/// We would then be in split-brain once this pageserver restarts.
|
||||
#[instrument]
|
||||
#[instrument(skip_all)]
|
||||
pub async fn shutdown_all_tenants() {
|
||||
shutdown_all_tenants0(&TENANTS).await
|
||||
}
|
||||
|
||||
async fn shutdown_all_tenants0(tenants: &tokio::sync::RwLock<TenantsMap>) {
|
||||
use utils::completion;
|
||||
|
||||
// Prevent new tenants from being created.
|
||||
let tenants_to_shut_down = {
|
||||
let mut m = TENANTS.write().await;
|
||||
let mut m = tenants.write().await;
|
||||
match &mut *m {
|
||||
TenantsMap::Initializing => {
|
||||
*m = TenantsMap::ShuttingDown(HashMap::default());
|
||||
@@ -262,14 +270,41 @@ pub async fn shutdown_all_tenants() {
|
||||
for (tenant_id, tenant) in tenants_to_shut_down {
|
||||
join_set.spawn(
|
||||
async move {
|
||||
let freeze_and_flush = true;
|
||||
// ordering shouldn't matter for this, either we store true right away or never
|
||||
let ordering = std::sync::atomic::Ordering::Relaxed;
|
||||
let joined_other = std::sync::atomic::AtomicBool::new(false);
|
||||
|
||||
match tenant.shutdown(freeze_and_flush).await {
|
||||
Ok(()) => debug!("tenant successfully stopped"),
|
||||
Err(super::ShutdownError::AlreadyStopping) => {
|
||||
warn!("tenant was already shutting down")
|
||||
let mut shutdown = std::pin::pin!(async {
|
||||
let freeze_and_flush = true;
|
||||
|
||||
let res = {
|
||||
let (_guard, shutdown_progress) = completion::channel();
|
||||
tenant.shutdown(shutdown_progress, freeze_and_flush).await
|
||||
};
|
||||
|
||||
if let Err(other_progress) = res {
|
||||
// join the other shutdown already in progress
|
||||
joined_other.store(true, ordering);
|
||||
other_progress.wait().await;
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
// in practice we might not have a lot of time to go, since systemd is going to
|
||||
// SIGKILL us at 10s, but we can try. delete tenant might take a while, so put out
|
||||
// a warning.
|
||||
let warning = std::time::Duration::from_secs(5);
|
||||
let mut warning = std::pin::pin!(tokio::time::sleep(warning));
|
||||
|
||||
tokio::select! {
|
||||
_ = &mut shutdown => {},
|
||||
_ = &mut warning => {
|
||||
let joined_other = joined_other.load(ordering);
|
||||
warn!(%joined_other, "waiting for the shutdown to complete");
|
||||
shutdown.await;
|
||||
}
|
||||
};
|
||||
|
||||
debug!("tenant successfully stopped");
|
||||
}
|
||||
.instrument(info_span!("shutdown", %tenant_id)),
|
||||
);
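The shutdown future above is pinned so it can be polled again after the warning branch fires. A standalone sketch of that race-then-keep-waiting pattern (names are illustrative):

use std::time::Duration;

async fn run_with_warning<F: std::future::Future<Output = ()>>(fut: F, name: &str) {
    let mut fut = std::pin::pin!(fut);
    let mut warning = std::pin::pin!(tokio::time::sleep(Duration::from_secs(5)));
    tokio::select! {
        _ = &mut fut => {}
        _ = &mut warning => {
            // warn once, then keep waiting for the original future
            println!("still waiting for {name} to complete");
            fut.await;
        }
    }
}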
|
||||
@@ -388,12 +423,10 @@ pub enum DeleteTimelineError {
|
||||
pub async fn delete_timeline(
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
ctx: &RequestContext,
|
||||
_ctx: &RequestContext,
|
||||
) -> Result<(), DeleteTimelineError> {
|
||||
let tenant = get_tenant(tenant_id, true).await?;
|
||||
tenant
|
||||
.prepare_and_schedule_delete_timeline(timeline_id, ctx)
|
||||
.await?;
|
||||
DeleteTimelineFlow::run(&tenant, timeline_id).await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -413,6 +446,15 @@ pub async fn detach_tenant(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_id: TenantId,
|
||||
detach_ignored: bool,
|
||||
) -> Result<(), TenantStateError> {
|
||||
detach_tenant0(conf, &TENANTS, tenant_id, detach_ignored).await
|
||||
}
|
||||
|
||||
async fn detach_tenant0(
|
||||
conf: &'static PageServerConf,
|
||||
tenants: &tokio::sync::RwLock<TenantsMap>,
|
||||
tenant_id: TenantId,
|
||||
detach_ignored: bool,
|
||||
) -> Result<(), TenantStateError> {
|
||||
let local_files_cleanup_operation = |tenant_id_to_clean| async move {
|
||||
let local_tenant_directory = conf.tenant_path(&tenant_id_to_clean);
|
||||
@@ -425,7 +467,8 @@ pub async fn detach_tenant(
|
||||
};
|
||||
|
||||
let removal_result =
|
||||
remove_tenant_from_memory(tenant_id, local_files_cleanup_operation(tenant_id)).await;
|
||||
remove_tenant_from_memory(tenants, tenant_id, local_files_cleanup_operation(tenant_id))
|
||||
.await;
|
||||
|
||||
// Ignored tenants are not present in memory and will bail the removal from memory operation.
|
||||
// Before returning the error, check for ignored tenant removal case — we only need to clean its local files then.
|
||||
@@ -472,7 +515,15 @@ pub async fn ignore_tenant(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_id: TenantId,
|
||||
) -> Result<(), TenantStateError> {
|
||||
remove_tenant_from_memory(tenant_id, async {
|
||||
ignore_tenant0(conf, &TENANTS, tenant_id).await
|
||||
}
|
||||
|
||||
async fn ignore_tenant0(
|
||||
conf: &'static PageServerConf,
|
||||
tenants: &tokio::sync::RwLock<TenantsMap>,
|
||||
tenant_id: TenantId,
|
||||
) -> Result<(), TenantStateError> {
|
||||
remove_tenant_from_memory(tenants, tenant_id, async {
|
||||
let ignore_mark_file = conf.tenant_ignore_mark_file_path(&tenant_id);
|
||||
fs::File::create(&ignore_mark_file)
|
||||
.await
|
||||
@@ -597,18 +648,21 @@ where
|
||||
/// If the cleanup fails, tenant will stay in memory in [`TenantState::Broken`] state, and another removal
|
||||
/// operation would be needed to remove it.
|
||||
async fn remove_tenant_from_memory<V, F>(
|
||||
tenants: &tokio::sync::RwLock<TenantsMap>,
|
||||
tenant_id: TenantId,
|
||||
tenant_cleanup: F,
|
||||
) -> Result<V, TenantStateError>
|
||||
where
|
||||
F: std::future::Future<Output = anyhow::Result<V>>,
|
||||
{
|
||||
use utils::completion;
|
||||
|
||||
// It's important to keep the tenant in memory after the final cleanup, to avoid cleanup races.
|
||||
// The exclusive lock here ensures we don't miss the tenant state updates before trying another removal.
|
||||
// tenant-wide cleanup operations may take some time (removing the entire tenant directory), so we want to
|
||||
// avoid holding the lock for the entire process.
|
||||
let tenant = {
|
||||
TENANTS
|
||||
tenants
|
||||
.write()
|
||||
.await
|
||||
.get(&tenant_id)
|
||||
@@ -616,14 +670,20 @@ where
|
||||
.ok_or(TenantStateError::NotFound(tenant_id))?
|
||||
};
|
||||
|
||||
// allow pageserver shutdown to await for our completion
|
||||
let (_guard, progress) = completion::channel();
|
||||
|
||||
// whenever we remove a tenant from memory, we don't want to flush and wait for upload
|
||||
let freeze_and_flush = false;
|
||||
|
||||
// shutdown is sure to transition tenant to stopping, and wait for all tasks to complete, so
|
||||
// that we can continue safely to cleanup.
|
||||
match tenant.shutdown(freeze_and_flush).await {
|
||||
match tenant.shutdown(progress, freeze_and_flush).await {
|
||||
Ok(()) => {}
|
||||
Err(super::ShutdownError::AlreadyStopping) => {
|
||||
return Err(TenantStateError::IsStopping(tenant_id))
|
||||
Err(_other) => {
|
||||
// if pageserver shutdown or other detach/ignore is already ongoing, we don't want to
|
||||
// wait for it but return an error right away because these are distinct requests.
|
||||
return Err(TenantStateError::IsStopping(tenant_id));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -632,14 +692,14 @@ where
|
||||
.with_context(|| format!("Failed to run cleanup for tenant {tenant_id}"))
|
||||
{
|
||||
Ok(hook_value) => {
|
||||
let mut tenants_accessor = TENANTS.write().await;
|
||||
let mut tenants_accessor = tenants.write().await;
|
||||
if tenants_accessor.remove(&tenant_id).is_none() {
|
||||
warn!("Tenant {tenant_id} got removed from memory before operation finished");
|
||||
}
|
||||
Ok(hook_value)
|
||||
}
|
||||
Err(e) => {
|
||||
let tenants_accessor = TENANTS.read().await;
|
||||
let tenants_accessor = tenants.read().await;
|
||||
match tenants_accessor.get(&tenant_id) {
|
||||
Some(tenant) => {
|
||||
tenant.set_broken(e.to_string()).await;
|
||||
@@ -708,51 +768,108 @@ pub async fn immediate_gc(
|
||||
Ok(wait_task_done)
|
||||
}
|
||||
|
||||
pub async fn immediate_compact(
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<tokio::sync::oneshot::Receiver<anyhow::Result<()>>, ApiError> {
|
||||
let guard = TENANTS.read().await;
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::collections::HashMap;
|
||||
use std::sync::Arc;
|
||||
use tracing::{info_span, Instrument};
|
||||
|
||||
let tenant = guard
|
||||
.get(&tenant_id)
|
||||
.map(Arc::clone)
|
||||
.with_context(|| format!("tenant {tenant_id}"))
|
||||
.map_err(|e| ApiError::NotFound(e.into()))?;
|
||||
use super::{super::harness::TenantHarness, TenantsMap};
|
||||
|
||||
let timeline = tenant
|
||||
.get_timeline(timeline_id, true)
|
||||
.map_err(|e| ApiError::NotFound(e.into()))?;
|
||||
#[tokio::test(start_paused = true)]
|
||||
async fn shutdown_joins_remove_tenant_from_memory() {
|
||||
// the test is a bit ugly with the lockstep together with spawned tasks. the aim is to make
|
||||
// sure `shutdown_all_tenants0` per-tenant processing joins in any active
|
||||
// remove_tenant_from_memory calls, which is enforced by making the operation last until
|
||||
// we've run `shutdown_all_tenants0` for a long time.
|
||||
|
||||
// Run in task_mgr to avoid race with tenant_detach operation
|
||||
let ctx = ctx.detached_child(TaskKind::Compaction, DownloadBehavior::Download);
|
||||
let (task_done, wait_task_done) = tokio::sync::oneshot::channel();
|
||||
task_mgr::spawn(
|
||||
&tokio::runtime::Handle::current(),
|
||||
TaskKind::Compaction,
|
||||
Some(tenant_id),
|
||||
Some(timeline_id),
|
||||
&format!(
|
||||
"timeline_compact_handler compaction run for tenant {tenant_id} timeline {timeline_id}"
|
||||
),
|
||||
false,
|
||||
async move {
|
||||
let result = timeline
|
||||
.compact(&ctx)
|
||||
.instrument(info_span!("manual_compact", %tenant_id, %timeline_id))
|
||||
.await;
|
||||
let (t, _ctx) = TenantHarness::create("shutdown_joins_detach")
|
||||
.unwrap()
|
||||
.load()
|
||||
.await;
|
||||
|
||||
match task_done.send(result) {
|
||||
Ok(_) => (),
|
||||
Err(result) => error!("failed to send compaction result: {result:?}"),
|
||||
}
|
||||
Ok(())
|
||||
},
|
||||
);
|
||||
// harness loads it to active, which is forced and nothing is running on the tenant
|
||||
|
||||
// drop the guard until after we've spawned the task so that timeline shutdown will wait for the task
|
||||
drop(guard);
|
||||
let id = t.tenant_id();
|
||||
|
||||
Ok(wait_task_done)
|
||||
// tenant harness configures the logging and we cannot escape it
|
||||
let _e = info_span!("testing", tenant_id = %id).entered();
|
||||
|
||||
let tenants = HashMap::from([(id, t.clone())]);
|
||||
let tenants = Arc::new(tokio::sync::RwLock::new(TenantsMap::Open(tenants)));
|
||||
|
||||
let (until_cleanup_completed, can_complete_cleanup) = utils::completion::channel();
|
||||
let (until_cleanup_started, cleanup_started) = utils::completion::channel();
|
||||
|
||||
// start a "detaching operation", which will take a while, until can_complete_cleanup
|
||||
let cleanup_task = {
|
||||
let jh = tokio::spawn({
|
||||
let tenants = tenants.clone();
|
||||
async move {
|
||||
let cleanup = async move {
|
||||
drop(until_cleanup_started);
|
||||
can_complete_cleanup.wait().await;
|
||||
anyhow::Ok(())
|
||||
};
|
||||
super::remove_tenant_from_memory(&tenants, id, cleanup).await
|
||||
}
|
||||
.instrument(info_span!("foobar", tenant_id = %id))
|
||||
});
|
||||
|
||||
// now the long cleanup should be in place, with the stopping state
|
||||
cleanup_started.wait().await;
|
||||
jh
|
||||
};
|
||||
|
||||
let mut cleanup_progress = std::pin::pin!(t
|
||||
.shutdown(utils::completion::Barrier::default(), false)
|
||||
.await
|
||||
.unwrap_err()
|
||||
.wait());
|
||||
|
||||
let mut shutdown_task = {
|
||||
let (until_shutdown_started, shutdown_started) = utils::completion::channel();
|
||||
|
||||
let shutdown_task = tokio::spawn(async move {
|
||||
drop(until_shutdown_started);
|
||||
super::shutdown_all_tenants0(&tenants).await;
|
||||
});
|
||||
|
||||
shutdown_started.wait().await;
|
||||
shutdown_task
|
||||
};
|
||||
|
||||
// if the joining in is removed from shutdown_all_tenants0, the shutdown_task should always
|
||||
// get to complete within timeout and fail the test. it is expected to continue awaiting
|
||||
// until completion or SIGKILL during normal shutdown.
|
||||
//
|
||||
// the timeout is long to cover anything that shutdown_task could be doing, but it is
|
||||
// handled instantly because we use tokio's time pausing in this test. 100s is much more than
|
||||
// what we get from systemd on shutdown (10s).
|
||||
let long_time = std::time::Duration::from_secs(100);
|
||||
tokio::select! {
|
||||
_ = &mut shutdown_task => unreachable!("shutdown must continue, until_cleanup_completed is not dropped"),
|
||||
_ = &mut cleanup_progress => unreachable!("cleanup progress must continue, until_cleanup_completed is not dropped"),
|
||||
_ = tokio::time::sleep(long_time) => {},
|
||||
}
|
||||
|
||||
// allow the remove_tenant_from_memory and thus eventually the shutdown to continue
|
||||
drop(until_cleanup_completed);
|
||||
|
||||
let (je, ()) = tokio::join!(shutdown_task, cleanup_progress);
|
||||
je.expect("Tenant::shutdown shutdown not have panicked");
|
||||
cleanup_task
|
||||
.await
|
||||
.expect("no panicking")
|
||||
.expect("remove_tenant_from_memory failed");
|
||||
|
||||
futures::future::poll_immediate(
|
||||
t.shutdown(utils::completion::Barrier::default(), false)
|
||||
.await
|
||||
.unwrap_err()
|
||||
.wait(),
|
||||
)
|
||||
.await
|
||||
.expect("the stopping progress must still be complete");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -514,7 +514,7 @@ impl RemoteTimelineClient {
|
||||
/// updated metadata.
|
||||
///
|
||||
/// The upload will be added to the queue immediately, but it
|
||||
/// won't be performed until all previosuly scheduled layer file
|
||||
/// won't be performed until all previously scheduled layer file
|
||||
/// upload operations have completed successfully. This is to
|
||||
/// ensure that when the index file claims that layers X, Y and Z
|
||||
/// exist in remote storage, they really do. To wait for the upload
|
||||
@@ -625,7 +625,7 @@ impl RemoteTimelineClient {
|
||||
/// Note: This schedules an index file upload before the deletions. The
|
||||
/// deletion won't actually be performed, until any previously scheduled
|
||||
/// upload operations, and the index file upload, have completed
|
||||
/// succesfully.
|
||||
/// successfully.
|
||||
pub fn schedule_layer_file_deletion(
|
||||
self: &Arc<Self>,
|
||||
names: &[LayerFileName],
|
||||
@@ -827,7 +827,7 @@ impl RemoteTimelineClient {
|
||||
)
|
||||
};
|
||||
|
||||
receiver.changed().await?;
|
||||
receiver.changed().await.context("upload queue shut down")?;
|
||||
|
||||
// Do not delete index part yet, it is needed for possible retry. If we remove it first
|
||||
// and the retry arrives at a different pageserver, there won't be any traces of it in remote storage
|
||||
@@ -855,11 +855,23 @@ impl RemoteTimelineClient {
|
||||
self.storage_impl.delete_objects(&remaining).await?;
|
||||
}
|
||||
|
||||
fail::fail_point!("timeline-delete-before-index-delete", |_| {
|
||||
Err(anyhow::anyhow!(
|
||||
"failpoint: timeline-delete-before-index-delete"
|
||||
))?
|
||||
});
|
||||
|
||||
let index_file_path = timeline_storage_path.join(Path::new(IndexPart::FILE_NAME));
|
||||
|
||||
debug!("deleting index part");
|
||||
self.storage_impl.delete(&index_file_path).await?;
|
||||
|
||||
fail::fail_point!("timeline-delete-after-index-delete", |_| {
|
||||
Err(anyhow::anyhow!(
|
||||
"failpoint: timeline-delete-after-index-delete"
|
||||
))?
|
||||
});
|
||||
|
||||
info!(prefix=%timeline_storage_path, referenced=deletions_queued, not_referenced=%remaining.len(), "done deleting in timeline prefix, including index_part.json");
|
||||
|
||||
Ok(())
|
||||
@@ -1105,7 +1117,7 @@ impl RemoteTimelineClient {
|
||||
debug!("remote task {} completed successfully", task.op);
|
||||
}
|
||||
|
||||
// The task has completed succesfully. Remove it from the in-progress list.
|
||||
// The task has completed successfully. Remove it from the in-progress list.
|
||||
{
|
||||
let mut upload_queue_guard = self.upload_queue.lock().unwrap();
|
||||
let upload_queue = match upload_queue_guard.deref_mut() {
|
||||
|
||||
@@ -338,7 +338,8 @@ impl LayerAccessStats {
|
||||
/// All layers should implement a minimal `std::fmt::Debug` without tenant or
|
||||
/// timeline names, because those are known in the context of which the layers
|
||||
/// are used in (timeline).
|
||||
pub trait Layer: std::fmt::Debug + std::fmt::Display + Send + Sync {
|
||||
#[async_trait::async_trait]
|
||||
pub trait Layer: std::fmt::Debug + std::fmt::Display + Send + Sync + 'static {
|
||||
/// Range of keys that this layer covers
|
||||
fn get_key_range(&self) -> Range<Key>;
|
||||
|
||||
@@ -368,7 +369,7 @@ pub trait Layer: std::fmt::Debug + std::fmt::Display + Send + Sync {
|
||||
/// is available. If this returns ValueReconstructResult::Continue, look up
|
||||
/// the predecessor layer and call again with the same 'reconstruct_data' to
|
||||
/// collect more data.
|
||||
fn get_value_reconstruct_data(
|
||||
async fn get_value_reconstruct_data(
|
||||
&self,
|
||||
key: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
@@ -377,7 +378,7 @@ pub trait Layer: std::fmt::Debug + std::fmt::Display + Send + Sync {
|
||||
) -> Result<ValueReconstructResult>;
|
||||
|
||||
/// Dump summary of the contents of the layer to stdout
|
||||
fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()>;
|
||||
async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()>;
|
||||
}
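Because trait methods like `dump` and `get_value_reconstruct_data` are now async, the trait goes through the `async_trait` crate so it stays usable behind `dyn`. A reduced, self-contained sketch of that shape (types and bodies are illustrative, not the pageserver's):

use async_trait::async_trait;

#[async_trait]
trait ExampleLayer: Send + Sync + 'static {
    async fn dump(&self, verbose: bool) -> anyhow::Result<()>;
}

struct DummyLayer;

#[async_trait]
impl ExampleLayer for DummyLayer {
    async fn dump(&self, verbose: bool) -> anyhow::Result<()> {
        println!("dummy layer, verbose={verbose}");
        Ok(())
    }
}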
|
||||
|
||||
/// Returned by [`PersistentLayer::iter`]
|
||||
@@ -442,6 +443,10 @@ pub trait PersistentLayer: Layer + AsLayerDesc {
|
||||
None
|
||||
}
|
||||
|
||||
fn downcast_delta_layer(self: Arc<Self>) -> Option<std::sync::Arc<DeltaLayer>> {
|
||||
None
|
||||
}
|
||||
|
||||
fn is_remote_layer(&self) -> bool {
|
||||
false
|
||||
}
|
||||
|
||||
@@ -31,7 +31,7 @@ use crate::config::PageServerConf;
|
||||
use crate::context::RequestContext;
|
||||
use crate::page_cache::{PageReadGuard, PAGE_SZ};
|
||||
use crate::repository::{Key, Value, KEY_SIZE};
|
||||
use crate::tenant::blob_io::{BlobCursor, BlobWriter, WriteBlobWriter};
|
||||
use crate::tenant::blob_io::{BlobWriter, WriteBlobWriter};
|
||||
use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockReader, FileBlockReader};
|
||||
use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
|
||||
use crate::tenant::storage_layer::{
|
||||
@@ -51,6 +51,7 @@ use std::io::{Seek, SeekFrom};
|
||||
use std::ops::Range;
|
||||
use std::os::unix::fs::FileExt;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::Arc;
|
||||
use tracing::*;
|
||||
|
||||
use utils::{
|
||||
@@ -222,9 +223,10 @@ impl std::fmt::Debug for DeltaLayerInner {
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl Layer for DeltaLayer {
|
||||
/// debugging function to print out the contents of the layer
|
||||
fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
|
||||
async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
|
||||
println!(
|
||||
"----- delta layer for ten {} tli {} keys {}-{} lsn {}-{} size {} ----",
|
||||
self.desc.tenant_id,
|
||||
@@ -299,7 +301,7 @@ impl Layer for DeltaLayer {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn get_value_reconstruct_data(
|
||||
async fn get_value_reconstruct_data(
|
||||
&self,
|
||||
key: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
@@ -414,6 +416,10 @@ impl AsLayerDesc for DeltaLayer {
|
||||
}
|
||||
|
||||
impl PersistentLayer for DeltaLayer {
|
||||
fn downcast_delta_layer(self: Arc<Self>) -> Option<std::sync::Arc<DeltaLayer>> {
|
||||
Some(self)
|
||||
}
|
||||
|
||||
fn local_path(&self) -> Option<PathBuf> {
|
||||
Some(self.path())
|
||||
}
|
||||
|
||||
@@ -27,7 +27,7 @@ use crate::config::PageServerConf;
|
||||
use crate::context::RequestContext;
|
||||
use crate::page_cache::PAGE_SZ;
|
||||
use crate::repository::{Key, KEY_SIZE};
|
||||
use crate::tenant::blob_io::{BlobCursor, BlobWriter, WriteBlobWriter};
|
||||
use crate::tenant::blob_io::{BlobWriter, WriteBlobWriter};
|
||||
use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader};
|
||||
use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
|
||||
use crate::tenant::storage_layer::{
|
||||
@@ -38,6 +38,7 @@ use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX};
|
||||
use anyhow::{bail, ensure, Context, Result};
|
||||
use bytes::Bytes;
|
||||
use hex;
|
||||
use once_cell::sync::OnceCell;
|
||||
use pageserver_api::models::{HistoricLayerInfo, LayerAccessKind};
|
||||
use rand::{distributions::Alphanumeric, Rng};
|
||||
use serde::{Deserialize, Serialize};
|
||||
@@ -47,7 +48,6 @@ use std::io::{Seek, SeekFrom};
|
||||
use std::ops::Range;
|
||||
use std::os::unix::prelude::FileExt;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::{RwLock, RwLockReadGuard};
|
||||
use tracing::*;
|
||||
|
||||
use utils::{
|
||||
@@ -117,7 +117,7 @@ pub struct ImageLayer {
|
||||
|
||||
access_stats: LayerAccessStats,
|
||||
|
||||
inner: RwLock<ImageLayerInner>,
|
||||
inner: OnceCell<ImageLayerInner>,
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for ImageLayer {
|
||||
@@ -134,30 +134,27 @@ impl std::fmt::Debug for ImageLayer {
|
||||
}
|
||||
|
||||
pub struct ImageLayerInner {
|
||||
/// If false, the 'index' has not been loaded into memory yet.
|
||||
loaded: bool,
|
||||
|
||||
// values copied from summary
|
||||
index_start_blk: u32,
|
||||
index_root_blk: u32,
|
||||
|
||||
/// Reader object for reading blocks from the file. (None if not loaded yet)
|
||||
file: Option<FileBlockReader<VirtualFile>>,
|
||||
/// Reader object for reading blocks from the file.
|
||||
file: FileBlockReader<VirtualFile>,
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for ImageLayerInner {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.debug_struct("ImageLayerInner")
|
||||
.field("loaded", &self.loaded)
|
||||
.field("index_start_blk", &self.index_start_blk)
|
||||
.field("index_root_blk", &self.index_root_blk)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl Layer for ImageLayer {
|
||||
/// debugging function to print out the contents of the layer
|
||||
fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
|
||||
async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
|
||||
println!(
|
||||
"----- image layer for ten {} tli {} key {}-{} at {} is_incremental {} size {} ----",
|
||||
self.desc.tenant_id,
|
||||
@@ -174,7 +171,7 @@ impl Layer for ImageLayer {
|
||||
}
|
||||
|
||||
let inner = self.load(LayerAccessKind::Dump, ctx)?;
|
||||
let file = inner.file.as_ref().unwrap();
|
||||
let file = &inner.file;
|
||||
let tree_reader =
|
||||
DiskBtreeReader::<_, KEY_SIZE>::new(inner.index_start_blk, inner.index_root_blk, file);
|
||||
|
||||
@@ -189,7 +186,7 @@ impl Layer for ImageLayer {
|
||||
}
|
||||
|
||||
/// Look up given page in the file
|
||||
fn get_value_reconstruct_data(
|
||||
async fn get_value_reconstruct_data(
|
||||
&self,
|
||||
key: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
@@ -202,7 +199,7 @@ impl Layer for ImageLayer {
|
||||
|
||||
let inner = self.load(LayerAccessKind::GetValueReconstructData, ctx)?;
|
||||
|
||||
let file = inner.file.as_ref().unwrap();
|
||||
let file = &inner.file;
|
||||
let tree_reader = DiskBtreeReader::new(inner.index_start_blk, inner.index_root_blk, file);
|
||||
|
||||
let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
|
||||
@@ -321,52 +318,26 @@ impl ImageLayer {
|
||||
/// Open the underlying file and read the metadata into memory, if it's
|
||||
/// not loaded already.
|
||||
///
|
||||
fn load(
|
||||
&self,
|
||||
access_kind: LayerAccessKind,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<RwLockReadGuard<ImageLayerInner>> {
|
||||
fn load(&self, access_kind: LayerAccessKind, ctx: &RequestContext) -> Result<&ImageLayerInner> {
|
||||
self.access_stats
|
||||
.record_access(access_kind, ctx.task_kind());
|
||||
loop {
|
||||
// Quick exit if already loaded
|
||||
let inner = self.inner.read().unwrap();
|
||||
if inner.loaded {
|
||||
if let Some(inner) = self.inner.get() {
|
||||
return Ok(inner);
|
||||
}
|
||||
|
||||
// Need to open the file and load the metadata. Upgrade our lock to
|
||||
// a write lock. (Or rather, release and re-lock in write mode.)
|
||||
drop(inner);
|
||||
let mut inner = self.inner.write().unwrap();
|
||||
if !inner.loaded {
|
||||
self.load_inner(&mut inner).with_context(|| {
|
||||
format!("Failed to load image layer {}", self.path().display())
|
||||
})?
|
||||
} else {
|
||||
// Another thread loaded it while we were not holding the lock.
|
||||
}
|
||||
|
||||
// We now have the file open and loaded. There's no function to do
|
||||
// that in the std library RwLock, so we have to release and re-lock
|
||||
// in read mode. (To be precise, the lock guard was moved in the
|
||||
// above call to `load_inner`, so it's already been released). And
|
||||
// while we do that, another thread could unload again, so we have
|
||||
// to re-check and retry if that happens.
|
||||
drop(inner);
|
||||
self.inner
|
||||
.get_or_try_init(|| self.load_inner())
|
||||
.with_context(|| format!("Failed to load image layer {}", self.path().display()))?;
|
||||
}
|
||||
}
|
||||
|
||||
fn load_inner(&self, inner: &mut ImageLayerInner) -> Result<()> {
|
||||
fn load_inner(&self) -> Result<ImageLayerInner> {
|
||||
let path = self.path();
|
||||
|
||||
// Open the file if it's not open already.
|
||||
if inner.file.is_none() {
|
||||
let file = VirtualFile::open(&path)
|
||||
.with_context(|| format!("Failed to open file '{}'", path.display()))?;
|
||||
inner.file = Some(FileBlockReader::new(file));
|
||||
}
|
||||
let file = inner.file.as_mut().unwrap();
|
||||
let file = VirtualFile::open(&path)
|
||||
.with_context(|| format!("Failed to open file '{}'", path.display()))?;
|
||||
let file = FileBlockReader::new(file);
|
||||
let summary_blk = file.read_blk(0)?;
|
||||
let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
|
||||
|
||||
@@ -394,10 +365,11 @@ impl ImageLayer {
|
||||
}
|
||||
}
|
||||
|
||||
inner.index_start_blk = actual_summary.index_start_blk;
|
||||
inner.index_root_blk = actual_summary.index_root_blk;
|
||||
inner.loaded = true;
|
||||
Ok(())
|
||||
Ok(ImageLayerInner {
|
||||
index_start_blk: actual_summary.index_start_blk,
|
||||
index_root_blk: actual_summary.index_root_blk,
|
||||
file,
|
||||
})
|
||||
}
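The `load`/`load_inner` rewrite above replaces the hand-rolled `loaded` flag plus `RwLock` release-and-relock dance with `once_cell::sync::OnceCell::get_or_try_init`, which runs the fallible initializer at most once and then hands out a shared reference. A small self-contained sketch of that pattern; the struct and file name are invented:

```rust
use anyhow::{Context, Result};
use once_cell::sync::OnceCell;

struct LazyMetadata {
    inner: OnceCell<String>,
}

impl LazyMetadata {
    fn load(&self) -> Result<&String> {
        // The closure runs at most once; concurrent callers block until the
        // winning initializer finishes and then share the same &String.
        self.inner
            .get_or_try_init(|| std::fs::read_to_string("metadata.txt"))
            .context("failed to load metadata")
    }
}

fn main() -> Result<()> {
    let meta = LazyMetadata { inner: OnceCell::new() };
    // The second call returns the cached value without touching the filesystem again.
    let _ = meta.load();
    let _ = meta.load();
    Ok(())
}
```

One behavioural difference worth noting: a `OnceCell` value is never unloaded, whereas the old `RwLock` design had to re-check for another thread unloading the state in between, which is exactly the retry loop the hunk deletes.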
/// Create an ImageLayer struct representing an existing file on disk
|
||||
@@ -421,12 +393,7 @@ impl ImageLayer {
|
||||
), // Now we assume image layer ALWAYS covers the full range. This may change in the future.
|
||||
lsn: filename.lsn,
|
||||
access_stats,
|
||||
inner: RwLock::new(ImageLayerInner {
|
||||
loaded: false,
|
||||
file: None,
|
||||
index_start_blk: 0,
|
||||
index_root_blk: 0,
|
||||
}),
|
||||
inner: OnceCell::new(),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -453,12 +420,7 @@ impl ImageLayer {
|
||||
), // Now we assume image layer ALWAYS covers the full range. This may change in the future.
|
||||
lsn: summary.lsn,
|
||||
access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
|
||||
inner: RwLock::new(ImageLayerInner {
|
||||
file: None,
|
||||
loaded: false,
|
||||
index_start_blk: 0,
|
||||
index_root_blk: 0,
|
||||
}),
|
||||
inner: OnceCell::new(),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -619,12 +581,7 @@ impl ImageLayerWriterInner {
|
||||
desc,
|
||||
lsn: self.lsn,
|
||||
access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
|
||||
inner: RwLock::new(ImageLayerInner {
|
||||
loaded: false,
|
||||
file: None,
|
||||
index_start_blk,
|
||||
index_root_blk,
|
||||
}),
|
||||
inner: OnceCell::new(),
|
||||
};
|
||||
|
||||
// fsync the file
|
||||
|
||||
@@ -7,7 +7,7 @@
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::RequestContext;
|
||||
use crate::repository::{Key, Value};
|
||||
use crate::tenant::blob_io::{BlobCursor, BlobWriter};
|
||||
use crate::tenant::blob_io::BlobWriter;
|
||||
use crate::tenant::block_io::BlockReader;
|
||||
use crate::tenant::ephemeral_file::EphemeralFile;
|
||||
use crate::tenant::storage_layer::{ValueReconstructResult, ValueReconstructState};
|
||||
@@ -110,6 +110,7 @@ impl InMemoryLayer {
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl Layer for InMemoryLayer {
|
||||
fn get_key_range(&self) -> Range<Key> {
|
||||
Key::MIN..Key::MAX
|
||||
@@ -132,7 +133,7 @@ impl Layer for InMemoryLayer {
|
||||
}
|
||||
|
||||
/// debugging function to print out the contents of the layer
|
||||
fn dump(&self, verbose: bool, _ctx: &RequestContext) -> Result<()> {
|
||||
async fn dump(&self, verbose: bool, _ctx: &RequestContext) -> Result<()> {
|
||||
let inner = self.inner.read().unwrap();
|
||||
|
||||
let end_str = inner
|
||||
@@ -183,7 +184,7 @@ impl Layer for InMemoryLayer {
|
||||
}
|
||||
|
||||
/// Look up given value in the layer.
|
||||
fn get_value_reconstruct_data(
|
||||
async fn get_value_reconstruct_data(
|
||||
&self,
|
||||
key: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
|
||||
@@ -65,8 +65,9 @@ impl std::fmt::Debug for RemoteLayer {
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl Layer for RemoteLayer {
|
||||
fn get_value_reconstruct_data(
|
||||
async fn get_value_reconstruct_data(
|
||||
&self,
|
||||
_key: Key,
|
||||
_lsn_range: Range<Lsn>,
|
||||
@@ -77,7 +78,7 @@ impl Layer for RemoteLayer {
|
||||
}
|
||||
|
||||
/// debugging function to print out the contents of the layer
|
||||
fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> {
|
||||
async fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> {
|
||||
println!(
|
||||
"----- remote layer for ten {} tli {} keys {}-{} lsn {}-{} is_delta {} is_incremental {} size {} ----",
|
||||
self.desc.tenant_id,
|
||||
|
||||
@@ -111,7 +111,7 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
|
||||
Duration::from_secs(10)
|
||||
} else {
|
||||
// Run compaction
|
||||
if let Err(e) = tenant.compaction_iteration(&ctx).await {
|
||||
if let Err(e) = tenant.compaction_iteration(&cancel, &ctx).await {
|
||||
error!("Compaction failed, retrying in {:?}: {e:?}", wait_duration);
|
||||
wait_duration
|
||||
} else {
|
||||
@@ -122,12 +122,12 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
|
||||
warn_when_period_overrun(started_at.elapsed(), period, "compaction");
|
||||
|
||||
// Sleep
|
||||
tokio::select! {
|
||||
_ = cancel.cancelled() => {
|
||||
info!("received cancellation request during idling");
|
||||
break;
|
||||
},
|
||||
_ = tokio::time::sleep(sleep_duration) => {},
|
||||
if tokio::time::timeout(sleep_duration, cancel.cancelled())
|
||||
.await
|
||||
.is_ok()
|
||||
{
|
||||
info!("received cancellation request during idling");
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -196,12 +196,12 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
|
||||
warn_when_period_overrun(started_at.elapsed(), period, "gc");
|
||||
|
||||
// Sleep
|
||||
tokio::select! {
|
||||
_ = cancel.cancelled() => {
|
||||
info!("received cancellation request during idling");
|
||||
break;
|
||||
},
|
||||
_ = tokio::time::sleep(sleep_duration) => {},
|
||||
if tokio::time::timeout(sleep_duration, cancel.cancelled())
|
||||
.await
|
||||
.is_ok()
|
||||
{
|
||||
info!("received cancellation request during idling");
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -263,9 +263,9 @@ pub(crate) async fn random_init_delay(
        rng.gen_range(Duration::ZERO..=period)
    };

    tokio::select! {
        _ = cancel.cancelled() => Err(Cancelled),
        _ = tokio::time::sleep(d) => Ok(()),
    match tokio::time::timeout(d, cancel.cancelled()).await {
        Ok(_) => Err(Cancelled),
        Err(_) => Ok(()),
    }
}
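This hunk, like the compaction and gc loops above, swaps a `tokio::select!` between `cancel.cancelled()` and a sleep for `tokio::time::timeout(d, cancel.cancelled())`: an `Ok` result means cancellation won, `Err(Elapsed)` means the full duration passed. A compact sketch of the idiom (the helper name is invented):

```rust
use std::time::Duration;

use tokio_util::sync::CancellationToken;

/// Sleep for `period`, waking up early if `cancel` fires.
/// Returns true when the sleep was cut short by cancellation.
async fn sleep_or_cancel(period: Duration, cancel: &CancellationToken) -> bool {
    // timeout() resolves to Ok(()) if cancelled() completes within `period`,
    // and to Err(Elapsed) if the deadline passes first.
    tokio::time::timeout(period, cancel.cancelled()).await.is_ok()
}

#[tokio::main]
async fn main() {
    let cancel = CancellationToken::new();
    cancel.cancel();
    // An already-cancelled token returns immediately instead of sleeping a minute.
    assert!(sleep_or_cancel(Duration::from_secs(60), &cancel).await);
}
```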
@@ -24,7 +24,7 @@ use tracing::*;
|
||||
use utils::id::TenantTimelineId;
|
||||
|
||||
use std::cmp::{max, min, Ordering};
|
||||
use std::collections::{BinaryHeap, HashMap};
|
||||
use std::collections::{BinaryHeap, HashMap, HashSet};
|
||||
use std::fs;
|
||||
use std::ops::{Deref, Range};
|
||||
use std::path::{Path, PathBuf};
|
||||
@@ -86,6 +86,7 @@ use self::logical_size::LogicalSize;
|
||||
use self::walreceiver::{WalReceiver, WalReceiverConf};
|
||||
|
||||
use super::config::TenantConf;
|
||||
use super::delete::DeleteTimelineFlow;
|
||||
use super::remote_timeline_client::index::IndexPart;
|
||||
use super::remote_timeline_client::RemoteTimelineClient;
|
||||
use super::storage_layer::{
|
||||
@@ -237,11 +238,10 @@ pub struct Timeline {
|
||||
|
||||
/// Layer removal lock.
|
||||
/// A lock to ensure that no layer of the timeline is removed concurrently by other tasks.
|
||||
/// This lock is acquired in [`Timeline::gc`], [`Timeline::compact`],
|
||||
/// and [`Tenant::delete_timeline`]. This is an `Arc<Mutex>` lock because we need an owned
|
||||
/// This lock is acquired in [`Timeline::gc`] and [`Timeline::compact`].
|
||||
/// This is an `Arc<Mutex>` lock because we need an owned
|
||||
/// lock guard in functions that will be spawned to tokio I/O pool (which requires `'static`).
|
||||
///
|
||||
/// [`Tenant::delete_timeline`]: super::Tenant::delete_timeline
|
||||
/// Note that [`DeleteTimelineFlow`] uses `delete_progress` field.
|
||||
pub(super) layer_removal_cs: Arc<tokio::sync::Mutex<()>>,
|
||||
|
||||
// Needed to ensure that we can't create a branch at a point that was already garbage collected
|
||||
@@ -283,7 +283,7 @@ pub struct Timeline {
|
||||
|
||||
/// Prevent two tasks from deleting the timeline at the same time. If held, the
|
||||
/// timeline is being deleted. If 'true', the timeline has already been deleted.
|
||||
pub delete_lock: Arc<tokio::sync::Mutex<bool>>,
|
||||
pub delete_progress: Arc<tokio::sync::Mutex<DeleteTimelineFlow>>,
|
||||
|
||||
eviction_task_timeline_state: tokio::sync::Mutex<EvictionTaskTimelineState>,
|
||||
|
||||
@@ -334,7 +334,7 @@ pub struct GcInfo {
|
||||
#[derive(thiserror::Error)]
|
||||
pub enum PageReconstructError {
|
||||
#[error(transparent)]
|
||||
Other(#[from] anyhow::Error), // source and Display delegate to anyhow::Error
|
||||
Other(#[from] anyhow::Error),
|
||||
|
||||
/// The operation would require downloading a layer that is missing locally.
|
||||
NeedsDownload(TenantTimelineId, LayerFileName),
|
||||
@@ -475,7 +475,7 @@ impl Timeline {
|
||||
img: cached_page_img,
|
||||
};
|
||||
|
||||
let timer = self.metrics.get_reconstruct_data_time_histo.start_timer();
|
||||
let timer = crate::metrics::GET_RECONSTRUCT_DATA_TIME.start_timer();
|
||||
self.get_reconstruct_data(key, lsn, &mut reconstruct_state, ctx)
|
||||
.await?;
|
||||
timer.stop_and_record();
|
||||
@@ -555,7 +555,7 @@ impl Timeline {
|
||||
"wait_lsn cannot be called in WAL receiver"
|
||||
);
|
||||
|
||||
let _timer = self.metrics.wait_lsn_time_histo.start_timer();
|
||||
let _timer = crate::metrics::WAIT_LSN_TIME.start_timer();
|
||||
|
||||
match self
|
||||
.last_record_lsn
|
||||
@@ -611,9 +611,46 @@ impl Timeline {
|
||||
}
|
||||
|
||||
/// Outermost timeline compaction operation; downloads needed layers.
|
||||
pub async fn compact(self: &Arc<Self>, ctx: &RequestContext) -> anyhow::Result<()> {
|
||||
pub async fn compact(
|
||||
self: &Arc<Self>,
|
||||
cancel: &CancellationToken,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
const ROUNDS: usize = 2;
|
||||
|
||||
        static CONCURRENT_COMPACTIONS: once_cell::sync::Lazy<tokio::sync::Semaphore> =
            once_cell::sync::Lazy::new(|| {
                let total_threads = *task_mgr::BACKGROUND_RUNTIME_WORKER_THREADS;
                let permits = usize::max(
                    1,
                    // while a lot of the work is done on spawn_blocking, we still do
                    // repartitioning in the async context. this should leave us some workers
                    // unblocked for other work, hopefully easing any outside visible
                    // effects of restarts.
                    //
                    // 6/8 is a guess; previously we ran with unlimited 8 and more from
                    // spawn_blocking.
                    (total_threads * 3).checked_div(4).unwrap_or(0),
                );
                assert_ne!(permits, 0, "we will not be adding in permits later");
                assert!(
                    permits < total_threads,
                    "need threads avail for shorter work"
                );
                tokio::sync::Semaphore::new(permits)
            });

        // this wait probably never needs any "long time spent" logging, because we already nag if
        // the compaction task goes over its period (20s), which is quite often in production.
        let _permit = tokio::select! {
            permit = CONCURRENT_COMPACTIONS.acquire() => {
                permit
            },
            _ = cancel.cancelled() => {
                return Ok(());
            }
        };
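The block above gates compaction behind a lazily constructed `tokio::sync::Semaphore` sized to roughly three quarters of the background runtime's worker threads, and bails out early if cancellation wins the race for a permit. A stripped-down sketch of the same gating idea; the thread count and names here are placeholders, not the real configuration:

```rust
use once_cell::sync::Lazy;
use tokio::sync::Semaphore;

// Cap concurrent heavy jobs at ~3/4 of the assumed worker threads, leaving
// some workers free for short-lived tasks.
static CONCURRENCY_LIMIT: Lazy<Semaphore> = Lazy::new(|| {
    let total_threads = 8; // stand-in for the runtime's worker thread count
    let permits = usize::max(1, total_threads * 3 / 4);
    Semaphore::new(permits)
});

async fn run_heavy_job(id: usize) {
    // acquire() waits until a permit is free; dropping the permit releases it.
    let _permit = CONCURRENCY_LIMIT
        .acquire()
        .await
        .expect("semaphore is never closed");
    println!("job {id} running");
}

#[tokio::main]
async fn main() {
    let handles: Vec<_> = (0..16).map(|i| tokio::spawn(run_heavy_job(i))).collect();
    for h in handles {
        h.await.unwrap();
    }
}
```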
let last_record_lsn = self.get_last_record_lsn();
|
||||
|
||||
// Last record Lsn could be zero in case the timeline was just created
|
||||
@@ -671,11 +708,9 @@ impl Timeline {
|
||||
|
||||
let mut failed = 0;
|
||||
|
||||
let mut cancelled = pin!(task_mgr::shutdown_watcher());
|
||||
|
||||
loop {
|
||||
tokio::select! {
|
||||
_ = &mut cancelled => anyhow::bail!("Cancelled while downloading remote layers"),
|
||||
_ = cancel.cancelled() => anyhow::bail!("Cancelled while downloading remote layers"),
|
||||
res = downloads.next() => {
|
||||
match res {
|
||||
Some(Ok(())) => {},
|
||||
@@ -890,7 +925,7 @@ impl Timeline {
|
||||
new_state,
|
||||
TimelineState::Stopping | TimelineState::Broken { .. }
|
||||
) {
|
||||
// drop the copmletion guard, if any; it might be holding off the completion
|
||||
// drop the completion guard, if any; it might be holding off the completion
|
||||
// forever needlessly
|
||||
self.initial_logical_size_attempt
|
||||
.lock()
|
||||
@@ -1011,11 +1046,11 @@ impl Timeline {
|
||||
.evict_layer_batch(remote_client, &[local_layer], cancel)
|
||||
.await?;
|
||||
assert_eq!(results.len(), 1);
|
||||
let result: Option<anyhow::Result<bool>> = results.into_iter().next().unwrap();
|
||||
let result: Option<Result<(), EvictionError>> = results.into_iter().next().unwrap();
|
||||
match result {
|
||||
None => anyhow::bail!("task_mgr shutdown requested"),
|
||||
Some(Ok(b)) => Ok(Some(b)),
|
||||
Some(Err(e)) => Err(e),
|
||||
Some(Ok(())) => Ok(Some(true)),
|
||||
Some(Err(e)) => Err(anyhow::Error::new(e)),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1024,12 +1059,12 @@ impl Timeline {
|
||||
/// GenericRemoteStorage reference is required as a (witness)[witness_article] for "remote storage is configured."
|
||||
///
|
||||
/// [witness_article]: https://willcrichton.net/rust-api-type-patterns/witnesses.html
|
||||
pub async fn evict_layers(
|
||||
pub(crate) async fn evict_layers(
|
||||
&self,
|
||||
_: &GenericRemoteStorage,
|
||||
layers_to_evict: &[Arc<dyn PersistentLayer>],
|
||||
cancel: CancellationToken,
|
||||
) -> anyhow::Result<Vec<Option<anyhow::Result<bool>>>> {
|
||||
) -> anyhow::Result<Vec<Option<Result<(), EvictionError>>>> {
|
||||
let remote_client = self.remote_client.clone().expect(
|
||||
"GenericRemoteStorage is configured, so timeline must have RemoteTimelineClient",
|
||||
);
|
||||
@@ -1064,7 +1099,7 @@ impl Timeline {
|
||||
remote_client: &Arc<RemoteTimelineClient>,
|
||||
layers_to_evict: &[Arc<dyn PersistentLayer>],
|
||||
cancel: CancellationToken,
|
||||
) -> anyhow::Result<Vec<Option<anyhow::Result<bool>>>> {
|
||||
) -> anyhow::Result<Vec<Option<Result<(), EvictionError>>>> {
|
||||
// ensure that the layers have finished uploading
|
||||
// (don't hold the layer_removal_cs while we do it, we're not removing anything yet)
|
||||
remote_client
|
||||
@@ -1110,11 +1145,9 @@ impl Timeline {
|
||||
_layer_removal_cs: &tokio::sync::MutexGuard<'_, ()>,
|
||||
local_layer: &Arc<dyn PersistentLayer>,
|
||||
layer_mgr: &mut LayerManager,
|
||||
) -> anyhow::Result<bool> {
|
||||
) -> Result<(), EvictionError> {
|
||||
if local_layer.is_remote_layer() {
|
||||
// TODO(issue #3851): consider returning an err here instead of false,
|
||||
// which is the same out the match later
|
||||
return Ok(false);
|
||||
return Err(EvictionError::CannotEvictRemoteLayer);
|
||||
}
|
||||
|
||||
let layer_file_size = local_layer.file_size();
|
||||
@@ -1123,13 +1156,22 @@ impl Timeline {
|
||||
.local_path()
|
||||
.expect("local layer should have a local path")
|
||||
.metadata()
|
||||
.context("get local layer file stat")?
|
||||
// when the eviction fails because we have already deleted the layer in compaction for
|
||||
// example, a NotFound error bubbles up from here.
|
||||
.map_err(|e| {
|
||||
if e.kind() == std::io::ErrorKind::NotFound {
|
||||
EvictionError::FileNotFound
|
||||
} else {
|
||||
EvictionError::StatFailed(e)
|
||||
}
|
||||
})?
|
||||
.modified()
|
||||
.context("get mtime of layer file")?;
|
||||
.map_err(EvictionError::StatFailed)?;
|
||||
|
||||
let local_layer_residence_duration =
|
||||
match SystemTime::now().duration_since(local_layer_mtime) {
|
||||
Err(e) => {
|
||||
warn!("layer mtime is in the future: {}", e);
|
||||
warn!(layer = %local_layer, "layer mtime is in the future: {}", e);
|
||||
None
|
||||
}
|
||||
Ok(delta) => Some(delta),
|
||||
@@ -1160,54 +1202,65 @@ impl Timeline {
|
||||
|
||||
assert_eq!(local_layer.layer_desc(), new_remote_layer.layer_desc());
|
||||
|
||||
let succeed = match layer_mgr.replace_and_verify(local_layer.clone(), new_remote_layer) {
|
||||
Ok(()) => {
|
||||
if let Err(e) = local_layer.delete_resident_layer_file() {
|
||||
error!("failed to remove layer file on evict after replacement: {e:#?}");
|
||||
}
|
||||
// Always decrement the physical size gauge, even if we failed to delete the file.
|
||||
// Rationale: we already replaced the layer with a remote layer in the layer map,
|
||||
// and any subsequent download_remote_layer will
|
||||
// 1. overwrite the file on disk and
|
||||
// 2. add the downloaded size to the resident size gauge.
|
||||
//
|
||||
// If there is no re-download, and we restart the pageserver, then load_layer_map
|
||||
// will treat the file as a local layer again, count it towards resident size,
|
||||
// and it'll be like the layer removal never happened.
|
||||
// The bump in resident size is perhaps unexpected but overall a robust behavior.
|
||||
self.metrics
|
||||
.resident_physical_size_gauge
|
||||
.sub(layer_file_size);
|
||||
layer_mgr
|
||||
.replace_and_verify(local_layer.clone(), new_remote_layer)
|
||||
.map_err(EvictionError::LayerNotFound)?;
|
||||
|
||||
self.metrics.evictions.inc();
|
||||
if let Err(e) = local_layer.delete_resident_layer_file() {
|
||||
// this should never happen, because of layer_removal_cs usage and above stat
|
||||
// access for mtime
|
||||
error!("failed to remove layer file on evict after replacement: {e:#?}");
|
||||
}
|
||||
// Always decrement the physical size gauge, even if we failed to delete the file.
|
||||
// Rationale: we already replaced the layer with a remote layer in the layer map,
|
||||
// and any subsequent download_remote_layer will
|
||||
// 1. overwrite the file on disk and
|
||||
// 2. add the downloaded size to the resident size gauge.
|
||||
//
|
||||
// If there is no re-download, and we restart the pageserver, then load_layer_map
|
||||
// will treat the file as a local layer again, count it towards resident size,
|
||||
// and it'll be like the layer removal never happened.
|
||||
// The bump in resident size is perhaps unexpected but overall a robust behavior.
|
||||
self.metrics
|
||||
.resident_physical_size_gauge
|
||||
.sub(layer_file_size);
|
||||
|
||||
if let Some(delta) = local_layer_residence_duration {
|
||||
self.metrics
|
||||
.evictions_with_low_residence_duration
|
||||
.read()
|
||||
.unwrap()
|
||||
.observe(delta);
|
||||
info!(layer=%local_layer, residence_millis=delta.as_millis(), "evicted layer after known residence period");
|
||||
} else {
|
||||
info!(layer=%local_layer, "evicted layer after unknown residence period");
|
||||
}
|
||||
self.metrics.evictions.inc();
|
||||
|
||||
true
|
||||
}
|
||||
Err(err) => {
|
||||
if cfg!(debug_assertions) {
|
||||
panic!("failed to replace: {err}, evicted: {local_layer:?}");
|
||||
} else {
|
||||
error!(evicted=?local_layer, "failed to replace: {err}");
|
||||
}
|
||||
false
|
||||
}
|
||||
};
|
||||
if let Some(delta) = local_layer_residence_duration {
|
||||
self.metrics
|
||||
.evictions_with_low_residence_duration
|
||||
.read()
|
||||
.unwrap()
|
||||
.observe(delta);
|
||||
info!(layer=%local_layer, residence_millis=delta.as_millis(), "evicted layer after known residence period");
|
||||
} else {
|
||||
info!(layer=%local_layer, "evicted layer after unknown residence period");
|
||||
}
|
||||
|
||||
Ok(succeed)
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
pub(crate) enum EvictionError {
    #[error("cannot evict a remote layer")]
    CannotEvictRemoteLayer,
    /// Most likely the to-be-evicted layer has been deleted by compaction or gc, which use the same
    /// locks, so they got to execute before the eviction.
    #[error("file backing the layer has been removed already")]
    FileNotFound,
    #[error("stat failed")]
    StatFailed(#[source] std::io::Error),
    /// In practice, this can be a number of things, but let's assume it means only this.
    ///
    /// This case includes situations such as the Layer having been evicted and redownloaded in between,
    /// because the file existed before a replacement attempt was made but now the Layers are
    /// different objects in memory.
    #[error("layer was no longer part of LayerMap")]
    LayerNotFound(#[source] anyhow::Error),
}
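`EvictionError` leans on `thiserror`: `#[error(...)]` supplies the Display message and `#[source]` chains the underlying cause, which is exactly what the `map_err` in `evict_layer_batch_impl` above feeds into `StatFailed`. A tiny sketch of the same classification idea with hypothetical names:

```rust
use std::io;

use thiserror::Error;

#[derive(Debug, Error)]
enum RemoveError {
    /// An expected race: someone else already deleted the file.
    #[error("file has been removed already")]
    FileNotFound,
    /// Anything else, with the io::Error attached as the source.
    #[error("stat failed")]
    StatFailed(#[source] io::Error),
}

fn classify(e: io::Error) -> RemoveError {
    if e.kind() == io::ErrorKind::NotFound {
        RemoveError::FileNotFound
    } else {
        RemoveError::StatFailed(e)
    }
}

fn main() {
    let err = classify(io::Error::new(io::ErrorKind::PermissionDenied, "denied"));
    // "stat failed" comes from #[error]; "denied" is reachable via Error::source().
    println!("{err}: {:?}", std::error::Error::source(&err).map(|s| s.to_string()));
}
```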
/// Number of times we will compute partition within a checkpoint distance.
|
||||
const REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE: u64 = 10;
|
||||
|
||||
@@ -1307,9 +1360,10 @@ impl Timeline {
|
||||
pg_version: u32,
|
||||
initial_logical_size_can_start: Option<completion::Barrier>,
|
||||
initial_logical_size_attempt: Option<completion::Completion>,
|
||||
state: TimelineState,
|
||||
) -> Arc<Self> {
|
||||
let disk_consistent_lsn = metadata.disk_consistent_lsn();
|
||||
let (state, _) = watch::channel(TimelineState::Loading);
|
||||
let (state, _) = watch::channel(state);
|
||||
|
||||
let (layer_flush_start_tx, _) = tokio::sync::watch::channel(0);
|
||||
let (layer_flush_done_tx, _) = tokio::sync::watch::channel((0, Ok(())));
|
||||
@@ -1400,7 +1454,7 @@ impl Timeline {
|
||||
eviction_task_timeline_state: tokio::sync::Mutex::new(
|
||||
EvictionTaskTimelineState::default(),
|
||||
),
|
||||
delete_lock: Arc::new(tokio::sync::Mutex::new(false)),
|
||||
delete_progress: Arc::new(tokio::sync::Mutex::new(DeleteTimelineFlow::default())),
|
||||
|
||||
initial_logical_size_can_start,
|
||||
initial_logical_size_attempt: Mutex::new(initial_logical_size_attempt),
|
||||
@@ -1865,6 +1919,15 @@ impl Timeline {
|
||||
}
|
||||
|
||||
fn try_spawn_size_init_task(self: &Arc<Self>, lsn: Lsn, ctx: &RequestContext) {
|
||||
let state = self.current_state();
|
||||
if matches!(
|
||||
state,
|
||||
TimelineState::Broken { .. } | TimelineState::Stopping
|
||||
) {
|
||||
// Can happen when timeline detail endpoint is used when deletion is ongoing (or its broken).
|
||||
return;
|
||||
}
|
||||
|
||||
let permit = match Arc::clone(&self.current_logical_size.initial_size_computation)
|
||||
.try_acquire_owned()
|
||||
{
|
||||
@@ -2234,8 +2297,9 @@ impl Timeline {
        let mut timeline_owned;
        let mut timeline = self;

        let mut read_count =
            scopeguard::guard(0, |cnt| self.metrics.read_num_fs_layers.observe(cnt as f64));
        let mut read_count = scopeguard::guard(0, |cnt| {
            crate::metrics::READ_NUM_FS_LAYERS.observe(cnt as f64)
        });
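Both versions of `read_count` rely on `scopeguard::guard`, which wraps a value and runs a closure over it when the guard is dropped, so the number of traversed layers is recorded on every exit path, including early returns and `?`. A minimal sketch:

```rust
fn traverse(layer_count: usize) {
    // The closure runs when `visited` goes out of scope, no matter how we leave.
    let mut visited = scopeguard::guard(0u64, |cnt| {
        println!("traversed {cnt} layers");
    });

    for _ in 0..layer_count {
        *visited += 1; // DerefMut gives access to the wrapped counter
        if *visited == 3 {
            return; // an early return still triggers the guard
        }
    }
}

fn main() {
    traverse(10); // prints "traversed 3 layers"
}
```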
// For debugging purposes, collect the path of layers that we traversed
|
||||
// through. It's included in the error message if we fail to find the key.
|
||||
@@ -2369,12 +2433,15 @@ impl Timeline {
|
||||
// Get all the data needed to reconstruct the page version from this layer.
|
||||
// But if we have an older cached page image, no need to go past that.
|
||||
let lsn_floor = max(cached_lsn + 1, start_lsn);
|
||||
result = match open_layer.get_value_reconstruct_data(
|
||||
key,
|
||||
lsn_floor..cont_lsn,
|
||||
reconstruct_state,
|
||||
ctx,
|
||||
) {
|
||||
result = match open_layer
|
||||
.get_value_reconstruct_data(
|
||||
key,
|
||||
lsn_floor..cont_lsn,
|
||||
reconstruct_state,
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(result) => result,
|
||||
Err(e) => return Err(PageReconstructError::from(e)),
|
||||
};
|
||||
@@ -2396,12 +2463,15 @@ impl Timeline {
|
||||
if cont_lsn > start_lsn {
|
||||
//info!("CHECKING for {} at {} on frozen layer {}", key, cont_lsn, frozen_layer.filename().display());
|
||||
let lsn_floor = max(cached_lsn + 1, start_lsn);
|
||||
result = match frozen_layer.get_value_reconstruct_data(
|
||||
key,
|
||||
lsn_floor..cont_lsn,
|
||||
reconstruct_state,
|
||||
ctx,
|
||||
) {
|
||||
result = match frozen_layer
|
||||
.get_value_reconstruct_data(
|
||||
key,
|
||||
lsn_floor..cont_lsn,
|
||||
reconstruct_state,
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(result) => result,
|
||||
Err(e) => return Err(PageReconstructError::from(e)),
|
||||
};
|
||||
@@ -2432,12 +2502,15 @@ impl Timeline {
|
||||
// Get all the data needed to reconstruct the page version from this layer.
|
||||
// But if we have an older cached page image, no need to go past that.
|
||||
let lsn_floor = max(cached_lsn + 1, lsn_floor);
|
||||
result = match layer.get_value_reconstruct_data(
|
||||
key,
|
||||
lsn_floor..cont_lsn,
|
||||
reconstruct_state,
|
||||
ctx,
|
||||
) {
|
||||
result = match layer
|
||||
.get_value_reconstruct_data(
|
||||
key,
|
||||
lsn_floor..cont_lsn,
|
||||
reconstruct_state,
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(result) => result,
|
||||
Err(e) => return Err(PageReconstructError::from(e)),
|
||||
};
|
||||
@@ -2685,7 +2758,7 @@ impl Timeline {
|
||||
// files instead. This is possible as long as *all* the data imported into the
|
||||
// repository have the same LSN.
|
||||
let lsn_range = frozen_layer.get_lsn_range();
|
||||
let layer_paths_to_upload =
|
||||
let (layer_paths_to_upload, delta_layer_to_add) =
|
||||
if lsn_range.start == self.initdb_lsn && lsn_range.end == Lsn(self.initdb_lsn.0 + 1) {
|
||||
#[cfg(test)]
|
||||
match &mut *self.flush_loop_state.lock().unwrap() {
|
||||
@@ -2704,8 +2777,12 @@ impl Timeline {
|
||||
let (partitioning, _lsn) = self
|
||||
.repartition(self.initdb_lsn, self.get_compaction_target_size(), ctx)
|
||||
.await?;
|
||||
self.create_image_layers(&partitioning, self.initdb_lsn, true, ctx)
|
||||
.await?
|
||||
// For image layers, we add them immediately into the layer map.
|
||||
(
|
||||
self.create_image_layers(&partitioning, self.initdb_lsn, true, ctx)
|
||||
.await?,
|
||||
None,
|
||||
)
|
||||
} else {
|
||||
#[cfg(test)]
|
||||
match &mut *self.flush_loop_state.lock().unwrap() {
|
||||
@@ -2719,35 +2796,50 @@ impl Timeline {
|
||||
assert!(!*expect_initdb_optimization, "expected initdb optimization");
|
||||
}
|
||||
}
|
||||
// normal case, write out a L0 delta layer file.
|
||||
let (delta_path, metadata) = self.create_delta_layer(&frozen_layer).await?;
|
||||
HashMap::from([(delta_path, metadata)])
|
||||
// Normal case, write out a L0 delta layer file.
|
||||
// `create_delta_layer` will not modify the layer map.
|
||||
// We will remove frozen layer and add delta layer in one atomic operation later.
|
||||
let layer = self.create_delta_layer(&frozen_layer).await?;
|
||||
(
|
||||
HashMap::from([(layer.filename(), LayerFileMetadata::new(layer.file_size()))]),
|
||||
Some(layer),
|
||||
)
|
||||
};
|
||||
|
||||
// FIXME: between create_delta_layer and the scheduling of the upload in `update_metadata_file`,
|
||||
// a compaction can delete the file and then it won't be available for uploads any more.
|
||||
// We still schedule the upload, resulting in an error, but ideally we'd somehow avoid this
|
||||
// race situation.
|
||||
// See https://github.com/neondatabase/neon/issues/4526
|
||||
|
||||
pausable_failpoint!("flush-frozen-before-sync");
|
||||
|
||||
// The new on-disk layers are now in the layer map. We can remove the
|
||||
// in-memory layer from the map now. The flushed layer is stored in
|
||||
// the mapping in `create_delta_layer`.
|
||||
{
|
||||
let mut guard = self.layers.write().await;
|
||||
let l = guard.layer_map_mut().frozen_layers.pop_front();
|
||||
|
||||
// Only one thread may call this function at a time (for this
|
||||
// timeline). If two threads tried to flush the same frozen
|
||||
// layer to disk at the same time, that would not work.
|
||||
assert!(compare_arced_layers(&l.unwrap(), &frozen_layer));
|
||||
if let Some(ref l) = delta_layer_to_add {
|
||||
// TODO: move access stats, metrics update, etc. into layer manager.
|
||||
l.access_stats().record_residence_event(
|
||||
&guard,
|
||||
LayerResidenceStatus::Resident,
|
||||
LayerResidenceEventReason::LayerCreate,
|
||||
);
|
||||
|
||||
// update metrics
|
||||
let sz = l.file_size();
|
||||
self.metrics.resident_physical_size_gauge.add(sz);
|
||||
self.metrics.num_persistent_files_created.inc_by(1);
|
||||
self.metrics.persistent_bytes_written.inc_by(sz);
|
||||
}
|
||||
|
||||
guard.finish_flush_l0_layer(delta_layer_to_add, &frozen_layer);
|
||||
// release lock on 'layers'
|
||||
}
|
||||
|
||||
fail_point!("checkpoint-after-sync");
|
||||
// FIXME: between create_delta_layer and the scheduling of the upload in `update_metadata_file`,
|
||||
// a compaction can delete the file and then it won't be available for uploads any more.
|
||||
// We still schedule the upload, resulting in an error, but ideally we'd somehow avoid this
|
||||
// race situation.
|
||||
// See https://github.com/neondatabase/neon/issues/4526
|
||||
pausable_failpoint!("flush-frozen-pausable");
|
||||
|
||||
// This failpoint is used by another test case `test_pageserver_recovery`.
|
||||
fail_point!("flush-frozen-exit");
|
||||
|
||||
// Update the metadata file, with new 'disk_consistent_lsn'
|
||||
//
|
||||
@@ -2829,11 +2921,12 @@ impl Timeline {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Write out the given frozen in-memory layer as a new L0 delta file
|
||||
// Write out the given frozen in-memory layer as a new L0 delta file. This L0 file will not be tracked
|
||||
// in layer map immediately. The caller is responsible to put it into the layer map.
|
||||
async fn create_delta_layer(
|
||||
self: &Arc<Self>,
|
||||
frozen_layer: &Arc<InMemoryLayer>,
|
||||
) -> anyhow::Result<(LayerFileName, LayerFileMetadata)> {
|
||||
) -> anyhow::Result<DeltaLayer> {
|
||||
let span = tracing::info_span!("blocking");
|
||||
let new_delta: DeltaLayer = tokio::task::spawn_blocking({
|
||||
let _g = span.entered();
|
||||
@@ -2870,25 +2963,8 @@ impl Timeline {
|
||||
})
|
||||
.await
|
||||
.context("spawn_blocking")??;
|
||||
let new_delta_name = new_delta.filename();
|
||||
let sz = new_delta.desc.file_size;
|
||||
|
||||
// Add it to the layer map
|
||||
let l = Arc::new(new_delta);
|
||||
let mut guard = self.layers.write().await;
|
||||
l.access_stats().record_residence_event(
|
||||
&guard,
|
||||
LayerResidenceStatus::Resident,
|
||||
LayerResidenceEventReason::LayerCreate,
|
||||
);
|
||||
guard.track_new_l0_delta_layer(l);
|
||||
|
||||
// update metrics
|
||||
self.metrics.resident_physical_size_gauge.add(sz);
|
||||
self.metrics.num_persistent_files_created.inc_by(1);
|
||||
self.metrics.persistent_bytes_written.inc_by(sz);
|
||||
|
||||
Ok((new_delta_name, LayerFileMetadata::new(sz)))
|
||||
Ok(new_delta)
|
||||
}
|
||||
|
||||
async fn repartition(
|
||||
@@ -3140,7 +3216,7 @@ impl Timeline {
|
||||
|
||||
#[derive(Default)]
|
||||
struct CompactLevel0Phase1Result {
|
||||
new_layers: Vec<DeltaLayer>,
|
||||
new_layers: Vec<Arc<DeltaLayer>>,
|
||||
deltas_to_compact: Vec<Arc<PersistentLayerDesc>>,
|
||||
}
|
||||
|
||||
@@ -3318,6 +3394,37 @@ impl Timeline {
|
||||
return Ok(CompactLevel0Phase1Result::default());
|
||||
}
|
||||
|
||||
        // This failpoint is used together with the `test_duplicate_layers` integration test.
        // It returns, as the compaction result, exactly the same layers that were given as input.
        // We want to ensure that this will not cause any problem when updating the layer map
        // after the compaction is finished.
        //
        // Currently, there are two rare edge cases that will cause duplicated layers to be
        // inserted.
        // 1. The compaction job is interrupted / did not finish successfully. Assume we have files 1, 2, 3, 4, which
        //    are compacted to 5, but the page server is shut down; next time we start the page server we will get a layer
        //    map containing 1, 2, 3, 4, and 5, where 5 has the same content as 4. If we trigger L0 compaction at this
        //    point again, it is likely that we will get a file 6 which has the same content and key range as 5,
        //    and this causes an overwrite. This is acceptable because the content is the same, and we should do a
        //    layer replace instead of the normal remove / upload process.
        // 2. The input workload pattern creates exactly n files that are sorted, non-overlapping and of the target file
        //    size. Compaction will likely create the same set of n files afterwards.
        //
        // This failpoint is a superset of both of the cases.
        fail_point!("compact-level0-phase1-return-same", |_| {
            println!("compact-level0-phase1-return-same"); // so that we can check if we hit the failpoint
            Ok(CompactLevel0Phase1Result {
                new_layers: level0_deltas
                    .iter()
                    .map(|x| x.clone().downcast_delta_layer().unwrap())
                    .collect(),
                deltas_to_compact: level0_deltas
                    .iter()
                    .map(|x| x.layer_desc().clone().into())
                    .collect(),
            })
        });
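The failpoint body maps each level-0 delta back to a concrete `DeltaLayer` via `downcast_delta_layer`, the `Arc<Self>`-receiver method from the top of this diff: every implementor either hands back `Some(self)` or `None`. A toy version of that manual downcast pattern, with invented type names:

```rust
use std::sync::Arc;

struct DeltaFile;
struct ImageFile;

trait Layer {
    /// Each implementor either returns itself or None, which gives a cheap
    /// "downcast" from a trait object without reaching for `std::any::Any`.
    fn downcast_delta(self: Arc<Self>) -> Option<Arc<DeltaFile>>;
}

impl Layer for DeltaFile {
    fn downcast_delta(self: Arc<Self>) -> Option<Arc<DeltaFile>> {
        Some(self)
    }
}

impl Layer for ImageFile {
    fn downcast_delta(self: Arc<Self>) -> Option<Arc<DeltaFile>> {
        None
    }
}

fn main() {
    let layers: Vec<Arc<dyn Layer>> = vec![Arc::new(DeltaFile), Arc::new(ImageFile)];
    let deltas = layers.into_iter().filter_map(|l| l.downcast_delta()).count();
    assert_eq!(deltas, 1);
}
```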
// Gather the files to compact in this iteration.
|
||||
//
|
||||
// Start with the oldest Level 0 delta file, and collect any other
|
||||
@@ -3400,7 +3507,7 @@ impl Timeline {
|
||||
let mut prev: Option<Key> = None;
|
||||
for (next_key, _next_lsn, _size) in itertools::process_results(
|
||||
deltas_to_compact.iter().map(|l| l.key_iter(ctx)),
|
||||
|iter_iter| iter_iter.kmerge_by(|a, b| a.0 <= b.0),
|
||||
|iter_iter| iter_iter.kmerge_by(|a, b| a.0 < b.0),
|
||||
)? {
|
||||
if let Some(prev_key) = prev {
|
||||
// just first fast filter
|
||||
@@ -3440,11 +3547,7 @@ impl Timeline {
|
||||
iter_iter.kmerge_by(|a, b| {
|
||||
if let Ok((a_key, a_lsn, _)) = a {
|
||||
if let Ok((b_key, b_lsn, _)) = b {
|
||||
match a_key.cmp(b_key) {
|
||||
Ordering::Less => true,
|
||||
Ordering::Equal => a_lsn <= b_lsn,
|
||||
Ordering::Greater => false,
|
||||
}
|
||||
(a_key, a_lsn) < (b_key, b_lsn)
|
||||
} else {
|
||||
false
|
||||
}
|
||||
@@ -3462,11 +3565,7 @@ impl Timeline {
                iter_iter.kmerge_by(|a, b| {
                    let (a_key, a_lsn, _) = a;
                    let (b_key, b_lsn, _) = b;
                    match a_key.cmp(b_key) {
                        Ordering::Less => true,
                        Ordering::Equal => a_lsn <= b_lsn,
                        Ordering::Greater => false,
                    }
                    (a_key, a_lsn) < (b_key, b_lsn)
                })
            },
        )?;
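Both k-merge hunks replace the hand-written `match` on `a_key.cmp(b_key)` with a lexicographic tuple comparison, and the earlier key-only predicate also tightens `<=` to `<` so the closure is a strict "less than", which is the comparator shape `kmerge_by` expects. A runnable sketch of merging sorted `(key, lsn)` streams that way:

```rust
use itertools::Itertools;

fn main() {
    // Two already-sorted runs of (key, lsn) pairs, e.g. from two delta files.
    let a = vec![(1, 10), (1, 30), (3, 5)];
    let b = vec![(1, 20), (2, 40)];

    // Tuples compare lexicographically: by key first, then by lsn within a key.
    let merged: Vec<_> = vec![a.into_iter(), b.into_iter()]
        .into_iter()
        .kmerge_by(|x, y| x < y)
        .collect();

    assert_eq!(merged, vec![(1, 10), (1, 20), (1, 30), (2, 40), (3, 5)]);
}
```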
@@ -3576,7 +3675,9 @@ impl Timeline {
|
||||
|| contains_hole
|
||||
{
|
||||
// ... if so, flush previous layer and prepare to write new one
|
||||
new_layers.push(writer.take().unwrap().finish(prev_key.unwrap().next())?);
|
||||
new_layers.push(Arc::new(
|
||||
writer.take().unwrap().finish(prev_key.unwrap().next())?,
|
||||
));
|
||||
writer = None;
|
||||
|
||||
if contains_hole {
|
||||
@@ -3614,7 +3715,7 @@ impl Timeline {
|
||||
prev_key = Some(key);
|
||||
}
|
||||
if let Some(writer) = writer {
|
||||
new_layers.push(writer.finish(prev_key.unwrap().next())?);
|
||||
new_layers.push(Arc::new(writer.finish(prev_key.unwrap().next())?));
|
||||
}
|
||||
|
||||
// Sync layers
|
||||
@@ -3723,6 +3824,11 @@ impl Timeline {
|
||||
let mut guard = self.layers.write().await;
|
||||
let mut new_layer_paths = HashMap::with_capacity(new_layers.len());
|
||||
|
||||
// In some rare cases, we may generate a file with exactly the same key range / LSN as before the compaction.
|
||||
// We should move to numbering the layer files instead of naming them using key range / LSN some day. But for
|
||||
// now, we just skip the file to avoid unintentional modification to files on the disk and in the layer map.
|
||||
let mut duplicated_layers = HashSet::new();
|
||||
|
||||
let mut insert_layers = Vec::new();
|
||||
let mut remove_layers = Vec::new();
|
||||
|
||||
@@ -3749,21 +3855,33 @@ impl Timeline {
|
||||
.add(metadata.len());
|
||||
|
||||
new_layer_paths.insert(new_delta_path, LayerFileMetadata::new(metadata.len()));
|
||||
let x: Arc<dyn PersistentLayer + 'static> = Arc::new(l);
|
||||
x.access_stats().record_residence_event(
|
||||
l.access_stats().record_residence_event(
|
||||
&guard,
|
||||
LayerResidenceStatus::Resident,
|
||||
LayerResidenceEventReason::LayerCreate,
|
||||
);
|
||||
insert_layers.push(x);
|
||||
let l = l as Arc<dyn PersistentLayer>;
|
||||
if guard.contains(&l) {
|
||||
duplicated_layers.insert(l.layer_desc().key());
|
||||
} else {
|
||||
if LayerMap::is_l0(l.layer_desc()) {
|
||||
return Err(CompactionError::Other(anyhow!("compaction generates a L0 layer file as output, which will cause infinite compaction.")));
|
||||
}
|
||||
insert_layers.push(l);
|
||||
}
|
||||
}
|
||||
|
||||
// Now that we have reshuffled the data to set of new delta layers, we can
|
||||
// delete the old ones
|
||||
let mut layer_names_to_delete = Vec::with_capacity(deltas_to_compact.len());
|
||||
for l in deltas_to_compact {
|
||||
layer_names_to_delete.push(l.filename());
|
||||
remove_layers.push(guard.get_from_desc(&l));
|
||||
for ldesc in deltas_to_compact {
|
||||
if duplicated_layers.contains(&ldesc.key()) {
|
||||
// skip duplicated layers, they will not be removed; we have already overwritten them
|
||||
// with new layers in the compaction phase 1.
|
||||
continue;
|
||||
}
|
||||
layer_names_to_delete.push(ldesc.filename());
|
||||
remove_layers.push(guard.get_from_desc(&ldesc));
|
||||
}
|
||||
|
||||
guard.finish_compact_l0(
|
||||
@@ -4522,6 +4640,7 @@ impl LocalLayerInfoForDiskUsageEviction {
|
||||
}
|
||||
|
||||
impl Timeline {
|
||||
/// Returns non-remote layers for eviction.
|
||||
pub(crate) async fn get_local_layers_for_disk_usage_eviction(&self) -> DiskUsageEvictionInfo {
|
||||
let guard = self.layers.read().await;
|
||||
let layers = guard.layer_map();
|
||||
@@ -4691,3 +4810,179 @@ pub fn compare_arced_layers<L: ?Sized>(left: &Arc<L>, right: &Arc<L>) -> bool {
|
||||
|
||||
left == right
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::sync::Arc;
|
||||
|
||||
use utils::{id::TimelineId, lsn::Lsn};
|
||||
|
||||
use crate::tenant::{harness::TenantHarness, storage_layer::PersistentLayer};
|
||||
|
||||
use super::{EvictionError, Timeline};
|
||||
|
||||
#[tokio::test]
|
||||
async fn two_layer_eviction_attempts_at_the_same_time() {
|
||||
let harness =
|
||||
TenantHarness::create("two_layer_eviction_attempts_at_the_same_time").unwrap();
|
||||
|
||||
let remote_storage = {
|
||||
// this is never used for anything, because of how the create_test_timeline works, but
|
||||
// it is with us in spirit and a Some.
|
||||
use remote_storage::{GenericRemoteStorage, RemoteStorageConfig, RemoteStorageKind};
|
||||
let path = harness.conf.workdir.join("localfs");
|
||||
std::fs::create_dir_all(&path).unwrap();
|
||||
let config = RemoteStorageConfig {
|
||||
max_concurrent_syncs: std::num::NonZeroUsize::new(2_000_000).unwrap(),
|
||||
max_sync_errors: std::num::NonZeroU32::new(3_000_000).unwrap(),
|
||||
storage: RemoteStorageKind::LocalFs(path),
|
||||
};
|
||||
GenericRemoteStorage::from_config(&config).unwrap()
|
||||
};
|
||||
|
||||
let ctx = any_context();
|
||||
let tenant = harness.try_load(&ctx, Some(remote_storage)).await.unwrap();
|
||||
let timeline = tenant
|
||||
.create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let rc = timeline
|
||||
.remote_client
|
||||
.clone()
|
||||
.expect("just configured this");
|
||||
|
||||
let layer = find_some_layer(&timeline).await;
|
||||
|
||||
let cancel = tokio_util::sync::CancellationToken::new();
|
||||
let batch = [layer];
|
||||
|
||||
let first = {
|
||||
let cancel = cancel.clone();
|
||||
async {
|
||||
timeline
|
||||
.evict_layer_batch(&rc, &batch, cancel)
|
||||
.await
|
||||
.unwrap()
|
||||
}
|
||||
};
|
||||
let second = async {
|
||||
timeline
|
||||
.evict_layer_batch(&rc, &batch, cancel)
|
||||
.await
|
||||
.unwrap()
|
||||
};
|
||||
|
||||
let (first, second) = tokio::join!(first, second);
|
||||
|
||||
let (first, second) = (only_one(first), only_one(second));
|
||||
|
||||
match (first, second) {
|
||||
(Ok(()), Err(EvictionError::FileNotFound))
|
||||
| (Err(EvictionError::FileNotFound), Ok(())) => {
|
||||
// one of the evictions gets to do it,
|
||||
// other one gets FileNotFound. all is good.
|
||||
}
|
||||
other => unreachable!("unexpected {:?}", other),
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn layer_eviction_aba_fails() {
|
||||
let harness = TenantHarness::create("layer_eviction_aba_fails").unwrap();
|
||||
|
||||
let remote_storage = {
|
||||
// this is never used for anything, because of how the create_test_timeline works, but
|
||||
// it is with us in spirit and a Some.
|
||||
use remote_storage::{GenericRemoteStorage, RemoteStorageConfig, RemoteStorageKind};
|
||||
let path = harness.conf.workdir.join("localfs");
|
||||
std::fs::create_dir_all(&path).unwrap();
|
||||
let config = RemoteStorageConfig {
|
||||
max_concurrent_syncs: std::num::NonZeroUsize::new(2_000_000).unwrap(),
|
||||
max_sync_errors: std::num::NonZeroU32::new(3_000_000).unwrap(),
|
||||
storage: RemoteStorageKind::LocalFs(path),
|
||||
};
|
||||
GenericRemoteStorage::from_config(&config).unwrap()
|
||||
};
|
||||
|
||||
let ctx = any_context();
|
||||
let tenant = harness.try_load(&ctx, Some(remote_storage)).await.unwrap();
|
||||
let timeline = tenant
|
||||
.create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let _e = tracing::info_span!("foobar", tenant_id = %tenant.tenant_id, timeline_id = %timeline.timeline_id).entered();
|
||||
|
||||
let rc = timeline.remote_client.clone().unwrap();
|
||||
|
||||
// TenantHarness allows uploads to happen given GenericRemoteStorage is configured
|
||||
let layer = find_some_layer(&timeline).await;
|
||||
|
||||
let cancel = tokio_util::sync::CancellationToken::new();
|
||||
let batch = [layer];
|
||||
|
||||
let first = {
|
||||
let cancel = cancel.clone();
|
||||
async {
|
||||
timeline
|
||||
.evict_layer_batch(&rc, &batch, cancel)
|
||||
.await
|
||||
.unwrap()
|
||||
}
|
||||
};
|
||||
|
||||
// lets imagine this is stuck somehow, still referencing the original `Arc<dyn PersistentLayer>`
|
||||
let second = {
|
||||
let cancel = cancel.clone();
|
||||
async {
|
||||
timeline
|
||||
.evict_layer_batch(&rc, &batch, cancel)
|
||||
.await
|
||||
.unwrap()
|
||||
}
|
||||
};
|
||||
|
||||
// while it's stuck, we evict and end up redownloading it
|
||||
only_one(first.await).expect("eviction succeeded");
|
||||
|
||||
let layer = find_some_layer(&timeline).await;
|
||||
let layer = layer.downcast_remote_layer().unwrap();
|
||||
timeline.download_remote_layer(layer).await.unwrap();
|
||||
|
||||
let res = only_one(second.await);
|
||||
|
||||
assert!(
|
||||
matches!(res, Err(EvictionError::LayerNotFound(_))),
|
||||
"{res:?}"
|
||||
);
|
||||
|
||||
// no more specific asserting, outside of preconds this is the only valid replacement
|
||||
// failure
|
||||
}
|
||||
|
||||
fn any_context() -> crate::context::RequestContext {
|
||||
use crate::context::*;
|
||||
use crate::task_mgr::*;
|
||||
RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error)
|
||||
}
|
||||
|
||||
fn only_one<T>(mut input: Vec<Option<T>>) -> T {
|
||||
assert_eq!(1, input.len());
|
||||
input
|
||||
.pop()
|
||||
.expect("length just checked")
|
||||
.expect("no cancellation")
|
||||
}
|
||||
|
||||
async fn find_some_layer(timeline: &Timeline) -> Arc<dyn PersistentLayer> {
|
||||
let layers = timeline.layers.read().await;
|
||||
let desc = layers
|
||||
.layer_map()
|
||||
.iter_historic_layers()
|
||||
.next()
|
||||
.expect("must find one layer to evict");
|
||||
|
||||
layers.get_from_desc(&desc)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -30,6 +30,7 @@ use crate::{
|
||||
tenant::{
|
||||
config::{EvictionPolicy, EvictionPolicyLayerAccessThreshold},
|
||||
storage_layer::PersistentLayer,
|
||||
timeline::EvictionError,
|
||||
LogicalSizeCalculationCause, Tenant,
|
||||
},
|
||||
};
|
||||
@@ -100,11 +101,11 @@ impl Timeline {
|
||||
match cf {
|
||||
ControlFlow::Break(()) => break,
|
||||
ControlFlow::Continue(sleep_until) => {
|
||||
tokio::select! {
|
||||
_ = cancel.cancelled() => {
|
||||
break;
|
||||
}
|
||||
_ = tokio::time::sleep_until(sleep_until) => { }
|
||||
if tokio::time::timeout_at(sleep_until, cancel.cancelled())
|
||||
.await
|
||||
.is_ok()
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -270,20 +271,22 @@ impl Timeline {
                None => {
                    stats.skipped_for_shutdown += 1;
                }
                Some(Ok(true)) => {
                    debug!("evicted layer {l:?}");
                Some(Ok(())) => {
                    stats.evicted += 1;
                }
                Some(Ok(false)) => {
                    debug!("layer is not evictable: {l:?}");
                Some(Err(EvictionError::CannotEvictRemoteLayer)) => {
                    stats.not_evictable += 1;
                }
                Some(Err(e)) => {
                    // This variant is the case where an unexpected error happened during eviction.
                    // Expected errors that result in non-eviction are `Some(Ok(false))`.
                    // So, dump Debug here to gather as much info as possible in this rare case.
                    warn!("failed to evict layer {l:?}: {e:?}");
                    stats.errors += 1;
                Some(Err(EvictionError::FileNotFound)) => {
                    // compaction/gc removed the file while we were waiting on layer_removal_cs
                    stats.not_evictable += 1;
                }
                Some(Err(
                    e @ EvictionError::LayerNotFound(_) | e @ EvictionError::StatFailed(_),
                )) => {
                    let e = utils::error::report_compact_sources(&e);
                    warn!(layer = %l, "failed to evict layer: {e}");
                    stats.not_evictable += 1;
                }
            }
        }
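`utils::error::report_compact_sources` is a neon helper that renders an error together with its `source()` chain on one line, replacing the multi-line `{:?}` dump the old arm used. A rough sketch of what such a formatter can look like; this is an assumption about its shape, not the actual implementation:

```rust
use std::error::Error;
use std::fmt::{self, Write as _};

#[derive(Debug)]
struct EvictFailed(std::io::Error);

impl fmt::Display for EvictFailed {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "failed to evict layer")
    }
}

impl Error for EvictFailed {
    fn source(&self) -> Option<&(dyn Error + 'static)> {
        Some(&self.0)
    }
}

/// Render an error and its source chain as "outer: inner: ...".
fn report_compact(err: &dyn Error) -> String {
    let mut out = err.to_string();
    let mut source = err.source();
    while let Some(cause) = source {
        write!(out, ": {cause}").expect("writing to a String cannot fail");
        source = cause.source();
    }
    out
}

fn main() {
    let err = EvictFailed(std::io::Error::new(
        std::io::ErrorKind::NotFound,
        "layer file missing",
    ));
    // Prints: failed to evict layer: layer file missing
    println!("{}", report_compact(&err));
}
```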
@@ -194,10 +194,23 @@ impl LayerManager {
|
||||
updates.flush();
|
||||
}
|
||||
|
||||
/// Insert into the layer map when a new delta layer is created, called from `create_delta_layer`.
|
||||
pub fn track_new_l0_delta_layer(&mut self, delta_layer: Arc<DeltaLayer>) {
|
||||
/// Flush a frozen layer and add the written delta layer to the layer map.
|
||||
pub fn finish_flush_l0_layer(
|
||||
&mut self,
|
||||
delta_layer: Option<DeltaLayer>,
|
||||
frozen_layer_for_check: &Arc<InMemoryLayer>,
|
||||
) {
|
||||
let l = self.layer_map.frozen_layers.pop_front();
|
||||
let mut updates = self.layer_map.batch_update();
|
||||
Self::insert_historic_layer(delta_layer, &mut updates, &mut self.layer_fmgr);
|
||||
|
||||
// Only one thread may call this function at a time (for this
|
||||
// timeline). If two threads tried to flush the same frozen
|
||||
// layer to disk at the same time, that would not work.
|
||||
assert!(compare_arced_layers(&l.unwrap(), frozen_layer_for_check));
|
||||
|
||||
if let Some(delta_layer) = delta_layer {
|
||||
Self::insert_historic_layer(Arc::new(delta_layer), &mut updates, &mut self.layer_fmgr);
|
||||
}
|
||||
updates.flush();
|
||||
}
|
||||
|
||||
@@ -295,6 +308,10 @@ impl LayerManager {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) fn contains(&self, layer: &Arc<dyn PersistentLayer>) -> bool {
|
||||
self.layer_fmgr.contains(layer)
|
||||
}
|
||||
}
|
||||
|
||||
pub struct LayerFileManager<T: AsLayerDesc + ?Sized = dyn PersistentLayer>(
|
||||
@@ -319,6 +336,10 @@ impl<T: AsLayerDesc + ?Sized> LayerFileManager<T> {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn contains(&self, layer: &Arc<T>) -> bool {
|
||||
self.0.contains_key(&layer.layer_desc().key())
|
||||
}
|
||||
|
||||
pub(crate) fn new() -> Self {
|
||||
Self(HashMap::new())
|
||||
}
|
||||
|
||||
@@ -2,13 +2,9 @@ use std::{collections::hash_map::Entry, fs, path::PathBuf, sync::Arc};
|
||||
|
||||
use anyhow::Context;
|
||||
use tracing::{error, info, info_span, warn};
|
||||
use utils::{crashsafe, id::TimelineId, lsn::Lsn};
|
||||
use utils::{crashsafe, fs_ext, id::TimelineId, lsn::Lsn};
|
||||
|
||||
use crate::{
|
||||
context::RequestContext,
|
||||
import_datadir,
|
||||
tenant::{ignore_absent_files, Tenant},
|
||||
};
|
||||
use crate::{context::RequestContext, import_datadir, tenant::Tenant};
|
||||
|
||||
use super::Timeline;
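The hunks below swap the old `tenant::ignore_absent_files` import for `utils::fs_ext::ignore_absent_files` when removing the timeline directory and its uninit mark. A sketch of what a helper like that typically does, treating a missing file as success so removals stay idempotent (this is an assumption about its shape, not the real function):

```rust
use std::io;

/// Run a filesystem operation and treat "not found" as success.
fn ignore_absent_files(op: impl FnOnce() -> io::Result<()>) -> io::Result<()> {
    match op() {
        Err(e) if e.kind() == io::ErrorKind::NotFound => Ok(()),
        other => other,
    }
}

fn main() -> io::Result<()> {
    // Removing a directory that may already be gone becomes a no-op instead of an error.
    ignore_absent_files(|| std::fs::remove_dir_all("/tmp/neon-demo-missing-dir"))
}
```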
|
||||
|
||||
@@ -141,7 +137,7 @@ impl Drop for UninitializedTimeline<'_> {
|
||||
|
||||
pub(crate) fn cleanup_timeline_directory(uninit_mark: TimelineUninitMark) {
|
||||
let timeline_path = &uninit_mark.timeline_path;
|
||||
match ignore_absent_files(|| fs::remove_dir_all(timeline_path)) {
|
||||
match fs_ext::ignore_absent_files(|| fs::remove_dir_all(timeline_path)) {
|
||||
Ok(()) => {
|
||||
info!("Timeline dir {timeline_path:?} removed successfully, removing the uninit mark")
|
||||
}
|
||||
@@ -185,7 +181,7 @@ impl TimelineUninitMark {
|
||||
let uninit_mark_parent = uninit_mark_file
|
||||
.parent()
|
||||
.with_context(|| format!("Uninit mark file {uninit_mark_file:?} has no parent"))?;
|
||||
ignore_absent_files(|| fs::remove_file(uninit_mark_file)).with_context(|| {
|
||||
fs_ext::ignore_absent_files(|| fs::remove_file(uninit_mark_file)).with_context(|| {
|
||||
format!("Failed to remove uninit mark file at path {uninit_mark_file:?}")
|
||||
})?;
|
||||
crashsafe::fsync(uninit_mark_parent).context("Failed to fsync uninit mark parent")?;
|
||||
|
||||
@@ -1123,7 +1123,7 @@ mod tests {
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn lsn_wal_over_threshhold_current_candidate() -> anyhow::Result<()> {
|
||||
async fn lsn_wal_over_threshold_current_candidate() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("lsn_wal_over_threshcurrent_candidate")?;
|
||||
let mut state = dummy_state(&harness).await;
|
||||
let current_lsn = Lsn(100_000).align();
|
||||
@@ -1189,8 +1189,8 @@ mod tests {
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn timeout_connection_threshhold_current_candidate() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("timeout_connection_threshhold_current_candidate")?;
|
||||
async fn timeout_connection_threshold_current_candidate() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("timeout_connection_threshold_current_candidate")?;
|
||||
let mut state = dummy_state(&harness).await;
|
||||
let current_lsn = Lsn(100_000).align();
|
||||
let now = Utc::now().naive_utc();
|
||||
@@ -1252,8 +1252,8 @@ mod tests {
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn timeout_wal_over_threshhold_current_candidate() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("timeout_wal_over_threshhold_current_candidate")?;
|
||||
async fn timeout_wal_over_threshold_current_candidate() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("timeout_wal_over_threshold_current_candidate")?;
|
||||
let mut state = dummy_state(&harness).await;
|
||||
let current_lsn = Lsn(100_000).align();
|
||||
let new_lsn = Lsn(100_100).align();
|
||||
|
||||
@@ -149,12 +149,10 @@ impl OpenFiles {
        // old file.
        //
        if let Some(old_file) = slot_guard.file.take() {
            // We do not have information about tenant_id/timeline_id of evicted file.
            // It is possible to store path together with file or use filepath crate,
            // but as far as close() is not expected to be fast, it is not so critical to gather
            // precise per-tenant statistic here.
            // the normal path of dropping VirtualFile uses "close", use "close-by-replace" here to
            // distinguish the two.
            STORAGE_IO_TIME
                .with_label_values(&["close", "-", "-"])
                .with_label_values(&["close-by-replace"])
                .observe_closure_duration(|| drop(old_file));
        }
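This and the following `virtual_file.rs` hunks drop the per-tenant/per-timeline labels from `STORAGE_IO_TIME`, keeping only the operation name (with a dedicated `close-by-replace` value for evictions), which keeps the metric's label cardinality bounded. A hedged sketch of such a histogram with the `prometheus` crate; the metric name and helper are made up:

```rust
use once_cell::sync::Lazy;
use prometheus::{register_histogram_vec, HistogramVec};

// One time series per operation, instead of one per (operation, tenant, timeline).
static STORAGE_IO_SECONDS: Lazy<HistogramVec> = Lazy::new(|| {
    register_histogram_vec!(
        "demo_storage_io_seconds",
        "Time spent in storage IO operations",
        &["operation"]
    )
    .expect("failed to register metric")
});

fn timed_close(file: std::fs::File) {
    let timer = STORAGE_IO_SECONDS
        .with_label_values(&["close"])
        .start_timer();
    drop(file); // closing the fd is what we are actually measuring
    timer.observe_duration();
}

fn main() -> std::io::Result<()> {
    let file = std::fs::File::open("/dev/null")?;
    timed_close(file);
    Ok(())
}
```

The diff's own comments make the same trade-off explicit: per-tenant close statistics were judged not worth the extra time series.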
|
||||
|
||||
@@ -208,7 +206,7 @@ impl VirtualFile {
|
||||
}
|
||||
let (handle, mut slot_guard) = get_open_files().find_victim_slot();
|
||||
let file = STORAGE_IO_TIME
|
||||
.with_label_values(&["open", &tenant_id, &timeline_id])
|
||||
.with_label_values(&["open"])
|
||||
.observe_closure_duration(|| open_options.open(path))?;
|
||||
|
||||
// Strip all options other than read and write.
|
||||
@@ -271,7 +269,7 @@ impl VirtualFile {
|
||||
// Found a cached file descriptor.
|
||||
slot.recently_used.store(true, Ordering::Relaxed);
|
||||
return Ok(STORAGE_IO_TIME
|
||||
.with_label_values(&[op, &self.tenant_id, &self.timeline_id])
|
||||
.with_label_values(&[op])
|
||||
.observe_closure_duration(|| func(file)));
|
||||
}
|
||||
}
|
||||
@@ -298,12 +296,12 @@ impl VirtualFile {
|
||||
|
||||
// Open the physical file
|
||||
let file = STORAGE_IO_TIME
|
||||
.with_label_values(&["open", &self.tenant_id, &self.timeline_id])
|
||||
.with_label_values(&["open"])
|
||||
.observe_closure_duration(|| self.open_options.open(&self.path))?;
|
||||
|
||||
// Perform the requested operation on it
|
||||
let result = STORAGE_IO_TIME
|
||||
.with_label_values(&[op, &self.tenant_id, &self.timeline_id])
|
||||
.with_label_values(&[op])
|
||||
.observe_closure_duration(|| func(&file));
|
||||
|
||||
// Store the File in the slot and update the handle in the VirtualFile
|
||||
@@ -333,13 +331,11 @@ impl Drop for VirtualFile {
|
||||
let mut slot_guard = slot.inner.write().unwrap();
|
||||
if slot_guard.tag == handle.tag {
|
||||
slot.recently_used.store(false, Ordering::Relaxed);
|
||||
// Unlike files evicted by replacement algorithm, here
|
||||
// we group close time by tenant_id/timeline_id.
|
||||
// At allows to compare number/time of "normal" file closes
|
||||
// with file eviction.
|
||||
// there is also operation "close-by-replace" for closes done on eviction for
|
||||
// comparison.
|
||||
STORAGE_IO_TIME
|
||||
.with_label_values(&["close", &self.tenant_id, &self.timeline_id])
|
||||
.observe_closure_duration(|| slot_guard.file.take());
|
||||
.with_label_values(&["close"])
|
||||
.observe_closure_duration(|| drop(slot_guard.file.take()));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -360,7 +360,6 @@ impl XlXactParsedRecord {
}
}
let mut xnodes = Vec::<RelFileNode>::new();
- // In v16 this XACT_XINFO_HAS_RELFILENODES is renamed to XACT_XINFO_HAS_RELFILELOCATORS
if xinfo & pg_constants::XACT_XINFO_HAS_RELFILENODES != 0 {
let nrels = buf.get_i32_le();
for _i in 0..nrels {

@@ -1,4 +1,4 @@
- comment = 'hnsw index'
+ comment = '** Deprecated ** Please use pg_embedding instead'
default_version = '0.1.0'
module_pathname = '$libdir/hnsw'
relocatable = true

@@ -25,11 +25,7 @@
#include "pagestore_client.h"
#include "access/parallel.h"
#include "postmaster/bgworker.h"
- #if PG_VERSION_NUM >= 160000
- #include "storage/relfilelocator.h"
- #else
#include "storage/relfilenode.h"
- #endif
#include "storage/buf_internals.h"
#include "storage/latch.h"
#include "storage/ipc.h"
@@ -43,7 +39,6 @@
#include "postmaster/bgworker.h"
#include "postmaster/interrupt.h"


/*
* Local file cache is used to temporary store relations pages in local file system.
* All blocks of all relations are stored inside one file and addressed using shared hash map.
@@ -365,12 +360,9 @@ lfc_cache_contains(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno)
|
||||
if (lfc_size_limit == 0) /* fast exit if file cache is disabled */
|
||||
return false;
|
||||
|
||||
#if PG_VERSION_NUM >= 160000
|
||||
InitBufferTag(&tag, &rnode, forkNum, (blkno & ~(BLOCKS_PER_CHUNK-1)));
|
||||
#else
|
||||
INIT_BUFFERTAG(tag, rnode, forkNum, (blkno & ~(BLOCKS_PER_CHUNK-1)));
|
||||
#endif
|
||||
|
||||
tag.rnode = rnode;
|
||||
tag.forkNum = forkNum;
|
||||
tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK-1);
|
||||
hash = get_hash_value(lfc_hash, &tag);
|
||||
|
||||
LWLockAcquire(lfc_lock, LW_SHARED);
|
||||
@@ -395,11 +387,7 @@ lfc_evict(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno)
|
||||
if (lfc_size_limit == 0) /* fast exit if file cache is disabled */
|
||||
return;
|
||||
|
||||
#if PG_VERSION_NUM >= 160000
|
||||
InitBufferTag(&tag, &rnode, forkNum, (blkno & ~(BLOCKS_PER_CHUNK-1)));
|
||||
#else
|
||||
INIT_BUFFERTAG(tag, rnode, forkNum, (blkno & ~(BLOCKS_PER_CHUNK-1)));
|
||||
#endif
|
||||
|
||||
hash = get_hash_value(lfc_hash, &tag);
|
||||
|
||||
@@ -469,12 +457,10 @@ lfc_read(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
|
||||
|
||||
if (lfc_size_limit == 0) /* fast exit if file cache is disabled */
|
||||
return false;
|
||||
#if PG_VERSION_NUM >= 160000
|
||||
InitBufferTag(&tag, &rnode, forkNum, (blkno & ~(BLOCKS_PER_CHUNK-1)));
|
||||
#else
|
||||
INIT_BUFFERTAG(tag, rnode, forkNum, (blkno & ~(BLOCKS_PER_CHUNK-1)));
|
||||
#endif
|
||||
|
||||
tag.rnode = rnode;
|
||||
tag.forkNum = forkNum;
|
||||
tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK-1);
|
||||
hash = get_hash_value(lfc_hash, &tag);
|
||||
|
||||
LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
|
||||
@@ -540,12 +526,9 @@ lfc_write(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
|
||||
if (lfc_size_limit == 0) /* fast exit if file cache is disabled */
|
||||
return;
|
||||
|
||||
#if PG_VERSION_NUM >= 160000
|
||||
InitBufferTag(&tag, &rnode, forkNum, (blkno & ~(BLOCKS_PER_CHUNK-1)));
|
||||
#else
|
||||
INIT_BUFFERTAG(tag, rnode, forkNum, (blkno & ~(BLOCKS_PER_CHUNK-1)));
|
||||
#endif
|
||||
|
||||
tag.rnode = rnode;
|
||||
tag.forkNum = forkNum;
|
||||
tag.blockNum = blkno & ~(BLOCKS_PER_CHUNK-1);
|
||||
hash = get_hash_value(lfc_hash, &tag);
|
||||
|
||||
LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
|
||||
@@ -739,16 +722,9 @@ local_cache_pages(PG_FUNCTION_ARGS)
|
||||
if (entry->bitmap[i >> 5] & (1 << (i & 31)))
|
||||
{
|
||||
fctx->record[n_pages].pageoffs = entry->offset*BLOCKS_PER_CHUNK + i;
|
||||
|
||||
#if PG_VERSION_NUM >= 160000
|
||||
fctx->record[n_pages].relfilenode = entry->key.relNumber;
|
||||
fctx->record[n_pages].reltablespace = entry->key.spcOid;
|
||||
fctx->record[n_pages].reldatabase = entry->key.dbOid;
|
||||
#else
|
||||
fctx->record[n_pages].relfilenode = entry->key.rnode.relNode;
|
||||
fctx->record[n_pages].reltablespace = entry->key.rnode.spcNode;
|
||||
fctx->record[n_pages].reldatabase = entry->key.rnode.dbNode;
|
||||
#endif
|
||||
fctx->record[n_pages].forknum = entry->key.forkNum;
|
||||
fctx->record[n_pages].blocknum = entry->key.blockNum + i;
|
||||
fctx->record[n_pages].accesscount = entry->access_count;
|
||||
|
||||
@@ -292,7 +292,7 @@ walprop_async_read(WalProposerConn *conn, char **buf, int *amount)
/*
* The docs for PQgetCopyData list the return values as: 0 if the copy is
* still in progress, but no "complete row" is available -1 if the copy is
- * done -2 if an error occured (> 0) if it was successful; that value is
+ * done -2 if an error occurred (> 0) if it was successful; that value is
* the amount transferred.
*
* The protocol we use between walproposer and safekeeper means that we
@@ -353,7 +353,7 @@ walprop_async_write(WalProposerConn *conn, void const *buf, size_t size)
/*
* The docs for PQputcopyData list the return values as: 1 if the data was
* queued, 0 if it was not queued because of full buffers, or -1 if an
- * error occured
+ * error occurred
*/
result = PQputCopyData(conn->pg_conn, buf, size);


@@ -16,11 +16,7 @@
|
||||
#include "postgres.h"
|
||||
|
||||
#include "access/xlogdefs.h"
|
||||
#if PG_VERSION_NUM >= 160000
|
||||
#include "storage/relfilelocator.h"
|
||||
#else
|
||||
#include "storage/relfilenode.h"
|
||||
#endif
|
||||
#include "storage/block.h"
|
||||
#include "storage/smgr.h"
|
||||
#include "lib/stringinfo.h"
|
||||
@@ -29,34 +25,6 @@
|
||||
|
||||
#include "pg_config.h"
|
||||
|
||||
// This is a hack to avoid too many ifdefs in the function definitions.
|
||||
#if PG_VERSION_NUM >= 160000
|
||||
typedef RelFileLocator RelFileNode;
|
||||
typedef RelFileLocatorBackend RelFileNodeBackend;
|
||||
#define RelFileNodeBackendIsTemp RelFileLocatorBackendIsTemp
|
||||
#endif
|
||||
|
||||
#if PG_VERSION_NUM >= 160000
|
||||
#define RelnGetRnode(reln) (reln->smgr_rlocator.locator)
|
||||
#define RnodeGetSpcOid(rnode) (rnode.spcOid)
|
||||
#define RnodeGetDbOid(rnode) (rnode.dbOid)
|
||||
#define RnodeGetRelNumber(rnode) (rnode.relNumber)
|
||||
|
||||
#define BufTagGetRnode(tag) (BufTagGetRelFileLocator(&tag))
|
||||
#else
|
||||
#define RelnGetRnode(reln) (reln->smgr_rnode.node)
|
||||
#define RnodeGetSpcOid(rnode) (rnode.spcNode)
|
||||
#define RnodeGetDbOid(rnode) (rnode.dbNode)
|
||||
#define RnodeGetRelNumber(rnode) (rnode.relNode)
|
||||
|
||||
#define BufTagGetRnode(tag) (tag.rnode)
|
||||
|
||||
#endif
|
||||
|
||||
#define RelnGetSpcOid(reln) (RnodeGetRelNumber(RelnGetRnode(reln)))
|
||||
#define RelnGetDbOid(reln) (RnodeGetDbOid(RelnGetRnode(reln)))
|
||||
#define RelnGetRelNumber(reln) (RnodeGetRelNumber(RelnGetRnode(reln)))
|
||||
|
||||
typedef enum
|
||||
{
|
||||
/* pagestore_client -> pagestore */
|
||||
@@ -117,7 +85,7 @@ typedef struct
|
||||
typedef struct
|
||||
{
|
||||
NeonRequest req;
|
||||
Oid dbOid;
|
||||
Oid dbNode;
|
||||
} NeonDbSizeRequest;
|
||||
|
||||
typedef struct
|
||||
|
||||
@@ -58,11 +58,7 @@
|
||||
#include "postmaster/autovacuum.h"
|
||||
#include "replication/walsender.h"
|
||||
#include "storage/bufmgr.h"
|
||||
#if PG_VERSION_NUM >= 160000
|
||||
#include "storage/relfilelocator.h"
|
||||
#else
|
||||
#include "storage/relfilenode.h"
|
||||
#endif
|
||||
#include "storage/buf_internals.h"
|
||||
#include "storage/smgr.h"
|
||||
#include "storage/md.h"
|
||||
@@ -74,8 +70,6 @@
|
||||
#include "access/xlogrecovery.h"
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
/*
|
||||
* If DEBUG_COMPARE_LOCAL is defined, we pass through all the SMGR API
|
||||
* calls to md.c, and *also* do the calls to the Page Server. On every
|
||||
@@ -92,10 +86,7 @@
|
||||
static char *hexdump_page(char *page);
|
||||
#endif
|
||||
|
||||
|
||||
#define IS_LOCAL_REL(reln) (RelnGetDbOid(reln) != 0 && RelnGetRelNumber(reln) > FirstNormalObjectId)
|
||||
|
||||
|
||||
#define IS_LOCAL_REL(reln) (reln->smgr_rnode.node.dbNode != 0 && reln->smgr_rnode.node.relNode > FirstNormalObjectId)
|
||||
|
||||
const int SmgrTrace = DEBUG5;
|
||||
|
||||
@@ -193,13 +184,7 @@ typedef struct PrfHashEntry {
|
||||
sizeof(BufferTag) \
|
||||
)
|
||||
|
||||
|
||||
#if PG_VERSION_NUM >= 160000
|
||||
#define SH_EQUAL(tb, a, b) (BufferTagsEqual(&((a)->buftag),&((b)->buftag)))
|
||||
#else
|
||||
#define SH_EQUAL(tb, a, b) (BUFFERTAGS_EQUAL((a)->buftag, (b)->buftag))
|
||||
#endif
|
||||
|
||||
#define SH_SCOPE static inline
|
||||
#define SH_DEFINE
|
||||
#define SH_DECLARE
|
||||
@@ -649,7 +634,7 @@ prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force
|
||||
.req.tag = T_NeonGetPageRequest,
|
||||
.req.latest = false,
|
||||
.req.lsn = 0,
|
||||
.rnode = BufTagGetRnode(slot->buftag),
|
||||
.rnode = slot->buftag.rnode,
|
||||
.forknum = slot->buftag.forkNum,
|
||||
.blkno = slot->buftag.blockNum,
|
||||
};
|
||||
@@ -664,7 +649,7 @@ prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force
|
||||
{
|
||||
XLogRecPtr lsn = neon_get_request_lsn(
|
||||
&request.req.latest,
|
||||
BufTagGetRnode(slot->buftag),
|
||||
slot->buftag.rnode,
|
||||
slot->buftag.forkNum,
|
||||
slot->buftag.blockNum
|
||||
);
|
||||
@@ -744,11 +729,8 @@ prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_ls
|
||||
Assert(slot->status != PRFS_UNUSED);
|
||||
Assert(MyPState->ring_last <= ring_index &&
|
||||
ring_index < MyPState->ring_unused);
|
||||
#if PG_VERSION_NUM >= 160000
|
||||
Assert(BufferTagsEqual(&slot->buftag, &tag));
|
||||
#else
|
||||
Assert(BUFFERTAGS_EQUAL(slot->buftag, tag));
|
||||
#endif
|
||||
|
||||
/*
|
||||
* If we want a specific lsn, we do not accept requests that were made
|
||||
* with a potentially different LSN.
|
||||
@@ -911,9 +893,9 @@ nm_pack_request(NeonRequest * msg)
|
||||
|
||||
pq_sendbyte(&s, msg_req->req.latest);
|
||||
pq_sendint64(&s, msg_req->req.lsn);
|
||||
pq_sendint32(&s, RnodeGetSpcOid(msg_req->rnode));
|
||||
pq_sendint32(&s, RnodeGetDbOid(msg_req->rnode));
|
||||
pq_sendint32(&s, RnodeGetRelNumber(msg_req->rnode));
|
||||
pq_sendint32(&s, msg_req->rnode.spcNode);
|
||||
pq_sendint32(&s, msg_req->rnode.dbNode);
|
||||
pq_sendint32(&s, msg_req->rnode.relNode);
|
||||
pq_sendbyte(&s, msg_req->forknum);
|
||||
|
||||
break;
|
||||
@@ -924,9 +906,9 @@ nm_pack_request(NeonRequest * msg)
|
||||
|
||||
pq_sendbyte(&s, msg_req->req.latest);
|
||||
pq_sendint64(&s, msg_req->req.lsn);
|
||||
pq_sendint32(&s, RnodeGetSpcOid(msg_req->rnode));
|
||||
pq_sendint32(&s, RnodeGetDbOid(msg_req->rnode));
|
||||
pq_sendint32(&s, RnodeGetRelNumber(msg_req->rnode));
|
||||
pq_sendint32(&s, msg_req->rnode.spcNode);
|
||||
pq_sendint32(&s, msg_req->rnode.dbNode);
|
||||
pq_sendint32(&s, msg_req->rnode.relNode);
|
||||
pq_sendbyte(&s, msg_req->forknum);
|
||||
|
||||
break;
|
||||
@@ -937,7 +919,7 @@ nm_pack_request(NeonRequest * msg)
|
||||
|
||||
pq_sendbyte(&s, msg_req->req.latest);
|
||||
pq_sendint64(&s, msg_req->req.lsn);
|
||||
pq_sendint32(&s, msg_req->dbOid);
|
||||
pq_sendint32(&s, msg_req->dbNode);
|
||||
|
||||
break;
|
||||
}
|
||||
@@ -947,9 +929,9 @@ nm_pack_request(NeonRequest * msg)
|
||||
|
||||
pq_sendbyte(&s, msg_req->req.latest);
|
||||
pq_sendint64(&s, msg_req->req.lsn);
|
||||
pq_sendint32(&s, RnodeGetSpcOid(msg_req->rnode));
|
||||
pq_sendint32(&s, RnodeGetDbOid(msg_req->rnode));
|
||||
pq_sendint32(&s, RnodeGetRelNumber(msg_req->rnode));
|
||||
pq_sendint32(&s, msg_req->rnode.spcNode);
|
||||
pq_sendint32(&s, msg_req->rnode.dbNode);
|
||||
pq_sendint32(&s, msg_req->rnode.relNode);
|
||||
pq_sendbyte(&s, msg_req->forknum);
|
||||
pq_sendint32(&s, msg_req->blkno);
|
||||
|
||||
@@ -1082,9 +1064,9 @@ nm_to_string(NeonMessage * msg)
|
||||
|
||||
appendStringInfoString(&s, "{\"type\": \"NeonExistsRequest\"");
|
||||
appendStringInfo(&s, ", \"rnode\": \"%u/%u/%u\"",
|
||||
RnodeGetSpcOid(msg_req->rnode),
|
||||
RnodeGetDbOid(msg_req->rnode),
|
||||
RnodeGetRelNumber(msg_req->rnode));
|
||||
msg_req->rnode.spcNode,
|
||||
msg_req->rnode.dbNode,
|
||||
msg_req->rnode.relNode);
|
||||
appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum);
|
||||
appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn));
|
||||
appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest);
|
||||
@@ -1098,9 +1080,9 @@ nm_to_string(NeonMessage * msg)
|
||||
|
||||
appendStringInfoString(&s, "{\"type\": \"NeonNblocksRequest\"");
|
||||
appendStringInfo(&s, ", \"rnode\": \"%u/%u/%u\"",
|
||||
RnodeGetSpcOid(msg_req->rnode),
|
||||
RnodeGetDbOid(msg_req->rnode),
|
||||
RnodeGetRelNumber(msg_req->rnode));
|
||||
msg_req->rnode.spcNode,
|
||||
msg_req->rnode.dbNode,
|
||||
msg_req->rnode.relNode);
|
||||
appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum);
|
||||
appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn));
|
||||
appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest);
|
||||
@@ -1114,9 +1096,9 @@ nm_to_string(NeonMessage * msg)
|
||||
|
||||
appendStringInfoString(&s, "{\"type\": \"NeonGetPageRequest\"");
|
||||
appendStringInfo(&s, ", \"rnode\": \"%u/%u/%u\"",
|
||||
RnodeGetSpcOid(msg_req->rnode),
|
||||
RnodeGetDbOid(msg_req->rnode),
|
||||
RnodeGetRelNumber(msg_req->rnode));
|
||||
msg_req->rnode.spcNode,
|
||||
msg_req->rnode.dbNode,
|
||||
msg_req->rnode.relNode);
|
||||
appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum);
|
||||
appendStringInfo(&s, ", \"blkno\": %u", msg_req->blkno);
|
||||
appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn));
|
||||
@@ -1129,7 +1111,7 @@ nm_to_string(NeonMessage * msg)
|
||||
NeonDbSizeRequest *msg_req = (NeonDbSizeRequest *) msg;
|
||||
|
||||
appendStringInfoString(&s, "{\"type\": \"NeonDbSizeRequest\"");
|
||||
appendStringInfo(&s, ", \"dbnode\": \"%u\"", msg_req->dbOid);
|
||||
appendStringInfo(&s, ", \"dbnode\": \"%u\"", msg_req->dbNode);
|
||||
appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn));
|
||||
appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest);
|
||||
appendStringInfoChar(&s, '}');
|
||||
@@ -1231,7 +1213,6 @@ static void
|
||||
neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool force)
|
||||
{
|
||||
XLogRecPtr lsn = PageGetLSN(buffer);
|
||||
RelFileNode rnode = RelnGetRnode(reln);
|
||||
|
||||
if (ShutdownRequestPending)
|
||||
return;
|
||||
@@ -1251,16 +1232,15 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, ch
|
||||
/* FSM is never WAL-logged and we don't care. */
|
||||
XLogRecPtr recptr;
|
||||
|
||||
|
||||
recptr = log_newpage_copy(&rnode, forknum, blocknum, buffer, false);
|
||||
recptr = log_newpage_copy(&reln->smgr_rnode.node, forknum, blocknum, buffer, false);
|
||||
XLogFlush(recptr);
|
||||
lsn = recptr;
|
||||
ereport(SmgrTrace,
|
||||
(errmsg("Page %u of relation %u/%u/%u.%u was force logged. Evicted at lsn=%X/%X",
|
||||
blocknum,
|
||||
RelnGetSpcOid(reln),
|
||||
RelnGetDbOid(reln),
|
||||
RelnGetRelNumber(reln),
|
||||
reln->smgr_rnode.node.spcNode,
|
||||
reln->smgr_rnode.node.dbNode,
|
||||
reln->smgr_rnode.node.relNode,
|
||||
forknum, LSN_FORMAT_ARGS(lsn))));
|
||||
}
|
||||
else if (lsn == InvalidXLogRecPtr)
|
||||
@@ -1288,9 +1268,9 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, ch
|
||||
ereport(SmgrTrace,
|
||||
(errmsg("Page %u of relation %u/%u/%u.%u is all-zeros",
|
||||
blocknum,
|
||||
RelnGetSpcOid(reln),
|
||||
RelnGetDbOid(reln),
|
||||
RelnGetRelNumber(reln),
|
||||
reln->smgr_rnode.node.spcNode,
|
||||
reln->smgr_rnode.node.dbNode,
|
||||
reln->smgr_rnode.node.relNode,
|
||||
forknum)));
|
||||
}
|
||||
else if (PageIsEmptyHeapPage(buffer))
|
||||
@@ -1298,9 +1278,9 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, ch
|
||||
ereport(SmgrTrace,
|
||||
(errmsg("Page %u of relation %u/%u/%u.%u is an empty heap page with no LSN",
|
||||
blocknum,
|
||||
RelnGetSpcOid(reln),
|
||||
RelnGetDbOid(reln),
|
||||
RelnGetRelNumber(reln),
|
||||
reln->smgr_rnode.node.spcNode,
|
||||
reln->smgr_rnode.node.dbNode,
|
||||
reln->smgr_rnode.node.relNode,
|
||||
forknum)));
|
||||
}
|
||||
else
|
||||
@@ -1308,9 +1288,9 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, ch
|
||||
ereport(PANIC,
|
||||
(errmsg("Page %u of relation %u/%u/%u.%u is evicted with zero LSN",
|
||||
blocknum,
|
||||
RelnGetSpcOid(reln),
|
||||
RelnGetDbOid(reln),
|
||||
RelnGetRelNumber(reln),
|
||||
reln->smgr_rnode.node.spcNode,
|
||||
reln->smgr_rnode.node.dbNode,
|
||||
reln->smgr_rnode.node.relNode,
|
||||
forknum)));
|
||||
}
|
||||
}
|
||||
@@ -1319,9 +1299,9 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, ch
|
||||
ereport(SmgrTrace,
|
||||
(errmsg("Page %u of relation %u/%u/%u.%u is already wal logged at lsn=%X/%X",
|
||||
blocknum,
|
||||
RelnGetSpcOid(reln),
|
||||
RelnGetDbOid(reln),
|
||||
RelnGetRelNumber(reln),
|
||||
reln->smgr_rnode.node.spcNode,
|
||||
reln->smgr_rnode.node.dbNode,
|
||||
reln->smgr_rnode.node.relNode,
|
||||
forknum, LSN_FORMAT_ARGS(lsn))));
|
||||
}
|
||||
|
||||
@@ -1329,7 +1309,7 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, ch
|
||||
* Remember the LSN on this page. When we read the page again, we must
|
||||
* read the same or newer version of it.
|
||||
*/
|
||||
SetLastWrittenLSNForBlock(lsn, rnode, forknum, blocknum);
|
||||
SetLastWrittenLSNForBlock(lsn, reln->smgr_rnode.node, forknum, blocknum);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -1479,7 +1459,6 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum)
|
||||
BlockNumber n_blocks;
|
||||
bool latest;
|
||||
XLogRecPtr request_lsn;
|
||||
RelFileNode rnode = RelnGetRnode(reln);
|
||||
|
||||
switch (reln->smgr_relpersistence)
|
||||
{
|
||||
@@ -1506,7 +1485,7 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum)
|
||||
elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
|
||||
}
|
||||
|
||||
if (get_cached_relsize(RelnGetRnode(reln), forkNum, &n_blocks))
|
||||
if (get_cached_relsize(reln->smgr_rnode.node, forkNum, &n_blocks))
|
||||
{
|
||||
return true;
|
||||
}
|
||||
@@ -1521,20 +1500,20 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum)
|
||||
*
|
||||
* For now, handle that special case here.
|
||||
*/
|
||||
if (RelnGetSpcOid(reln) == 0 &&
|
||||
RelnGetDbOid(reln) == 0 &&
|
||||
RelnGetRelNumber(reln) == 0)
|
||||
if (reln->smgr_rnode.node.spcNode == 0 &&
|
||||
reln->smgr_rnode.node.dbNode == 0 &&
|
||||
reln->smgr_rnode.node.relNode == 0)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
request_lsn = neon_get_request_lsn(&latest, rnode, forkNum, REL_METADATA_PSEUDO_BLOCKNO);
|
||||
request_lsn = neon_get_request_lsn(&latest, reln->smgr_rnode.node, forkNum, REL_METADATA_PSEUDO_BLOCKNO);
|
||||
{
|
||||
NeonExistsRequest request = {
|
||||
.req.tag = T_NeonExistsRequest,
|
||||
.req.latest = latest,
|
||||
.req.lsn = request_lsn,
|
||||
.rnode = rnode,
|
||||
.rnode = reln->smgr_rnode.node,
|
||||
.forknum = forkNum};
|
||||
|
||||
resp = page_server_request(&request);
|
||||
@@ -1550,9 +1529,9 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum)
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_IO_ERROR),
|
||||
errmsg("could not read relation existence of rel %u/%u/%u.%u from page server at lsn %X/%08X",
|
||||
RelnGetSpcOid(reln),
|
||||
RelnGetDbOid(reln),
|
||||
RelnGetRelNumber(reln),
|
||||
reln->smgr_rnode.node.spcNode,
|
||||
reln->smgr_rnode.node.dbNode,
|
||||
reln->smgr_rnode.node.relNode,
|
||||
forkNum,
|
||||
(uint32) (request_lsn >> 32), (uint32) request_lsn),
|
||||
errdetail("page server returned error: %s",
|
||||
@@ -1574,8 +1553,6 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum)
|
||||
void
|
||||
neon_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
|
||||
{
|
||||
RelFileNode rnode = RelnGetRnode(reln);
|
||||
|
||||
switch (reln->smgr_relpersistence)
|
||||
{
|
||||
case 0:
|
||||
@@ -1594,8 +1571,9 @@ neon_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
|
||||
}
|
||||
|
||||
elog(SmgrTrace, "Create relation %u/%u/%u.%u",
|
||||
RelnGetSpcOid(reln),
|
||||
RelnGetDbOid(reln), RelnGetRelNumber(reln),
|
||||
reln->smgr_rnode.node.spcNode,
|
||||
reln->smgr_rnode.node.dbNode,
|
||||
reln->smgr_rnode.node.relNode,
|
||||
forkNum);
|
||||
|
||||
/*
|
||||
@@ -1619,12 +1597,12 @@ neon_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
|
||||
*/
|
||||
if (isRedo)
|
||||
{
|
||||
update_cached_relsize(rnode, forkNum, 0);
|
||||
get_cached_relsize(rnode, forkNum,
|
||||
update_cached_relsize(reln->smgr_rnode.node, forkNum, 0);
|
||||
get_cached_relsize(reln->smgr_rnode.node, forkNum,
|
||||
&reln->smgr_cached_nblocks[forkNum]);
|
||||
}
|
||||
else
|
||||
set_cached_relsize(rnode, forkNum, 0);
|
||||
set_cached_relsize(reln->smgr_rnode.node, forkNum, 0);
|
||||
|
||||
#ifdef DEBUG_COMPARE_LOCAL
|
||||
if (IS_LOCAL_REL(reln))
|
||||
@@ -1661,12 +1639,7 @@ neon_unlink(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo)
|
||||
mdunlink(rnode, forkNum, isRedo);
|
||||
if (!RelFileNodeBackendIsTemp(rnode))
|
||||
{
|
||||
|
||||
#if PG_VERSION_NUM >= 160000
|
||||
forget_cached_relsize(rnode.locator, forkNum);
|
||||
#else
|
||||
forget_cached_relsize(rnode.node, forkNum);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1685,7 +1658,6 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
|
||||
{
|
||||
XLogRecPtr lsn;
|
||||
BlockNumber n_blocks = 0;
|
||||
RelFileNode rnode = RelnGetRnode(reln);
|
||||
|
||||
switch (reln->smgr_relpersistence)
|
||||
{
|
||||
@@ -1735,16 +1707,17 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
|
||||
neon_wallog_page(reln, forkNum, n_blocks++, buffer, true);
|
||||
|
||||
neon_wallog_page(reln, forkNum, blkno, buffer, false);
|
||||
set_cached_relsize(rnode, forkNum, blkno + 1);
|
||||
set_cached_relsize(reln->smgr_rnode.node, forkNum, blkno + 1);
|
||||
|
||||
lsn = PageGetLSN(buffer);
|
||||
elog(SmgrTrace, "smgrextend called for %u/%u/%u.%u blk %u, page LSN: %X/%08X",
|
||||
RelnGetSpcOid(reln),
|
||||
RelnGetDbOid(reln), RelnGetRelNumber(reln),
|
||||
reln->smgr_rnode.node.spcNode,
|
||||
reln->smgr_rnode.node.dbNode,
|
||||
reln->smgr_rnode.node.relNode,
|
||||
forkNum, blkno,
|
||||
(uint32) (lsn >> 32), (uint32) lsn);
|
||||
|
||||
lfc_write(rnode, forkNum, blkno, buffer);
|
||||
lfc_write(reln->smgr_rnode.node, forkNum, blkno, buffer);
|
||||
|
||||
#ifdef DEBUG_COMPARE_LOCAL
|
||||
if (IS_LOCAL_REL(reln))
|
||||
@@ -1759,9 +1732,9 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
|
||||
if (lsn == InvalidXLogRecPtr)
|
||||
{
|
||||
lsn = GetXLogInsertRecPtr();
|
||||
SetLastWrittenLSNForBlock(lsn, rnode, forkNum, blkno);
|
||||
SetLastWrittenLSNForBlock(lsn, reln->smgr_rnode.node, forkNum, blkno);
|
||||
}
|
||||
SetLastWrittenLSNForRelation(lsn, rnode, forkNum);
|
||||
SetLastWrittenLSNForRelation(lsn, reln->smgr_rnode.node, forkNum);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -1805,8 +1778,6 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
|
||||
BufferTag tag;
|
||||
uint64 ring_index PG_USED_FOR_ASSERTS_ONLY;
|
||||
|
||||
RelFileNode rnode = RelnGetRnode(reln);
|
||||
|
||||
switch (reln->smgr_relpersistence)
|
||||
{
|
||||
case 0: /* probably shouldn't happen, but ignore it */
|
||||
@@ -1821,18 +1792,15 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
|
||||
elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
|
||||
}
|
||||
|
||||
if (lfc_cache_contains(rnode, forknum, blocknum))
|
||||
if (lfc_cache_contains(reln->smgr_rnode.node, forknum, blocknum))
|
||||
return false;
|
||||
|
||||
#if PG_VERSION_NUM >= 160000
|
||||
InitBufferTag(&tag, &rnode, forknum, blocknum);
|
||||
#else
|
||||
tag = (BufferTag) {
|
||||
.rnode = rnode,
|
||||
.rnode = reln->smgr_rnode.node,
|
||||
.forkNum = forknum,
|
||||
.blockNum = blocknum
|
||||
};
|
||||
#endif
|
||||
|
||||
ring_index = prefetch_register_buffer(tag, NULL, NULL);
|
||||
|
||||
Assert(ring_index < MyPState->ring_unused &&
|
||||
@@ -1893,15 +1861,11 @@ neon_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
|
||||
PrfHashEntry *entry;
|
||||
PrefetchRequest *slot;
|
||||
|
||||
#if PG_VERSION_NUM >= 160000
|
||||
InitBufferTag(&buftag, &rnode, forkNum, blkno);
|
||||
#else
|
||||
buftag = (BufferTag) {
|
||||
.rnode = rnode,
|
||||
.forkNum = forkNum,
|
||||
.blockNum = blkno
|
||||
.blockNum = blkno,
|
||||
};
|
||||
#endif
|
||||
|
||||
/*
|
||||
* The redo process does not lock pages that it needs to replay but are
|
||||
@@ -2001,9 +1965,9 @@ neon_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
|
||||
(errcode(ERRCODE_IO_ERROR),
|
||||
errmsg("could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X",
|
||||
blkno,
|
||||
RnodeGetSpcOid(rnode),
|
||||
RnodeGetDbOid(rnode),
|
||||
RnodeGetRelNumber(rnode),
|
||||
rnode.spcNode,
|
||||
rnode.dbNode,
|
||||
rnode.relNode,
|
||||
forkNum,
|
||||
(uint32) (request_lsn >> 32), (uint32) request_lsn),
|
||||
errdetail("page server returned error: %s",
|
||||
@@ -2027,7 +1991,6 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
|
||||
{
|
||||
bool latest;
|
||||
XLogRecPtr request_lsn;
|
||||
RelFileNode rnode = RelnGetRnode(reln);
|
||||
|
||||
switch (reln->smgr_relpersistence)
|
||||
{
|
||||
@@ -2047,13 +2010,13 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
|
||||
}
|
||||
|
||||
/* Try to read from local file cache */
|
||||
if (lfc_read(RelnGetRnode(reln), forkNum, blkno, buffer))
|
||||
if (lfc_read(reln->smgr_rnode.node, forkNum, blkno, buffer))
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
request_lsn = neon_get_request_lsn(&latest, rnode, forkNum, blkno);
|
||||
neon_read_at_lsn(rnode, forkNum, blkno, request_lsn, latest, buffer);
|
||||
request_lsn = neon_get_request_lsn(&latest, reln->smgr_rnode.node, forkNum, blkno);
|
||||
neon_read_at_lsn(reln->smgr_rnode.node, forkNum, blkno, request_lsn, latest, buffer);
|
||||
|
||||
#ifdef DEBUG_COMPARE_LOCAL
|
||||
if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln))
|
||||
@@ -2073,9 +2036,9 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
|
||||
{
|
||||
elog(PANIC, "page is new in MD but not in Page Server at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n",
|
||||
blkno,
|
||||
RelnGetSpcOid(reln),
|
||||
RelnGetDbOid(reln),
|
||||
RelnGetRelNumber(reln),
|
||||
reln->smgr_rnode.node.spcNode,
|
||||
reln->smgr_rnode.node.dbNode,
|
||||
reln->smgr_rnode.node.relNode,
|
||||
forkNum,
|
||||
(uint32) (request_lsn >> 32), (uint32) request_lsn,
|
||||
hexdump_page(buffer));
|
||||
@@ -2085,9 +2048,9 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
|
||||
{
|
||||
elog(PANIC, "page is new in Page Server but not in MD at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n",
|
||||
blkno,
|
||||
RelnGetSpcOid(reln),
|
||||
RelnGetDbOid(reln),
|
||||
RelnGetRelNumber(reln),
|
||||
reln->smgr_rnode.node.spcNode,
|
||||
reln->smgr_rnode.node.dbNode,
|
||||
reln->smgr_rnode.node.relNode,
|
||||
forkNum,
|
||||
(uint32) (request_lsn >> 32), (uint32) request_lsn,
|
||||
hexdump_page(mdbuf));
|
||||
@@ -2102,9 +2065,9 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
|
||||
{
|
||||
elog(PANIC, "heap buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n",
|
||||
blkno,
|
||||
RelnGetSpcOid(reln),
|
||||
RelnGetDbOid(reln),
|
||||
RelnGetRelNumber(reln),
|
||||
reln->smgr_rnode.node.spcNode,
|
||||
reln->smgr_rnode.node.dbNode,
|
||||
reln->smgr_rnode.node.relNode,
|
||||
forkNum,
|
||||
(uint32) (request_lsn >> 32), (uint32) request_lsn,
|
||||
hexdump_page(mdbuf_masked),
|
||||
@@ -2123,9 +2086,9 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
|
||||
{
|
||||
elog(PANIC, "btree buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n",
|
||||
blkno,
|
||||
RelnGetSpcOid(reln),
|
||||
RelnGetDbOid(reln),
|
||||
RelnGetRelNumber(reln),
|
||||
reln->smgr_rnode.node.spcNode,
|
||||
reln->smgr_rnode.node.dbNode,
|
||||
reln->smgr_rnode.node.relNode,
|
||||
forkNum,
|
||||
(uint32) (request_lsn >> 32), (uint32) request_lsn,
|
||||
hexdump_page(mdbuf_masked),
|
||||
@@ -2170,7 +2133,7 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
|
||||
char *buffer, bool skipFsync)
|
||||
{
|
||||
XLogRecPtr lsn;
|
||||
RelFileNode rnode = RelnGetRnode(reln);
|
||||
|
||||
switch (reln->smgr_relpersistence)
|
||||
{
|
||||
case 0:
|
||||
@@ -2207,12 +2170,13 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
|
||||
|
||||
lsn = PageGetLSN(buffer);
|
||||
elog(SmgrTrace, "smgrwrite called for %u/%u/%u.%u blk %u, page LSN: %X/%08X",
|
||||
RelnGetSpcOid(reln),
|
||||
RelnGetDbOid(reln), RelnGetRelNumber(reln),
|
||||
reln->smgr_rnode.node.spcNode,
|
||||
reln->smgr_rnode.node.dbNode,
|
||||
reln->smgr_rnode.node.relNode,
|
||||
forknum, blocknum,
|
||||
(uint32) (lsn >> 32), (uint32) lsn);
|
||||
|
||||
lfc_write(rnode, forknum, blocknum, buffer);
|
||||
lfc_write(reln->smgr_rnode.node, forknum, blocknum, buffer);
|
||||
|
||||
#ifdef DEBUG_COMPARE_LOCAL
|
||||
if (IS_LOCAL_REL(reln))
|
||||
@@ -2230,7 +2194,6 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
|
||||
BlockNumber n_blocks;
|
||||
bool latest;
|
||||
XLogRecPtr request_lsn;
|
||||
RelFileNode rnode = RelnGetRnode(reln);
|
||||
|
||||
switch (reln->smgr_relpersistence)
|
||||
{
|
||||
@@ -2249,23 +2212,23 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
|
||||
elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
|
||||
}
|
||||
|
||||
if (get_cached_relsize(RelnGetRnode(reln), forknum, &n_blocks))
|
||||
if (get_cached_relsize(reln->smgr_rnode.node, forknum, &n_blocks))
|
||||
{
|
||||
elog(SmgrTrace, "cached nblocks for %u/%u/%u.%u: %u blocks",
|
||||
RelnGetSpcOid(reln),
|
||||
RelnGetDbOid(reln),
|
||||
RelnGetRelNumber(reln),
|
||||
reln->smgr_rnode.node.spcNode,
|
||||
reln->smgr_rnode.node.dbNode,
|
||||
reln->smgr_rnode.node.relNode,
|
||||
forknum, n_blocks);
|
||||
return n_blocks;
|
||||
}
|
||||
|
||||
request_lsn = neon_get_request_lsn(&latest, rnode, forknum, REL_METADATA_PSEUDO_BLOCKNO);
|
||||
request_lsn = neon_get_request_lsn(&latest, reln->smgr_rnode.node, forknum, REL_METADATA_PSEUDO_BLOCKNO);
|
||||
{
|
||||
NeonNblocksRequest request = {
|
||||
.req.tag = T_NeonNblocksRequest,
|
||||
.req.latest = latest,
|
||||
.req.lsn = request_lsn,
|
||||
.rnode = rnode,
|
||||
.rnode = reln->smgr_rnode.node,
|
||||
.forknum = forknum,
|
||||
};
|
||||
|
||||
@@ -2282,9 +2245,9 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_IO_ERROR),
|
||||
errmsg("could not read relation size of rel %u/%u/%u.%u from page server at lsn %X/%08X",
|
||||
RelnGetSpcOid(reln),
|
||||
RelnGetDbOid(reln),
|
||||
RelnGetRelNumber(reln),
|
||||
reln->smgr_rnode.node.spcNode,
|
||||
reln->smgr_rnode.node.dbNode,
|
||||
reln->smgr_rnode.node.relNode,
|
||||
forknum,
|
||||
(uint32) (request_lsn >> 32), (uint32) request_lsn),
|
||||
errdetail("page server returned error: %s",
|
||||
@@ -2294,11 +2257,12 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
|
||||
default:
|
||||
elog(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
|
||||
}
|
||||
update_cached_relsize(rnode, forknum, n_blocks);
|
||||
update_cached_relsize(reln->smgr_rnode.node, forknum, n_blocks);
|
||||
|
||||
elog(SmgrTrace, "neon_nblocks: rel %u/%u/%u fork %u (request LSN %X/%08X): %u blocks",
|
||||
RelnGetSpcOid(reln),
|
||||
RelnGetDbOid(reln), RelnGetRelNumber(reln),
|
||||
reln->smgr_rnode.node.spcNode,
|
||||
reln->smgr_rnode.node.dbNode,
|
||||
reln->smgr_rnode.node.relNode,
|
||||
forknum,
|
||||
(uint32) (request_lsn >> 32), (uint32) request_lsn,
|
||||
n_blocks);
|
||||
@@ -2311,7 +2275,7 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
|
||||
* neon_db_size() -- Get the size of the database in bytes.
|
||||
*/
|
||||
int64
|
||||
neon_dbsize(Oid dbOid)
|
||||
neon_dbsize(Oid dbNode)
|
||||
{
|
||||
NeonResponse *resp;
|
||||
int64 db_size;
|
||||
@@ -2325,7 +2289,7 @@ neon_dbsize(Oid dbOid)
|
||||
.req.tag = T_NeonDbSizeRequest,
|
||||
.req.latest = latest,
|
||||
.req.lsn = request_lsn,
|
||||
.dbOid = dbOid,
|
||||
.dbNode = dbNode,
|
||||
};
|
||||
|
||||
resp = page_server_request(&request);
|
||||
@@ -2341,7 +2305,7 @@ neon_dbsize(Oid dbOid)
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_IO_ERROR),
|
||||
errmsg("could not read db size of db %u from page server at lsn %X/%08X",
|
||||
dbOid,
|
||||
dbNode,
|
||||
(uint32) (request_lsn >> 32), (uint32) request_lsn),
|
||||
errdetail("page server returned error: %s",
|
||||
((NeonErrorResponse *) resp)->message)));
|
||||
@@ -2352,7 +2316,7 @@ neon_dbsize(Oid dbOid)
|
||||
}
|
||||
|
||||
elog(SmgrTrace, "neon_dbsize: db %u (request LSN %X/%08X): %ld bytes",
|
||||
dbOid,
|
||||
dbNode,
|
||||
(uint32) (request_lsn >> 32), (uint32) request_lsn,
|
||||
db_size);
|
||||
|
||||
@@ -2367,7 +2331,6 @@ void
|
||||
neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
|
||||
{
|
||||
XLogRecPtr lsn;
|
||||
RelFileNode rnode = RelnGetRnode(reln);
|
||||
|
||||
switch (reln->smgr_relpersistence)
|
||||
{
|
||||
@@ -2387,7 +2350,7 @@ neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
|
||||
elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
|
||||
}
|
||||
|
||||
set_cached_relsize(rnode, forknum, nblocks);
|
||||
set_cached_relsize(reln->smgr_rnode.node, forknum, nblocks);
|
||||
|
||||
/*
|
||||
* Truncating a relation drops all its buffers from the buffer cache
|
||||
@@ -2415,7 +2378,7 @@ neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
|
||||
* for the extended pages, so there's no harm in leaving behind obsolete
|
||||
* entries for the truncated chunks.
|
||||
*/
|
||||
SetLastWrittenLSNForRelation(lsn, rnode, forknum);
|
||||
SetLastWrittenLSNForRelation(lsn, reln->smgr_rnode.node, forknum);
|
||||
|
||||
#ifdef DEBUG_COMPARE_LOCAL
|
||||
if (IS_LOCAL_REL(reln))
|
||||
@@ -2485,9 +2448,9 @@ neon_start_unlogged_build(SMgrRelation reln)
|
||||
|
||||
ereport(SmgrTrace,
|
||||
(errmsg("starting unlogged build of relation %u/%u/%u",
|
||||
RelnGetSpcOid(reln),
|
||||
RelnGetDbOid(reln),
|
||||
RelnGetRelNumber(reln))));
|
||||
reln->smgr_rnode.node.spcNode,
|
||||
reln->smgr_rnode.node.dbNode,
|
||||
reln->smgr_rnode.node.relNode)));
|
||||
|
||||
switch (reln->smgr_relpersistence)
|
||||
{
|
||||
@@ -2537,9 +2500,9 @@ neon_finish_unlogged_build_phase_1(SMgrRelation reln)
|
||||
|
||||
ereport(SmgrTrace,
|
||||
(errmsg("finishing phase 1 of unlogged build of relation %u/%u/%u",
|
||||
RelnGetSpcOid(reln),
|
||||
RelnGetDbOid(reln),
|
||||
RelnGetRelNumber(reln))));
|
||||
reln->smgr_rnode.node.spcNode,
|
||||
reln->smgr_rnode.node.dbNode,
|
||||
reln->smgr_rnode.node.relNode)));
|
||||
|
||||
if (unlogged_build_phase == UNLOGGED_BUILD_NOT_PERMANENT)
|
||||
return;
|
||||
@@ -2566,9 +2529,9 @@ neon_end_unlogged_build(SMgrRelation reln)
|
||||
|
||||
ereport(SmgrTrace,
|
||||
(errmsg("ending unlogged build of relation %u/%u/%u",
|
||||
RelnGetSpcOid(reln),
|
||||
RelnGetDbOid(reln),
|
||||
RelnGetRelNumber(reln))));
|
||||
reln->smgr_rnode.node.spcNode,
|
||||
reln->smgr_rnode.node.dbNode,
|
||||
reln->smgr_rnode.node.relNode)));
|
||||
|
||||
if (unlogged_build_phase != UNLOGGED_BUILD_NOT_PERMANENT)
|
||||
{
|
||||
@@ -2581,24 +2544,16 @@ neon_end_unlogged_build(SMgrRelation reln)
|
||||
reln->smgr_relpersistence = RELPERSISTENCE_PERMANENT;
|
||||
|
||||
/* Remove local copy */
|
||||
#if PG_VERSION_NUM >= 160000
|
||||
rnode.locator = RelnGetRnode(reln);
|
||||
#else
|
||||
rnode.node = RelnGetRnode(reln);
|
||||
#endif
|
||||
rnode = reln->smgr_rnode;
|
||||
for (int forknum = 0; forknum <= MAX_FORKNUM; forknum++)
|
||||
{
|
||||
elog(SmgrTrace, "forgetting cached relsize for %u/%u/%u.%u",
|
||||
RelnGetSpcOid(reln),
|
||||
RelnGetDbOid(reln),
|
||||
RelnGetRelNumber(reln),
|
||||
rnode.node.spcNode,
|
||||
rnode.node.dbNode,
|
||||
rnode.node.relNode,
|
||||
forknum);
|
||||
|
||||
#if PG_VERSION_NUM >= 160000
|
||||
forget_cached_relsize(rnode.locator, forknum);
|
||||
#else
|
||||
forget_cached_relsize(rnode.node, forknum);
|
||||
#endif
|
||||
mdclose(reln, forknum);
|
||||
/* use isRedo == true, so that we drop it immediately */
|
||||
mdunlink(rnode, forknum, true);
|
||||
@@ -2751,16 +2706,10 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
|
||||
* regardless of whether the block is stored in shared buffers.
|
||||
* See also this function's top comment.
|
||||
*/
|
||||
|
||||
if (!OidIsValid(RnodeGetDbOid(rnode)))
|
||||
if (!OidIsValid(rnode.dbNode))
|
||||
return false;
|
||||
|
||||
#if PG_VERSION_NUM >= 160000
|
||||
InitBufferTag(&tag, &rnode, forknum, blkno);
|
||||
#else
|
||||
INIT_BUFFERTAG(tag, rnode, forknum, blkno);
|
||||
#endif
|
||||
|
||||
hash = BufTableHashCode(&tag);
|
||||
partitionLock = BufMappingPartitionLock(hash);
|
||||
|
||||
|
||||
@@ -15,11 +15,7 @@
|
||||
#include "postgres.h"
|
||||
|
||||
#include "pagestore_client.h"
|
||||
#if PG_VERSION_NUM >= 160000
|
||||
#include "storage/relfilelocator.h"
|
||||
#else
|
||||
#include "storage/relfilenode.h"
|
||||
#endif
|
||||
#include "storage/smgr.h"
|
||||
#include "storage/lwlock.h"
|
||||
#include "storage/ipc.h"
|
||||
@@ -32,7 +28,6 @@
|
||||
#include "miscadmin.h"
|
||||
#endif
|
||||
|
||||
|
||||
typedef struct
|
||||
{
|
||||
RelFileNode rnode;
|
||||
|
||||
@@ -788,7 +788,7 @@ ReconnectSafekeepers(void)

/*
* Performs the logic for advancing the state machine of the specified safekeeper,
- * given that a certain set of events has occured.
+ * given that a certain set of events has occurred.
*/
static void
AdvancePollState(Safekeeper *sk, uint32 events)
@@ -1394,12 +1394,7 @@ WalProposerRecovery(int donor, TimeLineID timeline, XLogRecPtr startpos, XLogRec
WalReceiverConn *wrconn;
WalRcvStreamOptions options;

- #if PG_VERSION_NUM >= 160000
- bool must_use_password = false;
- wrconn = walrcv_connect(safekeeper[donor].conninfo, false, must_use_password, "wal_proposer_recovery", &err);
- #else
wrconn = walrcv_connect(safekeeper[donor].conninfo, false, "wal_proposer_recovery", &err);
- #endif
if (!wrconn)
{
ereport(WARNING,

@@ -23,7 +23,7 @@
* message header */

/*
- * In the spirit of WL_SOCKET_READABLE and others, this corresponds to no events having occured,
+ * In the spirit of WL_SOCKET_READABLE and others, this corresponds to no events having occurred,
* because all WL_* events are given flags equal to some (1 << i), starting from i = 0
*/
#define WL_NO_EVENTS 0
@@ -317,7 +317,7 @@ typedef struct AppendResponse
/* this is a criterion for walproposer --sync mode exit */
XLogRecPtr commitLsn;
HotStandbyFeedback hs;
- /* Feedback recieved from pageserver includes standby_status_update fields */
+ /* Feedback received from pageserver includes standby_status_update fields */
/* and custom neon feedback. */
/* This part of the message is extensible. */
PageserverFeedback rf;

@@ -26,10 +26,6 @@
|
||||
#include "access/xlogrecovery.h"
|
||||
#endif
|
||||
|
||||
#if PG_VERSION_NUM >= 160000
|
||||
#include "utils/guc.h"
|
||||
#endif
|
||||
|
||||
/*
|
||||
* These variables are used similarly to openLogFile/SegNo,
|
||||
* but for walproposer to write the XLOG during recovery. walpropFileTLI is the TimeLineID
|
||||
|
||||
@@ -128,11 +128,7 @@ clear_buffer_cache(PG_FUNCTION_ARGS)
|
||||
else
|
||||
isvalid = false;
|
||||
bufferid = BufferDescriptorGetBuffer(bufHdr);
|
||||
#if PG_VERSION_NUM >= 160000
|
||||
rnode = BufTagGetRelFileLocator(&bufHdr->tag);
|
||||
#else
|
||||
rnode = bufHdr->tag.rnode;
|
||||
#endif
|
||||
forknum = bufHdr->tag.forkNum;
|
||||
blocknum = bufHdr->tag.blockNum;
|
||||
|
||||
@@ -242,7 +238,7 @@ get_raw_page_at_lsn(PG_FUNCTION_ARGS)
|
||||
SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ);
|
||||
raw_page_data = VARDATA(raw_page);
|
||||
|
||||
neon_read_at_lsn(RelnGetRnode(RelationGetSmgr(rel)), forknum, blkno, read_lsn, request_latest, raw_page_data);
|
||||
neon_read_at_lsn(rel->rd_node, forknum, blkno, read_lsn, request_latest, raw_page_data);
|
||||
|
||||
relation_close(rel, AccessShareLock);
|
||||
|
||||
@@ -271,17 +267,11 @@ get_raw_page_at_lsn_ex(PG_FUNCTION_ARGS)
|
||||
PG_RETURN_NULL();
|
||||
|
||||
{
|
||||
#if PG_VERSION_NUM >= 160000
|
||||
RelFileLocator rnode = {
|
||||
.spcOid = PG_GETARG_OID(0),
|
||||
.dbOid = PG_GETARG_OID(1),
|
||||
.relNumber = PG_GETARG_OID(2)};
|
||||
#else
|
||||
RelFileNode rnode = {
|
||||
.spcNode = PG_GETARG_OID(0),
|
||||
.dbNode = PG_GETARG_OID(1),
|
||||
.relNode = PG_GETARG_OID(2)};
|
||||
#endif
|
||||
|
||||
ForkNumber forknum = PG_GETARG_UINT32(3);
|
||||
|
||||
uint32 blkno = PG_GETARG_UINT32(4);
|
||||
|
||||
@@ -21,6 +21,7 @@
|
||||
#include "access/xlog.h"
|
||||
#include "storage/block.h"
|
||||
#include "storage/buf_internals.h"
|
||||
#include "storage/relfilenode.h"
|
||||
#include "storage/smgr.h"
|
||||
|
||||
#if PG_VERSION_NUM >= 150000
|
||||
@@ -29,7 +30,6 @@
|
||||
|
||||
#include "inmem_smgr.h"
|
||||
|
||||
|
||||
/* Size of the in-memory smgr */
|
||||
#define MAX_PAGES 64
|
||||
|
||||
@@ -46,22 +46,12 @@ locate_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno)
|
||||
/* We only hold a small number of pages, so linear search */
|
||||
for (int i = 0; i < used_pages; i++)
|
||||
{
|
||||
|
||||
#if PG_VERSION_NUM >= 160000
|
||||
if (BufTagMatchesRelFileLocator(&page_tag[i], &reln->smgr_rlocator.locator)
|
||||
if (RelFileNodeEquals(reln->smgr_rnode.node, page_tag[i].rnode)
|
||||
&& forknum == page_tag[i].forkNum
|
||||
&& blkno == page_tag[i].blockNum)
|
||||
{
|
||||
return i;
|
||||
}
|
||||
#else
|
||||
if (RelFileNodeEquals(RelnGetRnode(reln), page_tag[i].rnode)
|
||||
&& forknum == page_tag[i].forkNum
|
||||
&& blkno == page_tag[i].blockNum)
|
||||
{
|
||||
return i;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
@@ -107,12 +97,8 @@ inmem_exists(SMgrRelation reln, ForkNumber forknum)
|
||||
{
|
||||
for (int i = 0; i < used_pages; i++)
|
||||
{
|
||||
#if PG_VERSION_NUM >= 160000
|
||||
if (BufTagMatchesRelFileLocator(&page_tag[i], &reln->smgr_rlocator.locator)
|
||||
#else
|
||||
if (RelFileNodeEquals(RelnGetRnode(reln), page_tag[i].rnode)
|
||||
#endif
|
||||
&& forknum == page_tag[i].forkNum)
|
||||
if (RelFileNodeEquals(reln->smgr_rnode.node, page_tag[i].rnode)
|
||||
&& forknum == page_tag[i].forkNum)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
@@ -230,9 +216,9 @@ inmem_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
|
||||
*/
|
||||
elog(used_pages >= WARN_PAGES ? WARNING : DEBUG1,
|
||||
"inmem_write() called for %u/%u/%u.%u blk %u: used_pages %u",
|
||||
RelnGetSpcOid(reln),
|
||||
RelnGetDbOid(reln),
|
||||
RelnGetRelNumber(reln),
|
||||
reln->smgr_rnode.node.spcNode,
|
||||
reln->smgr_rnode.node.dbNode,
|
||||
reln->smgr_rnode.node.relNode,
|
||||
forknum,
|
||||
blocknum,
|
||||
used_pages);
|
||||
@@ -241,19 +227,14 @@ inmem_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
|
||||
|
||||
pg = used_pages;
|
||||
used_pages++;
|
||||
|
||||
#if PG_VERSION_NUM >= 160000
|
||||
InitBufferTag(&page_tag[pg], &RelnGetRnode(reln), forknum, blocknum);
|
||||
#else
|
||||
INIT_BUFFERTAG(page_tag[pg], RelnGetRnode(reln), forknum, blocknum);
|
||||
#endif
|
||||
INIT_BUFFERTAG(page_tag[pg], reln->smgr_rnode.node, forknum, blocknum);
|
||||
}
|
||||
else
|
||||
{
|
||||
elog(DEBUG1, "inmem_write() called for %u/%u/%u.%u blk %u: found at %u",
|
||||
RelnGetSpcOid(reln),
|
||||
RelnGetDbOid(reln),
|
||||
RelnGetRelNumber(reln),
|
||||
reln->smgr_rnode.node.spcNode,
|
||||
reln->smgr_rnode.node.dbNode,
|
||||
reln->smgr_rnode.node.relNode,
|
||||
forknum,
|
||||
blocknum,
|
||||
used_pages);
|
||||
|
||||
@@ -11,40 +11,6 @@
|
||||
#ifndef INMEM_SMGR_H
|
||||
#define INMEM_SMGR_H
|
||||
|
||||
#if PG_VERSION_NUM >= 160000
|
||||
#include "storage/relfilelocator.h"
|
||||
#else
|
||||
#include "storage/relfilenode.h"
|
||||
#endif
|
||||
|
||||
// This is a hack to avoid too many ifdefs in the function definitions.
|
||||
#if PG_VERSION_NUM >= 160000
|
||||
typedef RelFileLocator RelFileNode;
|
||||
typedef RelFileLocatorBackend RelFileNodeBackend;
|
||||
#define RelFileNodeBackendIsTemp RelFileLocatorBackendIsTemp
|
||||
#endif
|
||||
|
||||
#if PG_VERSION_NUM >= 160000
|
||||
#define RelnGetRnode(reln) (reln->smgr_rlocator.locator)
|
||||
#define RnodeGetSpcOid(rnode) (rnode.spcOid)
|
||||
#define RnodeGetDbOid(rnode) (rnode.dbOid)
|
||||
#define RnodeGetRelNumber(rnode) (rnode.relNumber)
|
||||
|
||||
#define BufTagGetRnode(tag) (BufTagGetRelFileLocator(&tag))
|
||||
#else
|
||||
#define RelnGetRnode(reln) (reln->smgr_rnode.node)
|
||||
#define RnodeGetSpcOid(rnode) (rnode.spcNode)
|
||||
#define RnodeGetDbOid(rnode) (rnode.dbNode)
|
||||
#define RnodeGetRelNumber(rnode) (rnode.relNode)
|
||||
|
||||
#define BufTagGetRnode(tag) (tag.rnode)
|
||||
|
||||
#endif
|
||||
|
||||
#define RelnGetSpcOid(reln) (RnodeGetRelNumber(RelnGetRnode(reln)))
|
||||
#define RelnGetDbOid(reln) (RnodeGetDbOid(RelnGetRnode(reln)))
|
||||
#define RelnGetRelNumber(reln) (RnodeGetRelNumber(RelnGetRnode(reln)))
|
||||
|
||||
extern const f_smgr *smgr_inmem(BackendId backend, RelFileNode rnode);
|
||||
extern void smgr_init_inmem(void);
|
||||
|
||||
|
||||
@@ -62,10 +62,8 @@
|
||||
#endif
|
||||
|
||||
#ifndef HAVE_GETRUSAGE
|
||||
#if PG_VERSION_NUM < 160000
|
||||
#include "rusagestub.h"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#include "access/clog.h"
|
||||
#include "access/commit_ts.h"
|
||||
@@ -119,7 +117,6 @@
|
||||
#include "neon_seccomp.h"
|
||||
#endif
|
||||
|
||||
|
||||
PG_MODULE_MAGIC;
|
||||
|
||||
static int ReadRedoCommand(StringInfo inBuf);
|
||||
@@ -665,31 +662,18 @@ BeginRedoForBlock(StringInfo input_message)
|
||||
* BlockNumber
|
||||
*/
|
||||
forknum = pq_getmsgbyte(input_message);
|
||||
#if PG_VERSION_NUM >= 160000
|
||||
rnode.spcOid = pq_getmsgint(input_message, 4);
|
||||
rnode.dbOid = pq_getmsgint(input_message, 4);
|
||||
rnode.relNumber = pq_getmsgint(input_message, 4);
|
||||
#else
|
||||
rnode.spcNode = pq_getmsgint(input_message, 4);
|
||||
rnode.dbNode = pq_getmsgint(input_message, 4);
|
||||
rnode.relNode = pq_getmsgint(input_message, 4);
|
||||
#endif
|
||||
blknum = pq_getmsgint(input_message, 4);
|
||||
wal_redo_buffer = InvalidBuffer;
|
||||
|
||||
#if PG_VERSION_NUM >= 160000
|
||||
InitBufferTag(&target_redo_tag, &rnode, forknum, blknum);
|
||||
#else
|
||||
INIT_BUFFERTAG(target_redo_tag, rnode, forknum, blknum);
|
||||
#endif
|
||||
|
||||
|
||||
elog(TRACE, "BeginRedoForBlock %u/%u/%u.%d blk %u",
|
||||
#if PG_VERSION_NUM >= 160000
|
||||
target_redo_tag.spcOid, target_redo_tag.dbOid, target_redo_tag.relNumber,
|
||||
#else
|
||||
target_redo_tag.rnode.spcNode, target_redo_tag.rnode.dbNode, target_redo_tag.rnode.relNode,
|
||||
#endif
|
||||
target_redo_tag.rnode.spcNode,
|
||||
target_redo_tag.rnode.dbNode,
|
||||
target_redo_tag.rnode.relNode,
|
||||
target_redo_tag.forkNum,
|
||||
target_redo_tag.blockNum);
|
||||
|
||||
@@ -725,15 +709,9 @@ PushPage(StringInfo input_message)
|
||||
* 8k page content
|
||||
*/
|
||||
forknum = pq_getmsgbyte(input_message);
|
||||
#if PG_VERSION_NUM >= 160000
|
||||
rnode.spcOid = pq_getmsgint(input_message, 4);
|
||||
rnode.dbOid = pq_getmsgint(input_message, 4);
|
||||
rnode.relNumber = pq_getmsgint(input_message, 4);
|
||||
#else
|
||||
rnode.spcNode = pq_getmsgint(input_message, 4);
|
||||
rnode.dbNode = pq_getmsgint(input_message, 4);
|
||||
rnode.relNode = pq_getmsgint(input_message, 4);
|
||||
#endif
|
||||
blknum = pq_getmsgint(input_message, 4);
|
||||
content = pq_getmsgbytes(input_message, BLCKSZ);
|
||||
|
||||
@@ -853,12 +831,7 @@ ApplyRecord(StringInfo input_message)
|
||||
*/
|
||||
if (BufferIsInvalid(wal_redo_buffer))
|
||||
{
|
||||
wal_redo_buffer = NeonRedoReadBuffer(
|
||||
#if PG_VERSION_NUM >= 160000
|
||||
BufTagGetRelFileLocator(&target_redo_tag),
|
||||
#else
|
||||
target_redo_tag.rnode,
|
||||
#endif
|
||||
wal_redo_buffer = NeonRedoReadBuffer(target_redo_tag.rnode,
|
||||
target_redo_tag.forkNum,
|
||||
target_redo_tag.blockNum,
|
||||
RBM_NORMAL);
|
||||
@@ -900,43 +873,12 @@ apply_error_callback(void *arg)
|
||||
}
|
||||
|
||||
|
||||
#if PG_VERSION_NUM >= 160000
|
||||
|
||||
static bool
|
||||
redo_block_filter(XLogReaderState *record, uint8 block_id)
|
||||
{
|
||||
BufferTag target_tag;
|
||||
|
||||
RelFileLocator rlocator;
|
||||
XLogRecGetBlockTag(record, block_id,
|
||||
&rlocator, &target_tag.forkNum, &target_tag.blockNum);
|
||||
|
||||
target_tag.spcOid = rlocator.spcOid;
|
||||
target_tag.dbOid = rlocator.dbOid;
|
||||
target_tag.relNumber = rlocator.relNumber;
|
||||
|
||||
/*
|
||||
* Can a WAL redo function ever access a relation other than the one that
|
||||
* it modifies? I don't see why it would.
|
||||
*/
|
||||
if (RelFileLocatorEquals(BufTagGetRelFileLocator(&target_tag), BufTagGetRelFileLocator(&target_redo_tag)))
|
||||
elog(WARNING, "REDO accessing unexpected page: %u/%u/%u.%u blk %u",
|
||||
target_tag.spcOid, target_tag.dbOid, target_tag.relNumber,
|
||||
target_tag.forkNum, target_tag.blockNum);
|
||||
|
||||
/*
|
||||
* If this block isn't one we are currently restoring, then return 'true'
|
||||
* so that this gets ignored
|
||||
*/
|
||||
return !BufferTagsEqual(&target_tag, &target_redo_tag);
|
||||
}
|
||||
#else
|
||||
static bool
|
||||
redo_block_filter(XLogReaderState *record, uint8 block_id)
|
||||
{
|
||||
BufferTag target_tag;
|
||||
|
||||
|
||||
#if PG_VERSION_NUM >= 150000
|
||||
XLogRecGetBlockTag(record, block_id,
|
||||
&target_tag.rnode, &target_tag.forkNum, &target_tag.blockNum);
|
||||
@@ -955,18 +897,14 @@ redo_block_filter(XLogReaderState *record, uint8 block_id)
|
||||
*/
|
||||
if (!RelFileNodeEquals(target_tag.rnode, target_redo_tag.rnode))
|
||||
elog(WARNING, "REDO accessing unexpected page: %u/%u/%u.%u blk %u",
|
||||
target_tag.rnode.spcNode, target_tag.rnode.dbNode, target_tag.rnode.relNode,
|
||||
target_tag.forkNum, target_tag.blockNum);
|
||||
target_tag.rnode.spcNode, target_tag.rnode.dbNode, target_tag.rnode.relNode, target_tag.forkNum, target_tag.blockNum);
|
||||
|
||||
/*
|
||||
* If this block isn't one we are currently restoring, then return 'true'
|
||||
* so that this gets ignored
|
||||
*/
|
||||
|
||||
return !BUFFERTAGS_EQUAL(target_tag, target_redo_tag);
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
/*
|
||||
* Get a page image back from buffer cache.
|
||||
@@ -993,15 +931,9 @@ GetPage(StringInfo input_message)
|
||||
* BlockNumber
|
||||
*/
|
||||
forknum = pq_getmsgbyte(input_message);
|
||||
#if PG_VERSION_NUM >= 160000
|
||||
rnode.spcOid = pq_getmsgint(input_message, 4);
|
||||
rnode.dbOid = pq_getmsgint(input_message, 4);
|
||||
rnode.relNumber = pq_getmsgint(input_message, 4);
|
||||
#else
|
||||
rnode.spcNode = pq_getmsgint(input_message, 4);
|
||||
rnode.dbNode = pq_getmsgint(input_message, 4);
|
||||
rnode.relNode = pq_getmsgint(input_message, 4);
|
||||
#endif
|
||||
blknum = pq_getmsgint(input_message, 4);
|
||||
|
||||
/* FIXME: check that we got a BeginRedoForBlock message or this earlier */
|
||||
@@ -1029,11 +961,7 @@ GetPage(StringInfo input_message)
|
||||
} while (tot_written < BLCKSZ);
|
||||
|
||||
ReleaseBuffer(buf);
|
||||
#if PG_VERSION_NUM >= 160000
|
||||
DropRelationAllLocalBuffers(rnode);
|
||||
#else
|
||||
DropRelFileNodeAllLocalBuffers(rnode);
|
||||
#endif
|
||||
wal_redo_buffer = InvalidBuffer;
|
||||
|
||||
elog(TRACE, "Page sent back for block %u", blknum);
|
||||
|
||||
260 poetry.lock generated
@@ -2,60 +2,111 @@
|
||||
|
||||
[[package]]
|
||||
name = "aiohttp"
|
||||
version = "3.7.4"
|
||||
version = "3.8.5"
|
||||
description = "Async http client/server framework (asyncio)"
|
||||
optional = false
|
||||
python-versions = ">=3.6"
|
||||
files = [
|
||||
{file = "aiohttp-3.7.4-cp36-cp36m-macosx_10_14_x86_64.whl", hash = "sha256:6c8200abc9dc5f27203986100579fc19ccad7a832c07d2bc151ce4ff17190076"},
|
||||
{file = "aiohttp-3.7.4-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:dd7936f2a6daa861143e376b3a1fb56e9b802f4980923594edd9ca5670974895"},
|
||||
{file = "aiohttp-3.7.4-cp36-cp36m-manylinux2014_aarch64.whl", hash = "sha256:bc3d14bf71a3fb94e5acf5bbf67331ab335467129af6416a437bd6024e4f743d"},
|
||||
{file = "aiohttp-3.7.4-cp36-cp36m-manylinux2014_i686.whl", hash = "sha256:8ec1a38074f68d66ccb467ed9a673a726bb397142c273f90d4ba954666e87d54"},
|
||||
{file = "aiohttp-3.7.4-cp36-cp36m-manylinux2014_ppc64le.whl", hash = "sha256:b84ad94868e1e6a5e30d30ec419956042815dfaea1b1df1cef623e4564c374d9"},
|
||||
{file = "aiohttp-3.7.4-cp36-cp36m-manylinux2014_s390x.whl", hash = "sha256:d5d102e945ecca93bcd9801a7bb2fa703e37ad188a2f81b1e65e4abe4b51b00c"},
|
||||
{file = "aiohttp-3.7.4-cp36-cp36m-manylinux2014_x86_64.whl", hash = "sha256:c2a80fd9a8d7e41b4e38ea9fe149deed0d6aaede255c497e66b8213274d6d61b"},
|
||||
{file = "aiohttp-3.7.4-cp36-cp36m-win32.whl", hash = "sha256:481d4b96969fbfdcc3ff35eea5305d8565a8300410d3d269ccac69e7256b1329"},
|
||||
{file = "aiohttp-3.7.4-cp36-cp36m-win_amd64.whl", hash = "sha256:16d0683ef8a6d803207f02b899c928223eb219111bd52420ef3d7a8aa76227b6"},
|
||||
{file = "aiohttp-3.7.4-cp37-cp37m-macosx_10_14_x86_64.whl", hash = "sha256:eab51036cac2da8a50d7ff0ea30be47750547c9aa1aa2cf1a1b710a1827e7dbe"},
|
||||
{file = "aiohttp-3.7.4-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:feb24ff1226beeb056e247cf2e24bba5232519efb5645121c4aea5b6ad74c1f2"},
|
||||
{file = "aiohttp-3.7.4-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:119feb2bd551e58d83d1b38bfa4cb921af8ddedec9fad7183132db334c3133e0"},
|
||||
{file = "aiohttp-3.7.4-cp37-cp37m-manylinux2014_i686.whl", hash = "sha256:6ca56bdfaf825f4439e9e3673775e1032d8b6ea63b8953d3812c71bd6a8b81de"},
|
||||
{file = "aiohttp-3.7.4-cp37-cp37m-manylinux2014_ppc64le.whl", hash = "sha256:5563ad7fde451b1986d42b9bb9140e2599ecf4f8e42241f6da0d3d624b776f40"},
|
||||
{file = "aiohttp-3.7.4-cp37-cp37m-manylinux2014_s390x.whl", hash = "sha256:62bc216eafac3204877241569209d9ba6226185aa6d561c19159f2e1cbb6abfb"},
|
||||
{file = "aiohttp-3.7.4-cp37-cp37m-manylinux2014_x86_64.whl", hash = "sha256:f4496d8d04da2e98cc9133e238ccebf6a13ef39a93da2e87146c8c8ac9768242"},
|
||||
{file = "aiohttp-3.7.4-cp37-cp37m-win32.whl", hash = "sha256:2ffea7904e70350da429568113ae422c88d2234ae776519549513c8f217f58a9"},
|
||||
{file = "aiohttp-3.7.4-cp37-cp37m-win_amd64.whl", hash = "sha256:5e91e927003d1ed9283dee9abcb989334fc8e72cf89ebe94dc3e07e3ff0b11e9"},
|
||||
{file = "aiohttp-3.7.4-cp38-cp38-macosx_10_14_x86_64.whl", hash = "sha256:4c1bdbfdd231a20eee3e56bd0ac1cd88c4ff41b64ab679ed65b75c9c74b6c5c2"},
|
||||
{file = "aiohttp-3.7.4-cp38-cp38-manylinux1_i686.whl", hash = "sha256:71680321a8a7176a58dfbc230789790639db78dad61a6e120b39f314f43f1907"},
|
||||
{file = "aiohttp-3.7.4-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:7dbd087ff2f4046b9b37ba28ed73f15fd0bc9f4fdc8ef6781913da7f808d9536"},
|
||||
{file = "aiohttp-3.7.4-cp38-cp38-manylinux2014_i686.whl", hash = "sha256:dee68ec462ff10c1d836c0ea2642116aba6151c6880b688e56b4c0246770f297"},
|
||||
{file = "aiohttp-3.7.4-cp38-cp38-manylinux2014_ppc64le.whl", hash = "sha256:99c5a5bf7135607959441b7d720d96c8e5c46a1f96e9d6d4c9498be8d5f24212"},
|
||||
{file = "aiohttp-3.7.4-cp38-cp38-manylinux2014_s390x.whl", hash = "sha256:5dde6d24bacac480be03f4f864e9a67faac5032e28841b00533cd168ab39cad9"},
|
||||
{file = "aiohttp-3.7.4-cp38-cp38-manylinux2014_x86_64.whl", hash = "sha256:418597633b5cd9639e514b1d748f358832c08cd5d9ef0870026535bd5eaefdd0"},
|
||||
{file = "aiohttp-3.7.4-cp38-cp38-win32.whl", hash = "sha256:e76e78863a4eaec3aee5722d85d04dcbd9844bc6cd3bfa6aa880ff46ad16bfcb"},
|
||||
{file = "aiohttp-3.7.4-cp38-cp38-win_amd64.whl", hash = "sha256:950b7ef08b2afdab2488ee2edaff92a03ca500a48f1e1aaa5900e73d6cf992bc"},
|
||||
{file = "aiohttp-3.7.4-cp39-cp39-macosx_10_14_x86_64.whl", hash = "sha256:2eb3efe243e0f4ecbb654b08444ae6ffab37ac0ef8f69d3a2ffb958905379daf"},
|
||||
{file = "aiohttp-3.7.4-cp39-cp39-manylinux1_i686.whl", hash = "sha256:822bd4fd21abaa7b28d65fc9871ecabaddc42767884a626317ef5b75c20e8a2d"},
|
||||
{file = "aiohttp-3.7.4-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:58c62152c4c8731a3152e7e650b29ace18304d086cb5552d317a54ff2749d32a"},
|
||||
{file = "aiohttp-3.7.4-cp39-cp39-manylinux2014_i686.whl", hash = "sha256:7c7820099e8b3171e54e7eedc33e9450afe7cd08172632d32128bd527f8cb77d"},
|
||||
{file = "aiohttp-3.7.4-cp39-cp39-manylinux2014_ppc64le.whl", hash = "sha256:5b50e0b9460100fe05d7472264d1975f21ac007b35dcd6fd50279b72925a27f4"},
|
||||
{file = "aiohttp-3.7.4-cp39-cp39-manylinux2014_s390x.whl", hash = "sha256:c44d3c82a933c6cbc21039326767e778eface44fca55c65719921c4b9661a3f7"},
|
||||
{file = "aiohttp-3.7.4-cp39-cp39-manylinux2014_x86_64.whl", hash = "sha256:cc31e906be1cc121ee201adbdf844522ea3349600dd0a40366611ca18cd40e81"},
|
||||
{file = "aiohttp-3.7.4-cp39-cp39-win32.whl", hash = "sha256:fbd3b5e18d34683decc00d9a360179ac1e7a320a5fee10ab8053ffd6deab76e0"},
|
||||
{file = "aiohttp-3.7.4-cp39-cp39-win_amd64.whl", hash = "sha256:40bd1b101b71a18a528ffce812cc14ff77d4a2a1272dfb8b11b200967489ef3e"},
|
||||
{file = "aiohttp-3.7.4.tar.gz", hash = "sha256:5d84ecc73141d0a0d61ece0742bb7ff5751b0657dab8405f899d3ceb104cc7de"},
|
||||
{file = "aiohttp-3.8.5-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:a94159871304770da4dd371f4291b20cac04e8c94f11bdea1c3478e557fbe0d8"},
|
||||
{file = "aiohttp-3.8.5-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:13bf85afc99ce6f9ee3567b04501f18f9f8dbbb2ea11ed1a2e079670403a7c84"},
|
||||
{file = "aiohttp-3.8.5-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2ce2ac5708501afc4847221a521f7e4b245abf5178cf5ddae9d5b3856ddb2f3a"},
|
||||
{file = "aiohttp-3.8.5-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:96943e5dcc37a6529d18766597c491798b7eb7a61d48878611298afc1fca946c"},
|
||||
{file = "aiohttp-3.8.5-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2ad5c3c4590bb3cc28b4382f031f3783f25ec223557124c68754a2231d989e2b"},
|
||||
{file = "aiohttp-3.8.5-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0c413c633d0512df4dc7fd2373ec06cc6a815b7b6d6c2f208ada7e9e93a5061d"},
|
||||
{file = "aiohttp-3.8.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:df72ac063b97837a80d80dec8d54c241af059cc9bb42c4de68bd5b61ceb37caa"},
|
||||
{file = "aiohttp-3.8.5-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c48c5c0271149cfe467c0ff8eb941279fd6e3f65c9a388c984e0e6cf57538e14"},
|
||||
{file = "aiohttp-3.8.5-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:368a42363c4d70ab52c2c6420a57f190ed3dfaca6a1b19afda8165ee16416a82"},
|
||||
{file = "aiohttp-3.8.5-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:7607ec3ce4993464368505888af5beb446845a014bc676d349efec0e05085905"},
|
||||
{file = "aiohttp-3.8.5-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:0d21c684808288a98914e5aaf2a7c6a3179d4df11d249799c32d1808e79503b5"},
|
||||
{file = "aiohttp-3.8.5-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:312fcfbacc7880a8da0ae8b6abc6cc7d752e9caa0051a53d217a650b25e9a691"},
|
||||
{file = "aiohttp-3.8.5-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:ad093e823df03bb3fd37e7dec9d4670c34f9e24aeace76808fc20a507cace825"},
|
||||
{file = "aiohttp-3.8.5-cp310-cp310-win32.whl", hash = "sha256:33279701c04351a2914e1100b62b2a7fdb9a25995c4a104259f9a5ead7ed4802"},
|
||||
{file = "aiohttp-3.8.5-cp310-cp310-win_amd64.whl", hash = "sha256:6e4a280e4b975a2e7745573e3fc9c9ba0d1194a3738ce1cbaa80626cc9b4f4df"},
|
||||
{file = "aiohttp-3.8.5-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:ae871a964e1987a943d83d6709d20ec6103ca1eaf52f7e0d36ee1b5bebb8b9b9"},
|
||||
{file = "aiohttp-3.8.5-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:461908b2578955045efde733719d62f2b649c404189a09a632d245b445c9c975"},
|
||||
{file = "aiohttp-3.8.5-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:72a860c215e26192379f57cae5ab12b168b75db8271f111019509a1196dfc780"},
|
||||
{file = "aiohttp-3.8.5-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cc14be025665dba6202b6a71cfcdb53210cc498e50068bc088076624471f8bb9"},
|
||||
{file = "aiohttp-3.8.5-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8af740fc2711ad85f1a5c034a435782fbd5b5f8314c9a3ef071424a8158d7f6b"},
|
||||
{file = "aiohttp-3.8.5-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:841cd8233cbd2111a0ef0a522ce016357c5e3aff8a8ce92bcfa14cef890d698f"},
|
||||
{file = "aiohttp-3.8.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5ed1c46fb119f1b59304b5ec89f834f07124cd23ae5b74288e364477641060ff"},
|
||||
{file = "aiohttp-3.8.5-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:84f8ae3e09a34f35c18fa57f015cc394bd1389bce02503fb30c394d04ee6b938"},
|
||||
{file = "aiohttp-3.8.5-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:62360cb771707cb70a6fd114b9871d20d7dd2163a0feafe43fd115cfe4fe845e"},
|
||||
{file = "aiohttp-3.8.5-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:23fb25a9f0a1ca1f24c0a371523546366bb642397c94ab45ad3aedf2941cec6a"},
|
||||
{file = "aiohttp-3.8.5-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:b0ba0d15164eae3d878260d4c4df859bbdc6466e9e6689c344a13334f988bb53"},
|
||||
{file = "aiohttp-3.8.5-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:5d20003b635fc6ae3f96d7260281dfaf1894fc3aa24d1888a9b2628e97c241e5"},
|
||||
{file = "aiohttp-3.8.5-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:0175d745d9e85c40dcc51c8f88c74bfbaef9e7afeeeb9d03c37977270303064c"},
|
||||
{file = "aiohttp-3.8.5-cp311-cp311-win32.whl", hash = "sha256:2e1b1e51b0774408f091d268648e3d57f7260c1682e7d3a63cb00d22d71bb945"},
|
||||
{file = "aiohttp-3.8.5-cp311-cp311-win_amd64.whl", hash = "sha256:043d2299f6dfdc92f0ac5e995dfc56668e1587cea7f9aa9d8a78a1b6554e5755"},
|
||||
{file = "aiohttp-3.8.5-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:cae533195e8122584ec87531d6df000ad07737eaa3c81209e85c928854d2195c"},
|
||||
{file = "aiohttp-3.8.5-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4f21e83f355643c345177a5d1d8079f9f28b5133bcd154193b799d380331d5d3"},
|
||||
{file = "aiohttp-3.8.5-cp36-cp36m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a7a75ef35f2df54ad55dbf4b73fe1da96f370e51b10c91f08b19603c64004acc"},
|
||||
{file = "aiohttp-3.8.5-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2e2e9839e14dd5308ee773c97115f1e0a1cb1d75cbeeee9f33824fa5144c7634"},
|
||||
{file = "aiohttp-3.8.5-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c44e65da1de4403d0576473e2344828ef9c4c6244d65cf4b75549bb46d40b8dd"},
|
||||
{file = "aiohttp-3.8.5-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:78d847e4cde6ecc19125ccbc9bfac4a7ab37c234dd88fbb3c5c524e8e14da543"},
|
||||
{file = "aiohttp-3.8.5-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:c7a815258e5895d8900aec4454f38dca9aed71085f227537208057853f9d13f2"},
|
||||
{file = "aiohttp-3.8.5-cp36-cp36m-musllinux_1_1_i686.whl", hash = "sha256:8b929b9bd7cd7c3939f8bcfffa92fae7480bd1aa425279d51a89327d600c704d"},
|
||||
{file = "aiohttp-3.8.5-cp36-cp36m-musllinux_1_1_ppc64le.whl", hash = "sha256:5db3a5b833764280ed7618393832e0853e40f3d3e9aa128ac0ba0f8278d08649"},
|
||||
{file = "aiohttp-3.8.5-cp36-cp36m-musllinux_1_1_s390x.whl", hash = "sha256:a0215ce6041d501f3155dc219712bc41252d0ab76474615b9700d63d4d9292af"},
|
||||
{file = "aiohttp-3.8.5-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:fd1ed388ea7fbed22c4968dd64bab0198de60750a25fe8c0c9d4bef5abe13824"},
|
||||
{file = "aiohttp-3.8.5-cp36-cp36m-win32.whl", hash = "sha256:6e6783bcc45f397fdebc118d772103d751b54cddf5b60fbcc958382d7dd64f3e"},
|
||||
{file = "aiohttp-3.8.5-cp36-cp36m-win_amd64.whl", hash = "sha256:b5411d82cddd212644cf9360879eb5080f0d5f7d809d03262c50dad02f01421a"},
|
||||
{file = "aiohttp-3.8.5-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:01d4c0c874aa4ddfb8098e85d10b5e875a70adc63db91f1ae65a4b04d3344cda"},
|
||||
{file = "aiohttp-3.8.5-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e5980a746d547a6ba173fd5ee85ce9077e72d118758db05d229044b469d9029a"},
|
||||
{file = "aiohttp-3.8.5-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2a482e6da906d5e6e653be079b29bc173a48e381600161c9932d89dfae5942ef"},
|
||||
{file = "aiohttp-3.8.5-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:80bd372b8d0715c66c974cf57fe363621a02f359f1ec81cba97366948c7fc873"},
|
||||
{file = "aiohttp-3.8.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c1161b345c0a444ebcf46bf0a740ba5dcf50612fd3d0528883fdc0eff578006a"},
|
||||
{file = "aiohttp-3.8.5-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cd56db019015b6acfaaf92e1ac40eb8434847d9bf88b4be4efe5bfd260aee692"},
|
||||
{file = "aiohttp-3.8.5-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:153c2549f6c004d2754cc60603d4668899c9895b8a89397444a9c4efa282aaf4"},
|
||||
{file = "aiohttp-3.8.5-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:4a01951fabc4ce26ab791da5f3f24dca6d9a6f24121746eb19756416ff2d881b"},
|
||||
{file = "aiohttp-3.8.5-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:bfb9162dcf01f615462b995a516ba03e769de0789de1cadc0f916265c257e5d8"},
|
||||
{file = "aiohttp-3.8.5-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:7dde0009408969a43b04c16cbbe252c4f5ef4574ac226bc8815cd7342d2028b6"},
|
||||
{file = "aiohttp-3.8.5-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:4149d34c32f9638f38f544b3977a4c24052042affa895352d3636fa8bffd030a"},
|
||||
{file = "aiohttp-3.8.5-cp37-cp37m-win32.whl", hash = "sha256:68c5a82c8779bdfc6367c967a4a1b2aa52cd3595388bf5961a62158ee8a59e22"},
|
||||
{file = "aiohttp-3.8.5-cp37-cp37m-win_amd64.whl", hash = "sha256:2cf57fb50be5f52bda004b8893e63b48530ed9f0d6c96c84620dc92fe3cd9b9d"},
|
||||
{file = "aiohttp-3.8.5-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:eca4bf3734c541dc4f374ad6010a68ff6c6748f00451707f39857f429ca36ced"},
|
||||
{file = "aiohttp-3.8.5-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:1274477e4c71ce8cfe6c1ec2f806d57c015ebf84d83373676036e256bc55d690"},
|
||||
{file = "aiohttp-3.8.5-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:28c543e54710d6158fc6f439296c7865b29e0b616629767e685a7185fab4a6b9"},
|
||||
{file = "aiohttp-3.8.5-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:910bec0c49637d213f5d9877105d26e0c4a4de2f8b1b29405ff37e9fc0ad52b8"},
|
||||
{file = "aiohttp-3.8.5-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5443910d662db951b2e58eb70b0fbe6b6e2ae613477129a5805d0b66c54b6cb7"},
|
||||
{file = "aiohttp-3.8.5-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2e460be6978fc24e3df83193dc0cc4de46c9909ed92dd47d349a452ef49325b7"},
|
||||
{file = "aiohttp-3.8.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fb1558def481d84f03b45888473fc5a1f35747b5f334ef4e7a571bc0dfcb11f8"},
|
||||
{file = "aiohttp-3.8.5-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:34dd0c107799dcbbf7d48b53be761a013c0adf5571bf50c4ecad5643fe9cfcd0"},
|
||||
{file = "aiohttp-3.8.5-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:aa1990247f02a54185dc0dff92a6904521172a22664c863a03ff64c42f9b5410"},
|
||||
{file = "aiohttp-3.8.5-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:0e584a10f204a617d71d359fe383406305a4b595b333721fa50b867b4a0a1548"},
|
||||
{file = "aiohttp-3.8.5-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:a3cf433f127efa43fee6b90ea4c6edf6c4a17109d1d037d1a52abec84d8f2e42"},
|
||||
{file = "aiohttp-3.8.5-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:c11f5b099adafb18e65c2c997d57108b5bbeaa9eeee64a84302c0978b1ec948b"},
|
||||
{file = "aiohttp-3.8.5-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:84de26ddf621d7ac4c975dbea4c945860e08cccde492269db4e1538a6a6f3c35"},
|
||||
{file = "aiohttp-3.8.5-cp38-cp38-win32.whl", hash = "sha256:ab88bafedc57dd0aab55fa728ea10c1911f7e4d8b43e1d838a1739f33712921c"},
|
||||
{file = "aiohttp-3.8.5-cp38-cp38-win_amd64.whl", hash = "sha256:5798a9aad1879f626589f3df0f8b79b3608a92e9beab10e5fda02c8a2c60db2e"},
|
||||
{file = "aiohttp-3.8.5-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:a6ce61195c6a19c785df04e71a4537e29eaa2c50fe745b732aa937c0c77169f3"},
|
||||
{file = "aiohttp-3.8.5-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:773dd01706d4db536335fcfae6ea2440a70ceb03dd3e7378f3e815b03c97ab51"},
|
||||
{file = "aiohttp-3.8.5-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:f83a552443a526ea38d064588613aca983d0ee0038801bc93c0c916428310c28"},
|
||||
{file = "aiohttp-3.8.5-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1f7372f7341fcc16f57b2caded43e81ddd18df53320b6f9f042acad41f8e049a"},
|
||||
{file = "aiohttp-3.8.5-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ea353162f249c8097ea63c2169dd1aa55de1e8fecbe63412a9bc50816e87b761"},
|
||||
{file = "aiohttp-3.8.5-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e5d47ae48db0b2dcf70bc8a3bc72b3de86e2a590fc299fdbbb15af320d2659de"},
|
||||
{file = "aiohttp-3.8.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d827176898a2b0b09694fbd1088c7a31836d1a505c243811c87ae53a3f6273c1"},
|
||||
{file = "aiohttp-3.8.5-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3562b06567c06439d8b447037bb655ef69786c590b1de86c7ab81efe1c9c15d8"},
|
||||
{file = "aiohttp-3.8.5-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:4e874cbf8caf8959d2adf572a78bba17cb0e9d7e51bb83d86a3697b686a0ab4d"},
|
||||
{file = "aiohttp-3.8.5-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:6809a00deaf3810e38c628e9a33271892f815b853605a936e2e9e5129762356c"},
|
||||
{file = "aiohttp-3.8.5-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:33776e945d89b29251b33a7e7d006ce86447b2cfd66db5e5ded4e5cd0340585c"},
|
||||
{file = "aiohttp-3.8.5-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:eaeed7abfb5d64c539e2db173f63631455f1196c37d9d8d873fc316470dfbacd"},
|
||||
{file = "aiohttp-3.8.5-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:e91d635961bec2d8f19dfeb41a539eb94bd073f075ca6dae6c8dc0ee89ad6f91"},
|
||||
{file = "aiohttp-3.8.5-cp39-cp39-win32.whl", hash = "sha256:00ad4b6f185ec67f3e6562e8a1d2b69660be43070bd0ef6fcec5211154c7df67"},
|
||||
{file = "aiohttp-3.8.5-cp39-cp39-win_amd64.whl", hash = "sha256:c0a9034379a37ae42dea7ac1e048352d96286626251862e448933c0f59cbd79c"},
|
||||
{file = "aiohttp-3.8.5.tar.gz", hash = "sha256:b9552ec52cc147dbf1944ac7ac98af7602e51ea2dcd076ed194ca3c0d1c7d0bc"},
|
||||
]

[package.dependencies]
async-timeout = ">=3.0,<4.0"
aiosignal = ">=1.1.2"
async-timeout = ">=4.0.0a3,<5.0"
attrs = ">=17.3.0"
chardet = ">=2.0,<4.0"
charset-normalizer = ">=2.0,<4.0"
frozenlist = ">=1.1.1"
multidict = ">=4.5,<7.0"
typing-extensions = ">=3.6.5"
yarl = ">=1.0,<2.0"

[package.extras]
speedups = ["aiodns", "brotlipy", "cchardet"]
speedups = ["Brotli", "aiodns", "cchardet"]

[[package]]
|
||||
name = "aiopg"
|
||||
@@ -75,6 +126,20 @@ psycopg2-binary = ">=2.8.4"
|
||||
[package.extras]
|
||||
sa = ["sqlalchemy[postgresql-psycopg2binary] (>=1.3,<1.5)"]
|
||||
|
||||
[[package]]
|
||||
name = "aiosignal"
|
||||
version = "1.3.1"
|
||||
description = "aiosignal: a list of registered asynchronous callbacks"
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
files = [
|
||||
{file = "aiosignal-1.3.1-py3-none-any.whl", hash = "sha256:f8376fb07dd1e86a584e4fcdec80b36b7f81aac666ebc724e2c090300dd83b17"},
|
||||
{file = "aiosignal-1.3.1.tar.gz", hash = "sha256:54cd96e15e1649b75d6c87526a6ff0b6c1b0dd3459f43d9ca11d48c339b68cfc"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
frozenlist = ">=1.1.0"
|
||||
|
||||
[[package]]
|
||||
name = "allure-pytest"
|
||||
version = "2.13.2"
|
||||
@@ -107,13 +172,13 @@ pluggy = ">=0.4.0"

[[package]]
name = "async-timeout"
version = "3.0.1"
version = "4.0.2"
description = "Timeout context manager for asyncio programs"
optional = false
python-versions = ">=3.5.3"
python-versions = ">=3.6"
files = [
{file = "async-timeout-3.0.1.tar.gz", hash = "sha256:0c3c816a028d47f659d6ff5c745cb2acf1f966da1fe5c19c77a70282b25f4c5f"},
{file = "async_timeout-3.0.1-py3-none-any.whl", hash = "sha256:4291ca197d287d274d0b6cb5d6f8f8f82d434ed288f962539ff18cc9012f9ea3"},
{file = "async-timeout-4.0.2.tar.gz", hash = "sha256:2163e1640ddb52b7a8c80d0a67a08587e5d245cc9c553a74a847056bc2976b15"},
{file = "async_timeout-4.0.2-py3-none-any.whl", hash = "sha256:8ca1e4fcf50d07413d66d1a5e416e42cfdf5851c981d679a09851a6853383b3c"},
]

[[package]]
@@ -675,13 +740,13 @@ typing-extensions = ">=4.1.0"

[[package]]
name = "certifi"
version = "2022.12.7"
version = "2023.7.22"
description = "Python package for providing Mozilla's CA Bundle."
optional = false
python-versions = ">=3.6"
files = [
{file = "certifi-2022.12.7-py3-none-any.whl", hash = "sha256:4ad3232f5e926d6718ec31cfc1fcadfde020920e278684144551c91769c7bc18"},
{file = "certifi-2022.12.7.tar.gz", hash = "sha256:35824b4c3a97115964b408844d64aa14db1cc518f6562e8d7261699d1350a9e3"},
{file = "certifi-2023.7.22-py3-none-any.whl", hash = "sha256:92d6037539857d8206b8f6ae472e8b77db8058fec5937a1ef3f54304089edbb9"},
{file = "certifi-2023.7.22.tar.gz", hash = "sha256:539cc1d13202e33ca466e88b2807e29f4c13049d6d87031a3c110744495cb082"},
]

[[package]]
@@ -781,17 +846,6 @@ networkx = ">=2.4,<3.0"
|
||||
pyyaml = ">5.4"
|
||||
sarif-om = ">=1.0.4,<1.1.0"
|
||||
|
||||
[[package]]
|
||||
name = "chardet"
|
||||
version = "3.0.4"
|
||||
description = "Universal encoding detector for Python 2 and 3"
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
files = [
|
||||
{file = "chardet-3.0.4-py2.py3-none-any.whl", hash = "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691"},
|
||||
{file = "chardet-3.0.4.tar.gz", hash = "sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "charset-normalizer"
|
||||
version = "2.1.0"
|
||||
@@ -980,6 +1034,76 @@ files = [
|
||||
Flask = ">=0.9"
|
||||
Six = "*"
|
||||
|
||||
[[package]]
|
||||
name = "frozenlist"
|
||||
version = "1.4.0"
|
||||
description = "A list-like structure which implements collections.abc.MutableSequence"
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
files = [
|
||||
{file = "frozenlist-1.4.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:764226ceef3125e53ea2cb275000e309c0aa5464d43bd72abd661e27fffc26ab"},
|
||||
{file = "frozenlist-1.4.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d6484756b12f40003c6128bfcc3fa9f0d49a687e171186c2d85ec82e3758c559"},
|
||||
{file = "frozenlist-1.4.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9ac08e601308e41eb533f232dbf6b7e4cea762f9f84f6357136eed926c15d12c"},
|
||||
{file = "frozenlist-1.4.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d081f13b095d74b67d550de04df1c756831f3b83dc9881c38985834387487f1b"},
|
||||
{file = "frozenlist-1.4.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:71932b597f9895f011f47f17d6428252fc728ba2ae6024e13c3398a087c2cdea"},
|
||||
{file = "frozenlist-1.4.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:981b9ab5a0a3178ff413bca62526bb784249421c24ad7381e39d67981be2c326"},
|
||||
{file = "frozenlist-1.4.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e41f3de4df3e80de75845d3e743b3f1c4c8613c3997a912dbf0229fc61a8b963"},
|
||||
{file = "frozenlist-1.4.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6918d49b1f90821e93069682c06ffde41829c346c66b721e65a5c62b4bab0300"},
|
||||
{file = "frozenlist-1.4.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:0e5c8764c7829343d919cc2dfc587a8db01c4f70a4ebbc49abde5d4b158b007b"},
|
||||
{file = "frozenlist-1.4.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:8d0edd6b1c7fb94922bf569c9b092ee187a83f03fb1a63076e7774b60f9481a8"},
|
||||
{file = "frozenlist-1.4.0-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:e29cda763f752553fa14c68fb2195150bfab22b352572cb36c43c47bedba70eb"},
|
||||
{file = "frozenlist-1.4.0-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:0c7c1b47859ee2cac3846fde1c1dc0f15da6cec5a0e5c72d101e0f83dcb67ff9"},
|
||||
{file = "frozenlist-1.4.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:901289d524fdd571be1c7be054f48b1f88ce8dddcbdf1ec698b27d4b8b9e5d62"},
|
||||
{file = "frozenlist-1.4.0-cp310-cp310-win32.whl", hash = "sha256:1a0848b52815006ea6596c395f87449f693dc419061cc21e970f139d466dc0a0"},
|
||||
{file = "frozenlist-1.4.0-cp310-cp310-win_amd64.whl", hash = "sha256:b206646d176a007466358aa21d85cd8600a415c67c9bd15403336c331a10d956"},
|
||||
{file = "frozenlist-1.4.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:de343e75f40e972bae1ef6090267f8260c1446a1695e77096db6cfa25e759a95"},
|
||||
{file = "frozenlist-1.4.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ad2a9eb6d9839ae241701d0918f54c51365a51407fd80f6b8289e2dfca977cc3"},
|
||||
{file = "frozenlist-1.4.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:bd7bd3b3830247580de99c99ea2a01416dfc3c34471ca1298bccabf86d0ff4dc"},
|
||||
{file = "frozenlist-1.4.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bdf1847068c362f16b353163391210269e4f0569a3c166bc6a9f74ccbfc7e839"},
|
||||
{file = "frozenlist-1.4.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:38461d02d66de17455072c9ba981d35f1d2a73024bee7790ac2f9e361ef1cd0c"},
|
||||
{file = "frozenlist-1.4.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d5a32087d720c608f42caed0ef36d2b3ea61a9d09ee59a5142d6070da9041b8f"},
|
||||
{file = "frozenlist-1.4.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:dd65632acaf0d47608190a71bfe46b209719bf2beb59507db08ccdbe712f969b"},
|
||||
{file = "frozenlist-1.4.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:261b9f5d17cac914531331ff1b1d452125bf5daa05faf73b71d935485b0c510b"},
|
||||
{file = "frozenlist-1.4.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:b89ac9768b82205936771f8d2eb3ce88503b1556324c9f903e7156669f521472"},
|
||||
{file = "frozenlist-1.4.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:008eb8b31b3ea6896da16c38c1b136cb9fec9e249e77f6211d479db79a4eaf01"},
|
||||
{file = "frozenlist-1.4.0-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:e74b0506fa5aa5598ac6a975a12aa8928cbb58e1f5ac8360792ef15de1aa848f"},
|
||||
{file = "frozenlist-1.4.0-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:490132667476f6781b4c9458298b0c1cddf237488abd228b0b3650e5ecba7467"},
|
||||
{file = "frozenlist-1.4.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:76d4711f6f6d08551a7e9ef28c722f4a50dd0fc204c56b4bcd95c6cc05ce6fbb"},
|
||||
{file = "frozenlist-1.4.0-cp311-cp311-win32.whl", hash = "sha256:a02eb8ab2b8f200179b5f62b59757685ae9987996ae549ccf30f983f40602431"},
|
||||
{file = "frozenlist-1.4.0-cp311-cp311-win_amd64.whl", hash = "sha256:515e1abc578dd3b275d6a5114030b1330ba044ffba03f94091842852f806f1c1"},
|
||||
{file = "frozenlist-1.4.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:f0ed05f5079c708fe74bf9027e95125334b6978bf07fd5ab923e9e55e5fbb9d3"},
|
||||
{file = "frozenlist-1.4.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:ca265542ca427bf97aed183c1676e2a9c66942e822b14dc6e5f42e038f92a503"},
|
||||
{file = "frozenlist-1.4.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:491e014f5c43656da08958808588cc6c016847b4360e327a62cb308c791bd2d9"},
|
||||
{file = "frozenlist-1.4.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:17ae5cd0f333f94f2e03aaf140bb762c64783935cc764ff9c82dff626089bebf"},
|
||||
{file = "frozenlist-1.4.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1e78fb68cf9c1a6aa4a9a12e960a5c9dfbdb89b3695197aa7064705662515de2"},
|
||||
{file = "frozenlist-1.4.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d5655a942f5f5d2c9ed93d72148226d75369b4f6952680211972a33e59b1dfdc"},
|
||||
{file = "frozenlist-1.4.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c11b0746f5d946fecf750428a95f3e9ebe792c1ee3b1e96eeba145dc631a9672"},
|
||||
{file = "frozenlist-1.4.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e66d2a64d44d50d2543405fb183a21f76b3b5fd16f130f5c99187c3fb4e64919"},
|
||||
{file = "frozenlist-1.4.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:88f7bc0fcca81f985f78dd0fa68d2c75abf8272b1f5c323ea4a01a4d7a614efc"},
|
||||
{file = "frozenlist-1.4.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:5833593c25ac59ede40ed4de6d67eb42928cca97f26feea219f21d0ed0959b79"},
|
||||
{file = "frozenlist-1.4.0-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:fec520865f42e5c7f050c2a79038897b1c7d1595e907a9e08e3353293ffc948e"},
|
||||
{file = "frozenlist-1.4.0-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:b826d97e4276750beca7c8f0f1a4938892697a6bcd8ec8217b3312dad6982781"},
|
||||
{file = "frozenlist-1.4.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:ceb6ec0a10c65540421e20ebd29083c50e6d1143278746a4ef6bcf6153171eb8"},
|
||||
{file = "frozenlist-1.4.0-cp38-cp38-win32.whl", hash = "sha256:2b8bcf994563466db019fab287ff390fffbfdb4f905fc77bc1c1d604b1c689cc"},
|
||||
{file = "frozenlist-1.4.0-cp38-cp38-win_amd64.whl", hash = "sha256:a6c8097e01886188e5be3e6b14e94ab365f384736aa1fca6a0b9e35bd4a30bc7"},
|
||||
{file = "frozenlist-1.4.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:6c38721585f285203e4b4132a352eb3daa19121a035f3182e08e437cface44bf"},
|
||||
{file = "frozenlist-1.4.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:a0c6da9aee33ff0b1a451e867da0c1f47408112b3391dd43133838339e410963"},
|
||||
{file = "frozenlist-1.4.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:93ea75c050c5bb3d98016b4ba2497851eadf0ac154d88a67d7a6816206f6fa7f"},
|
||||
{file = "frozenlist-1.4.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f61e2dc5ad442c52b4887f1fdc112f97caeff4d9e6ebe78879364ac59f1663e1"},
|
||||
{file = "frozenlist-1.4.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:aa384489fefeb62321b238e64c07ef48398fe80f9e1e6afeff22e140e0850eef"},
|
||||
{file = "frozenlist-1.4.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:10ff5faaa22786315ef57097a279b833ecab1a0bfb07d604c9cbb1c4cdc2ed87"},
|
||||
{file = "frozenlist-1.4.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:007df07a6e3eb3e33e9a1fe6a9db7af152bbd8a185f9aaa6ece10a3529e3e1c6"},
|
||||
{file = "frozenlist-1.4.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7f4f399d28478d1f604c2ff9119907af9726aed73680e5ed1ca634d377abb087"},
|
||||
{file = "frozenlist-1.4.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:c5374b80521d3d3f2ec5572e05adc94601985cc526fb276d0c8574a6d749f1b3"},
|
||||
{file = "frozenlist-1.4.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:ce31ae3e19f3c902de379cf1323d90c649425b86de7bbdf82871b8a2a0615f3d"},
|
||||
{file = "frozenlist-1.4.0-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:7211ef110a9194b6042449431e08c4d80c0481e5891e58d429df5899690511c2"},
|
||||
{file = "frozenlist-1.4.0-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:556de4430ce324c836789fa4560ca62d1591d2538b8ceb0b4f68fb7b2384a27a"},
|
||||
{file = "frozenlist-1.4.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:7645a8e814a3ee34a89c4a372011dcd817964ce8cb273c8ed6119d706e9613e3"},
|
||||
{file = "frozenlist-1.4.0-cp39-cp39-win32.whl", hash = "sha256:19488c57c12d4e8095a922f328df3f179c820c212940a498623ed39160bc3c2f"},
|
||||
{file = "frozenlist-1.4.0-cp39-cp39-win_amd64.whl", hash = "sha256:6221d84d463fb110bdd7619b69cb43878a11d51cbb9394ae3105d082d5199167"},
|
||||
{file = "frozenlist-1.4.0.tar.gz", hash = "sha256:09163bdf0b2907454042edb19f887c6d33806adc71fbd54afc14908bfdc22251"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "graphql-core"
|
||||
version = "3.2.1"
|
||||
@@ -1868,6 +1992,20 @@ files = [
packaging = ">=17.1"
pytest = ">=5.3"

[[package]]
name = "pytest-split"
version = "0.8.1"
description = "Pytest plugin which splits the test suite to equally sized sub suites based on test execution time."
optional = false
python-versions = ">=3.7.1,<4.0"
files = [
{file = "pytest_split-0.8.1-py3-none-any.whl", hash = "sha256:74b110ea091bd147cc1c5f9665a59506e5cedfa66f96a89fb03e4ab447c2c168"},
{file = "pytest_split-0.8.1.tar.gz", hash = "sha256:2d88bd3dc528689a7a3f58fc12ea165c3aa62e90795e420dfad920afe5612d6d"},
]

[package.dependencies]
pytest = ">=5,<8"

[[package]]
name = "pytest-timeout"
version = "2.1.0"
@@ -2513,4 +2651,4 @@ testing = ["func-timeout", "jaraco.itertools", "pytest (>=6)", "pytest-black (>=
[metadata]
lock-version = "2.0"
python-versions = "^3.9"
content-hash = "fe771b153ef7e308d6d04421d0eb3f97d00780882277d2b4fc1f296054d8db79"
content-hash = "c40f62277e788011920f4edb6f7392046ee440f792a104c903097415def9a916"

@@ -1,8 +1,11 @@
use std::ops::ControlFlow;

use super::AuthSuccess;
use crate::{
auth::{self, AuthFlow, ClientCredentials},
compute,
console::{self, AuthInfo, CachedNodeInfo, ConsoleReqExtra},
proxy::{try_wake, NUM_RETRIES_CONNECT},
sasl, scram,
stream::PqStream,
};
@@ -48,7 +51,15 @@ pub(super) async fn authenticate(
}
};

let mut node = api.wake_compute(extra, creds).await?;
let mut num_retries = 0;
let mut node = loop {
num_retries += 1;
match try_wake(api, extra, creds).await? {
ControlFlow::Break(n) => break n,
ControlFlow::Continue(_) if num_retries < NUM_RETRIES_CONNECT => continue,
ControlFlow::Continue(e) => return Err(e.into()),
}
};
if let Some(keys) = scram_keys {
use tokio_postgres::config::AuthKeys;
node.config.auth_keys(AuthKeys::ScramSha256(keys));

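The hunk above replaces a single `wake_compute` call with a bounded retry loop driven by `std::ops::ControlFlow`. Below is a minimal, standalone sketch of that pattern; the names `try_once` and `MAX_RETRIES` are illustrative stand-ins, not part of the proxy code.

```rust
use std::ops::ControlFlow;

const MAX_RETRIES: u32 = 10; // illustrative bound, in the spirit of NUM_RETRIES_CONNECT

// Stand-in for `try_wake`: Break(value) on success, Continue(err) on a retryable failure.
fn try_once(attempt: u32) -> Result<ControlFlow<&'static str, &'static str>, &'static str> {
    if attempt < 3 {
        Ok(ControlFlow::Continue("compute not ready yet"))
    } else {
        Ok(ControlFlow::Break("connection info"))
    }
}

fn main() -> Result<(), &'static str> {
    let mut num_retries = 0;
    let node = loop {
        num_retries += 1;
        match try_once(num_retries)? {
            ControlFlow::Break(n) => break n,
            // Retryable failure and budget left: go around again.
            ControlFlow::Continue(_) if num_retries < MAX_RETRIES => continue,
            // Retryable failure but budget exhausted: surface the error.
            ControlFlow::Continue(e) => return Err(e),
        }
    };
    println!("woke compute after {num_retries} attempts: {node}");
    Ok(())
}
```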
@@ -48,6 +48,14 @@ impl ClientCredentials<'_> {
}

impl<'a> ClientCredentials<'a> {
#[cfg(test)]
pub fn new_noop() -> Self {
ClientCredentials {
user: "",
project: None,
}
}

pub fn parse(
params: &'a StartupMessageParams,
sni: Option<&str>,

@@ -262,24 +262,21 @@ pub mod timed_lru {
token: Option<(C, C::LookupInfo<C::Key>)>,

/// The value itself.
pub value: C::Value,
value: C::Value,
}

impl<C: Cache> Cached<C> {
/// Place any entry into this wrapper; invalidation will be a no-op.
/// Unfortunately, rust doesn't let us implement [`From`] or [`Into`].
pub fn new_uncached(value: impl Into<C::Value>) -> Self {
Self {
token: None,
value: value.into(),
}
pub fn new_uncached(value: C::Value) -> Self {
Self { token: None, value }
}

/// Drop this entry from a cache if it's still there.
pub fn invalidate(&self) {
pub fn invalidate(self) -> C::Value {
if let Some((cache, info)) = &self.token {
cache.invalidate(info);
}
self.value
}

/// Tell if this entry is actually cached.

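For context on the `invalidate(&self)` to `invalidate(self) -> C::Value` change above: consuming the wrapper removes the entry from the cache while handing the value back, so the caller can still use the (now uncached) connection config for one last attempt. A rough sketch of the idea with a toy cache; `ToyCache` and the string values are hypothetical and stand in for the crate's `timed_lru` types.

```rust
use std::collections::HashMap;
use std::sync::Mutex;

// Hypothetical stand-in for the real cache.
struct ToyCache {
    map: Mutex<HashMap<String, String>>,
}

struct Cached<'a> {
    token: Option<(&'a ToyCache, String)>, // cache handle + lookup key; None for uncached entries
    value: String,
}

impl<'a> Cached<'a> {
    fn new_uncached(value: String) -> Self {
        Self { token: None, value }
    }

    // Consume the wrapper: drop the entry from the cache (if any) and return the value.
    fn invalidate(self) -> String {
        if let Some((cache, key)) = &self.token {
            cache.map.lock().unwrap().remove(key);
        }
        self.value
    }
}

fn main() {
    let cache = ToyCache { map: Mutex::new(HashMap::new()) };
    cache.map.lock().unwrap().insert("node".into(), "10.0.0.1:5432".into());

    let entry = Cached {
        token: Some((&cache, "node".into())),
        value: "10.0.0.1:5432".into(),
    };
    // The stale entry is gone, but its value is still usable for one more attempt.
    let addr = entry.invalidate();
    assert!(cache.map.lock().unwrap().get("node").is_none());
    println!("retrying once against {addr}");

    let _fallback = Cached::new_uncached("127.0.0.1:5432".into());
}
```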
@@ -1,4 +1,9 @@
use crate::{auth::parse_endpoint_param, cancellation::CancelClosure, error::UserFacingError};
use crate::{
auth::parse_endpoint_param,
cancellation::CancelClosure,
console::errors::WakeComputeError,
error::{io_error, UserFacingError},
};
use futures::{FutureExt, TryFutureExt};
use itertools::Itertools;
use pq_proto::StartupMessageParams;
@@ -24,6 +29,12 @@ pub enum ConnectionError {
TlsError(#[from] native_tls::Error),
}

impl From<WakeComputeError> for ConnectionError {
fn from(value: WakeComputeError) -> Self {
io_error(value).into()
}
}

impl UserFacingError for ConnectionError {
fn to_string_client(&self) -> String {
use ConnectionError::*;

@@ -14,6 +14,7 @@ pub mod errors {
use crate::{
error::{io_error, UserFacingError},
http,
proxy::ShouldRetry,
};
use thiserror::Error;

@@ -72,6 +73,24 @@
}
}

impl ShouldRetry for ApiError {
fn could_retry(&self) -> bool {
match self {
// retry some transport errors
Self::Transport(io) => io.could_retry(),
// retry some temporary failures because the compute was in a bad state
// (bad request can be returned when the endpoint was in transition)
Self::Console {
status: http::StatusCode::BAD_REQUEST | http::StatusCode::LOCKED,
..
} => true,
// retry server errors
Self::Console { status, .. } if status.is_server_error() => true,
_ => false,
}
}
}

impl From<reqwest::Error> for ApiError {
fn from(e: reqwest::Error) -> Self {
io_error(e).into()
@@ -186,14 +205,14 @@ pub trait Api {
async fn get_auth_info(
&self,
extra: &ConsoleReqExtra<'_>,
creds: &ClientCredentials<'_>,
creds: &ClientCredentials,
) -> Result<Option<AuthInfo>, errors::GetAuthInfoError>;

/// Wake up the compute node and return the corresponding connection info.
async fn wake_compute(
&self,
extra: &ConsoleReqExtra<'_>,
creds: &ClientCredentials<'_>,
creds: &ClientCredentials,
) -> Result<CachedNodeInfo, errors::WakeComputeError>;
}

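The `ShouldRetry` impl above classifies console API errors as transient or fatal; callers combine that predicate with an attempt counter. A minimal self-contained sketch of that combination follows; the trait is re-declared locally and the enum, status codes, and `MAX_RETRIES` are illustrative, not the crate's actual types.

```rust
// Re-declared locally so the example compiles on its own.
trait ShouldRetry {
    fn could_retry(&self) -> bool;
}

#[derive(Debug)]
enum ApiError {
    Transport,               // e.g. a timed-out HTTP request
    Console { status: u16 }, // status code returned by the control plane
}

impl ShouldRetry for ApiError {
    fn could_retry(&self) -> bool {
        match self {
            ApiError::Transport => true,
            // 400/423 can show up while an endpoint is transitioning; 5xx is a server fault.
            ApiError::Console { status: 400 | 423 } => true,
            ApiError::Console { status } if (500..600).contains(status) => true,
            _ => false,
        }
    }
}

const MAX_RETRIES: u32 = 10;

fn should_retry(err: &ApiError, num_retries: u32) -> bool {
    // Retry everything once, then only errors the classifier marks as transient.
    num_retries == 0 || (num_retries < MAX_RETRIES && err.could_retry())
}

fn main() {
    assert!(should_retry(&ApiError::Transport, 5));
    assert!(should_retry(&ApiError::Console { status: 503 }, 3));
    assert!(!should_retry(&ApiError::Console { status: 401 }, 1));
}
```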
@@ -106,7 +106,7 @@ impl super::Api for Api {
|
||||
async fn get_auth_info(
|
||||
&self,
|
||||
_extra: &ConsoleReqExtra<'_>,
|
||||
creds: &ClientCredentials<'_>,
|
||||
creds: &ClientCredentials,
|
||||
) -> Result<Option<AuthInfo>, GetAuthInfoError> {
|
||||
self.do_get_auth_info(creds).await
|
||||
}
|
||||
@@ -115,7 +115,7 @@ impl super::Api for Api {
|
||||
async fn wake_compute(
|
||||
&self,
|
||||
_extra: &ConsoleReqExtra<'_>,
|
||||
_creds: &ClientCredentials<'_>,
|
||||
_creds: &ClientCredentials,
|
||||
) -> Result<CachedNodeInfo, WakeComputeError> {
|
||||
self.do_wake_compute()
|
||||
.map_ok(CachedNodeInfo::new_uncached)
|
||||
|
||||
@@ -123,7 +123,7 @@ impl super::Api for Api {
|
||||
async fn get_auth_info(
|
||||
&self,
|
||||
extra: &ConsoleReqExtra<'_>,
|
||||
creds: &ClientCredentials<'_>,
|
||||
creds: &ClientCredentials,
|
||||
) -> Result<Option<AuthInfo>, GetAuthInfoError> {
|
||||
self.do_get_auth_info(extra, creds).await
|
||||
}
|
||||
@@ -132,7 +132,7 @@ impl super::Api for Api {
|
||||
async fn wake_compute(
|
||||
&self,
|
||||
extra: &ConsoleReqExtra<'_>,
|
||||
creds: &ClientCredentials<'_>,
|
||||
creds: &ClientCredentials,
|
||||
) -> Result<CachedNodeInfo, WakeComputeError> {
|
||||
let key = creds.project().expect("impossible");
|
||||
|
||||
|
||||
@@ -1,19 +1,17 @@
use anyhow::Context;
use async_trait::async_trait;
use parking_lot::Mutex;
use pq_proto::StartupMessageParams;
use std::fmt;
use std::ops::ControlFlow;
use std::{collections::HashMap, sync::Arc};
use tokio::time;

use crate::config;
use crate::{auth, console};
use crate::{compute, config};

use super::sql_over_http::MAX_RESPONSE_SIZE;

use crate::proxy::{
can_retry_tokio_postgres_error, invalidate_cache, retry_after, try_wake,
NUM_RETRIES_WAKE_COMPUTE,
};
use crate::proxy::ConnectMechanism;

use tracing::error;
use tracing::info;
@@ -187,6 +185,27 @@ impl GlobalConnPool {
}
}

struct TokioMechanism<'a> {
conn_info: &'a ConnInfo,
}

#[async_trait]
impl ConnectMechanism for TokioMechanism<'_> {
type Connection = tokio_postgres::Client;
type ConnectError = tokio_postgres::Error;
type Error = anyhow::Error;

async fn connect_once(
&self,
node_info: &console::CachedNodeInfo,
timeout: time::Duration,
) -> Result<Self::Connection, Self::ConnectError> {
connect_to_compute_once(node_info, self.conn_info, timeout).await
}

fn update_connect_config(&self, _config: &mut compute::ConnCfg) {}
}

// Wake up the destination if needed. Code here is a bit involved because
// we reuse the code from the usual proxy and we need to prepare few structures
// that this code expects.

@@ -220,72 +239,18 @@ async fn connect_to_compute(
|
||||
application_name: Some(APP_NAME),
|
||||
};
|
||||
|
||||
let node_info = &mut creds.wake_compute(&extra).await?.expect("msg");
|
||||
let node_info = creds
|
||||
.wake_compute(&extra)
|
||||
.await?
|
||||
.context("missing cache entry from wake_compute")?;
|
||||
|
||||
let mut num_retries = 0;
|
||||
let mut wait_duration = time::Duration::ZERO;
|
||||
let mut should_wake_with_error = None;
|
||||
loop {
|
||||
if !wait_duration.is_zero() {
|
||||
time::sleep(wait_duration).await;
|
||||
}
|
||||
|
||||
// try wake the compute node if we have determined it's sensible to do so
|
||||
if let Some(err) = should_wake_with_error.take() {
|
||||
match try_wake(node_info, &extra, &creds).await {
|
||||
// we can't wake up the compute node
|
||||
Ok(None) => return Err(err),
|
||||
// there was an error communicating with the control plane
|
||||
Err(e) => return Err(e.into()),
|
||||
// failed to wake up but we can continue to retry
|
||||
Ok(Some(ControlFlow::Continue(()))) => {
|
||||
wait_duration = retry_after(num_retries);
|
||||
should_wake_with_error = Some(err);
|
||||
|
||||
num_retries += 1;
|
||||
info!(num_retries, "retrying wake compute");
|
||||
continue;
|
||||
}
|
||||
// successfully woke up a compute node and can break the wakeup loop
|
||||
Ok(Some(ControlFlow::Break(()))) => {}
|
||||
}
|
||||
}
|
||||
|
||||
match connect_to_compute_once(node_info, conn_info).await {
|
||||
Ok(res) => return Ok(res),
|
||||
Err(e) => {
|
||||
error!(error = ?e, "could not connect to compute node");
|
||||
if !can_retry_error(&e, num_retries) {
|
||||
return Err(e.into());
|
||||
}
|
||||
wait_duration = retry_after(num_retries);
|
||||
|
||||
// after the first connect failure,
|
||||
// we should invalidate the cache and wake up a new compute node
|
||||
if num_retries == 0 {
|
||||
invalidate_cache(node_info);
|
||||
should_wake_with_error = Some(e.into());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
num_retries += 1;
|
||||
info!(num_retries, "retrying connect");
|
||||
}
|
||||
}
|
||||
|
||||
fn can_retry_error(err: &tokio_postgres::Error, num_retries: u32) -> bool {
|
||||
match err {
|
||||
// retry all errors at least once
|
||||
_ if num_retries == 0 => true,
|
||||
_ if num_retries >= NUM_RETRIES_WAKE_COMPUTE => false,
|
||||
err => can_retry_tokio_postgres_error(err),
|
||||
}
|
||||
crate::proxy::connect_to_compute(&TokioMechanism { conn_info }, node_info, &extra, &creds).await
|
||||
}
|
||||
|
||||
async fn connect_to_compute_once(
|
||||
node_info: &console::CachedNodeInfo,
|
||||
conn_info: &ConnInfo,
|
||||
timeout: time::Duration,
|
||||
) -> Result<tokio_postgres::Client, tokio_postgres::Error> {
|
||||
let mut config = (*node_info.config).clone();
|
||||
|
||||
@@ -294,6 +259,7 @@ async fn connect_to_compute_once(
|
||||
.password(&conn_info.password)
|
||||
.dbname(&conn_info.dbname)
|
||||
.max_backend_message_size(MAX_RESPONSE_SIZE)
|
||||
.connect_timeout(timeout)
|
||||
.connect(tokio_postgres::NoTls)
|
||||
.await?;
|
||||
|
||||
|
||||
@@ -11,6 +11,7 @@ use serde_json::Map;
use serde_json::Value;
use tokio_postgres::types::Kind;
use tokio_postgres::types::Type;
use tokio_postgres::GenericClient;
use tokio_postgres::Row;
use url::Url;

@@ -23,6 +24,13 @@ struct QueryData {
params: Vec<serde_json::Value>,
}

#[derive(serde::Deserialize)]
#[serde(untagged)]
enum Payload {
Single(QueryData),
Batch(Vec<QueryData>),
}

pub const MAX_RESPONSE_SIZE: usize = 1024 * 1024; // 1 MB
const MAX_REQUEST_SIZE: u64 = 1024 * 1024; // 1 MB

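The `#[serde(untagged)]` `Payload` enum added above lets the same request body carry either a single query object or an array of them. A standalone sketch of how such an untagged enum deserializes; field names follow the `QueryData` struct in the diff, the JSON literals and the rest are illustrative only.

```rust
use serde::Deserialize;
use serde_json::Value;

#[derive(Deserialize, Debug)]
struct QueryData {
    query: String,
    params: Vec<Value>,
}

// Untagged: serde tries each variant in order until one matches the JSON shape.
#[derive(Deserialize, Debug)]
#[serde(untagged)]
enum Payload {
    Single(QueryData),
    Batch(Vec<QueryData>),
}

fn main() -> serde_json::Result<()> {
    let single = r#"{ "query": "select $1::int", "params": ["42"] }"#;
    let batch = r#"[
        { "query": "insert into t values ($1)", "params": ["a"] },
        { "query": "select count(*) from t", "params": [] }
    ]"#;

    match serde_json::from_str::<Payload>(single)? {
        Payload::Single(q) => println!("single query: {}", q.query),
        Payload::Batch(_) => unreachable!(),
    }
    match serde_json::from_str::<Payload>(batch)? {
        Payload::Batch(qs) => println!("batch of {} queries", qs.len()),
        Payload::Single(_) => unreachable!(),
    }
    Ok(())
}
```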
@@ -192,15 +200,53 @@ pub async fn handle(
// Read the query and query params from the request body
//
let body = hyper::body::to_bytes(request.into_body()).await?;
let QueryData { query, params } = serde_json::from_slice(&body)?;
let query_params = json_to_pg_text(params)?;
let payload: Payload = serde_json::from_slice(&body)?;

let mut client = conn_pool.get(&conn_info, !allow_pool).await?;

//
// Now execute the query and return the result
//
let client = conn_pool.get(&conn_info, !allow_pool).await?;
let result = match payload {
Payload::Single(query) => query_to_json(&client, query, raw_output, array_mode).await,
Payload::Batch(queries) => {
let mut results = Vec::new();
let transaction = client.transaction().await?;
for query in queries {
let result = query_to_json(&transaction, query, raw_output, array_mode).await;
match result {
Ok(r) => results.push(r),
Err(e) => {
transaction.rollback().await?;
return Err(e);
}
}
}
transaction.commit().await?;
Ok(json!({ "results": results }))
}
};

let row_stream = client.query_raw_txt(query, query_params).await?;
if allow_pool {
// return connection to the pool
tokio::task::spawn(async move {
let _ = conn_pool.put(&conn_info, client).await;
});
}

result
}

async fn query_to_json<T: GenericClient>(
|
||||
client: &T,
|
||||
data: QueryData,
|
||||
raw_output: bool,
|
||||
array_mode: bool,
|
||||
) -> anyhow::Result<Value> {
|
||||
let query_params = json_to_pg_text(data.params)?;
|
||||
let row_stream = client
|
||||
.query_raw_txt::<String, _>(data.query, query_params)
|
||||
.await?;
|
||||
|
||||
// Manually drain the stream into a vector to leave row_stream hanging
|
||||
// around to get a command tag. Also check that the response is not too
|
||||
@@ -256,13 +302,6 @@ pub async fn handle(
|
||||
.map(|row| pg_text_row_to_json(row, raw_output, array_mode))
|
||||
.collect::<Result<Vec<_>, _>>()?;
|
||||
|
||||
if allow_pool {
|
||||
// return connection to the pool
|
||||
tokio::task::spawn(async move {
|
||||
let _ = conn_pool.put(&conn_info, client).await;
|
||||
});
|
||||
}
|
||||
|
||||
// resulting JSON format is based on the format of node-postgres result
|
||||
Ok(json!({
|
||||
"command": command_tag_name,
|
||||
|
||||
@@ -1,5 +1,8 @@
|
||||
use crate::{
|
||||
cancellation::CancelMap, config::ProxyConfig, error::io_error, proxy::handle_ws_client,
|
||||
cancellation::CancelMap,
|
||||
config::ProxyConfig,
|
||||
error::io_error,
|
||||
proxy::{handle_client, ClientMode},
|
||||
};
|
||||
use bytes::{Buf, Bytes};
|
||||
use futures::{Sink, Stream, StreamExt};
|
||||
@@ -150,12 +153,12 @@ async fn serve_websocket(
|
||||
hostname: Option<String>,
|
||||
) -> anyhow::Result<()> {
|
||||
let websocket = websocket.await?;
|
||||
handle_ws_client(
|
||||
handle_client(
|
||||
config,
|
||||
cancel_map,
|
||||
session_id,
|
||||
WebSocketRw::new(websocket),
|
||||
hostname,
|
||||
ClientMode::Websockets { hostname },
|
||||
)
|
||||
.await?;
|
||||
Ok(())
|
||||
@@ -178,13 +181,15 @@ async fn ws_handler(
|
||||
|
||||
// Check if the request is a websocket upgrade request.
|
||||
if hyper_tungstenite::is_upgrade_request(&request) {
|
||||
info!(session_id = ?session_id, "performing websocket upgrade");
|
||||
|
||||
let (response, websocket) = hyper_tungstenite::upgrade(&mut request, None)
|
||||
.map_err(|e| ApiError::BadRequest(e.into()))?;
|
||||
|
||||
tokio::spawn(async move {
|
||||
if let Err(e) = serve_websocket(websocket, config, &cancel_map, session_id, host).await
|
||||
{
|
||||
error!("error in websocket connection: {e:?}");
|
||||
error!(session_id = ?session_id, "error in websocket connection: {e:?}");
|
||||
}
|
||||
});
|
||||
|
||||
@@ -221,6 +226,18 @@ async fn ws_handler(
|
||||
);
|
||||
r
|
||||
})
|
||||
} else if request.uri().path() == "/sql" && request.method() == Method::OPTIONS {
|
||||
Response::builder()
|
||||
.header("Allow", "OPTIONS, POST")
|
||||
.header("Access-Control-Allow-Origin", "*")
|
||||
.header(
|
||||
"Access-Control-Allow-Headers",
|
||||
"Neon-Connection-String, Neon-Raw-Text-Output, Neon-Array-Mode, Neon-Pool-Opt-In",
|
||||
)
|
||||
.header("Access-Control-Max-Age", "86400" /* 24 hours */)
|
||||
.status(StatusCode::OK) // 204 is also valid, but see: https://developer.mozilla.org/en-US/docs/Web/HTTP/Methods/OPTIONS#status_code
|
||||
.body(Body::empty())
|
||||
.map_err(|e| ApiError::BadRequest(e.into()))
|
||||
} else {
|
||||
json_response(StatusCode::BAD_REQUEST, "query is not supported")
|
||||
}
|
||||
|
||||
@@ -6,21 +6,18 @@ use crate::{
|
||||
cancellation::{self, CancelMap},
|
||||
compute::{self, PostgresConnection},
|
||||
config::{ProxyConfig, TlsConfig},
|
||||
console::{
|
||||
self,
|
||||
errors::{ApiError, WakeComputeError},
|
||||
messages::MetricsAuxInfo,
|
||||
},
|
||||
error::io_error,
|
||||
console::{self, errors::WakeComputeError, messages::MetricsAuxInfo},
|
||||
stream::{PqStream, Stream},
|
||||
};
|
||||
use anyhow::{bail, Context};
|
||||
use async_trait::async_trait;
|
||||
use futures::TryFutureExt;
|
||||
use hyper::StatusCode;
|
||||
use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec};
|
||||
use metrics::{
|
||||
exponential_buckets, register_histogram, register_int_counter_vec, Histogram, IntCounterVec,
|
||||
};
|
||||
use once_cell::sync::Lazy;
|
||||
use pq_proto::{BeMessage as Be, FeStartupPacket, StartupMessageParams};
|
||||
use std::{error::Error, ops::ControlFlow, sync::Arc};
|
||||
use std::{error::Error, io, ops::ControlFlow, sync::Arc};
|
||||
use tokio::{
|
||||
io::{AsyncRead, AsyncWrite, AsyncWriteExt},
|
||||
time,
|
||||
@@ -31,24 +28,37 @@ use utils::measured_stream::MeasuredStream;
|
||||
|
||||
/// Number of times we should retry the `/proxy_wake_compute` http request.
|
||||
/// Retry duration is BASE_RETRY_WAIT_DURATION * 1.5^n
|
||||
pub const NUM_RETRIES_WAKE_COMPUTE: u32 = 10;
|
||||
pub const NUM_RETRIES_CONNECT: u32 = 10;
|
||||
const CONNECT_TIMEOUT: time::Duration = time::Duration::from_secs(2);
|
||||
const BASE_RETRY_WAIT_DURATION: time::Duration = time::Duration::from_millis(100);
|
||||
|
||||
const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)";
|
||||
const ERR_PROTO_VIOLATION: &str = "protocol violation";
|
||||
|
||||
static NUM_CONNECTIONS_ACCEPTED_COUNTER: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
static NUM_CONNECTIONS_ACCEPTED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"proxy_accepted_connections_total",
|
||||
"Number of TCP client connections accepted."
|
||||
"Number of TCP client connections accepted.",
|
||||
&["protocol"],
|
||||
)
|
||||
.unwrap()
|
||||
});
|
||||
|
||||
static NUM_CONNECTIONS_CLOSED_COUNTER: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
static NUM_CONNECTIONS_CLOSED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"proxy_closed_connections_total",
|
||||
"Number of TCP client connections closed."
|
||||
"Number of TCP client connections closed.",
|
||||
&["protocol"],
|
||||
)
|
||||
.unwrap()
|
||||
});
|
||||
|
||||
static COMPUTE_CONNECTION_LATENCY: Lazy<Histogram> = Lazy::new(|| {
|
||||
register_histogram!(
|
||||
"proxy_compute_connection_latency_seconds",
|
||||
"Time it took for proxy to establish a connection to the compute endpoint",
|
||||
// largest bucket = 2^16 * 0.5ms = 32s
|
||||
exponential_buckets(0.0005, 2.0, 16).unwrap(),
|
||||
)
|
||||
.unwrap()
|
||||
});
|
||||
@@ -103,7 +113,8 @@ pub async fn task_main(
|
||||
.set_nodelay(true)
|
||||
.context("failed to set socket option")?;
|
||||
|
||||
handle_client(config, &cancel_map, session_id, socket).await
|
||||
handle_client(config, &cancel_map, session_id, socket, ClientMode::Tcp)
|
||||
.await
|
||||
}
|
||||
.unwrap_or_else(move |e| {
|
||||
// Acknowledge that the task has finished with an error.
|
||||
@@ -128,26 +139,74 @@ pub async fn task_main(
Ok(())
}

// TODO(tech debt): unite this with its twin below.
pub enum ClientMode {
Tcp,
Websockets { hostname: Option<String> },
}

/// Abstracts the logic of handling TCP vs WS clients
impl ClientMode {
fn protocol_label(&self) -> &'static str {
match self {
ClientMode::Tcp => "tcp",
ClientMode::Websockets { .. } => "ws",
}
}

fn allow_cleartext(&self) -> bool {
match self {
ClientMode::Tcp => false,
ClientMode::Websockets { .. } => true,
}
}

fn allow_self_signed_compute(&self, config: &ProxyConfig) -> bool {
match self {
ClientMode::Tcp => config.allow_self_signed_compute,
ClientMode::Websockets { .. } => false,
}
}

fn hostname<'a, S>(&'a self, s: &'a Stream<S>) -> Option<&'a str> {
match self {
ClientMode::Tcp => s.sni_hostname(),
ClientMode::Websockets { hostname } => hostname.as_deref(),
}
}

fn handshake_tls<'a>(&self, tls: Option<&'a TlsConfig>) -> Option<&'a TlsConfig> {
match self {
ClientMode::Tcp => tls,
// TLS is None here if using websockets, because the connection is already encrypted.
ClientMode::Websockets { .. } => None,
}
}
}

#[tracing::instrument(fields(session_id = ?session_id), skip_all)]
|
||||
pub async fn handle_ws_client(
|
||||
pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
|
||||
config: &'static ProxyConfig,
|
||||
cancel_map: &CancelMap,
|
||||
session_id: uuid::Uuid,
|
||||
stream: impl AsyncRead + AsyncWrite + Unpin,
|
||||
hostname: Option<String>,
|
||||
stream: S,
|
||||
mode: ClientMode,
|
||||
) -> anyhow::Result<()> {
|
||||
info!(
|
||||
protocol = mode.protocol_label(),
|
||||
"handling interactive connection from client"
|
||||
);
|
||||
|
||||
// The `closed` counter will increase when this future is destroyed.
|
||||
NUM_CONNECTIONS_ACCEPTED_COUNTER.inc();
|
||||
NUM_CONNECTIONS_ACCEPTED_COUNTER
|
||||
.with_label_values(&[mode.protocol_label()])
|
||||
.inc();
|
||||
scopeguard::defer! {
|
||||
NUM_CONNECTIONS_CLOSED_COUNTER.inc();
|
||||
NUM_CONNECTIONS_CLOSED_COUNTER.with_label_values(&[mode.protocol_label()]).inc();
|
||||
}
|
||||
|
||||
let tls = config.tls_config.as_ref();
|
||||
let hostname = hostname.as_deref();
|
||||
|
||||
// TLS is None here, because the connection is already encrypted.
|
||||
let do_handshake = handshake(stream, None, cancel_map);
|
||||
let do_handshake = handshake(stream, mode.handshake_tls(tls), cancel_map);
|
||||
let (mut stream, params) = match do_handshake.await? {
|
||||
Some(x) => x,
|
||||
None => return Ok(()), // it's a cancellation request
|
||||
@@ -155,6 +214,7 @@ pub async fn handle_ws_client(
|
||||
|
||||
// Extract credentials which we're going to use for auth.
|
||||
let creds = {
|
||||
let hostname = mode.hostname(stream.get_ref());
|
||||
let common_names = tls.and_then(|tls| tls.common_names.clone());
|
||||
let result = config
|
||||
.auth_backend
|
||||
@@ -168,59 +228,15 @@ pub async fn handle_ws_client(
|
||||
}
|
||||
};
|
||||
|
||||
let client = Client::new(stream, creds, ¶ms, session_id, false);
|
||||
cancel_map
|
||||
.with_session(|session| client.connect_to_db(session, true))
|
||||
.await
|
||||
}
|
||||
|
||||
#[tracing::instrument(fields(session_id = ?session_id), skip_all)]
|
||||
async fn handle_client(
|
||||
config: &'static ProxyConfig,
|
||||
cancel_map: &CancelMap,
|
||||
session_id: uuid::Uuid,
|
||||
stream: impl AsyncRead + AsyncWrite + Unpin,
|
||||
) -> anyhow::Result<()> {
|
||||
// The `closed` counter will increase when this future is destroyed.
|
||||
NUM_CONNECTIONS_ACCEPTED_COUNTER.inc();
|
||||
scopeguard::defer! {
|
||||
NUM_CONNECTIONS_CLOSED_COUNTER.inc();
|
||||
}
|
||||
|
||||
let tls = config.tls_config.as_ref();
|
||||
let do_handshake = handshake(stream, tls, cancel_map);
|
||||
let (mut stream, params) = match do_handshake.await? {
|
||||
Some(x) => x,
|
||||
None => return Ok(()), // it's a cancellation request
|
||||
};
|
||||
|
||||
// Extract credentials which we're going to use for auth.
|
||||
let creds = {
|
||||
let sni = stream.get_ref().sni_hostname();
|
||||
let common_names = tls.and_then(|tls| tls.common_names.clone());
|
||||
let result = config
|
||||
.auth_backend
|
||||
.as_ref()
|
||||
.map(|_| auth::ClientCredentials::parse(¶ms, sni, common_names))
|
||||
.transpose();
|
||||
|
||||
match result {
|
||||
Ok(creds) => creds,
|
||||
Err(e) => stream.throw_error(e).await?,
|
||||
}
|
||||
};
|
||||
|
||||
let allow_self_signed_compute = config.allow_self_signed_compute;
|
||||
|
||||
let client = Client::new(
|
||||
stream,
|
||||
creds,
|
||||
¶ms,
|
||||
session_id,
|
||||
allow_self_signed_compute,
|
||||
mode.allow_self_signed_compute(config),
|
||||
);
|
||||
cancel_map
|
||||
.with_session(|session| client.connect_to_db(session, false))
|
||||
.with_session(|session| client.connect_to_db(session, mode.allow_cleartext()))
|
||||
.await
|
||||
}
|
||||
|
||||
@@ -303,18 +319,18 @@ async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
/// (e.g. the compute node's address might've changed at the wrong time).
/// Invalidate the cache entry (if any) to prevent subsequent errors.
#[tracing::instrument(name = "invalidate_cache", skip_all)]
pub fn invalidate_cache(node_info: &console::CachedNodeInfo) {
pub fn invalidate_cache(node_info: console::CachedNodeInfo) -> compute::ConnCfg {
let is_cached = node_info.cached();
if is_cached {
warn!("invalidating stalled compute node info cache entry");
node_info.invalidate();
}

let label = match is_cached {
true => "compute_cached",
false => "compute_uncached",
};
NUM_CONNECTION_FAILURES.with_label_values(&[label]).inc();

node_info.invalidate().config
}

/// Try to connect to the compute node once.
@@ -331,157 +347,208 @@ async fn connect_to_compute_once(
.await
}

enum ConnectionState<E> {
Cached(console::CachedNodeInfo),
Invalid(compute::ConnCfg, E),
}

#[async_trait]
pub trait ConnectMechanism {
type Connection;
type ConnectError;
type Error: From<Self::ConnectError>;
async fn connect_once(
&self,
node_info: &console::CachedNodeInfo,
timeout: time::Duration,
) -> Result<Self::Connection, Self::ConnectError>;

fn update_connect_config(&self, conf: &mut compute::ConnCfg);
}

pub struct TcpMechanism<'a> {
/// KV-dictionary with PostgreSQL connection params.
pub params: &'a StartupMessageParams,
}

#[async_trait]
impl ConnectMechanism for TcpMechanism<'_> {
type Connection = PostgresConnection;
type ConnectError = compute::ConnectionError;
type Error = compute::ConnectionError;

async fn connect_once(
&self,
node_info: &console::CachedNodeInfo,
timeout: time::Duration,
) -> Result<PostgresConnection, Self::Error> {
connect_to_compute_once(node_info, timeout).await
}

fn update_connect_config(&self, config: &mut compute::ConnCfg) {
config.set_startup_params(self.params);
}
}
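
The point of the `ConnectMechanism` trait is to separate the retry policy from the actual connection attempt, which is what lets the unit tests further down drive `connect_to_compute` with a scripted `TestConnectMechanism` instead of a real compute node. Below is a standalone sketch of that dependency-injection idea, with simplified names and none of the proxy's real types.

// Simplified illustration of the pattern; not the proxy's real signatures.
trait ConnectOnce {
    type Conn;
    type Err;
    fn connect_once(&mut self) -> Result<Self::Conn, Self::Err>;
}

// The retry policy is generic over "how to attempt one connection".
fn connect_with_retries<M: ConnectOnce>(m: &mut M, max_retries: u32) -> Result<M::Conn, M::Err> {
    let mut attempt = 0;
    loop {
        match m.connect_once() {
            Ok(conn) => return Ok(conn),
            Err(e) if attempt >= max_retries => return Err(e),
            Err(_) => attempt += 1,
        }
    }
}

// A scripted mechanism in the spirit of `TestConnectMechanism` further down.
struct Flaky {
    failures_left: u32,
}

impl ConnectOnce for Flaky {
    type Conn = &'static str;
    type Err = &'static str;
    fn connect_once(&mut self) -> Result<Self::Conn, Self::Err> {
        if self.failures_left == 0 {
            Ok("connected")
        } else {
            self.failures_left -= 1;
            Err("connection refused")
        }
    }
}

fn main() {
    let mut flaky = Flaky { failures_left: 2 };
    assert_eq!(connect_with_retries(&mut flaky, 10), Ok("connected"));
}
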
/// Try to connect to the compute node, retrying if necessary.
/// This function might update `node_info`, so we take it by `&mut`.
#[tracing::instrument(skip_all)]
async fn connect_to_compute(
node_info: &mut console::CachedNodeInfo,
params: &StartupMessageParams,
pub async fn connect_to_compute<M: ConnectMechanism>(
mechanism: &M,
mut node_info: console::CachedNodeInfo,
extra: &console::ConsoleReqExtra<'_>,
creds: &auth::BackendType<'_, auth::ClientCredentials<'_>>,
) -> Result<PostgresConnection, compute::ConnectionError> {
) -> Result<M::Connection, M::Error>
where
M::ConnectError: ShouldRetry + std::fmt::Debug,
M::Error: From<WakeComputeError>,
{
let _timer = COMPUTE_CONNECTION_LATENCY.start_timer();

mechanism.update_connect_config(&mut node_info.config);

let mut num_retries = 0;
let mut wait_duration = time::Duration::ZERO;
let mut should_wake_with_error = None;
let mut state = ConnectionState::<M::ConnectError>::Cached(node_info);

loop {
// Apply startup params to the (possibly, cached) compute node info.
node_info.config.set_startup_params(params);
match state {
ConnectionState::Invalid(config, err) => {
let wake_res = match creds {
auth::BackendType::Console(api, creds) => {
try_wake(api.as_ref(), extra, creds).await
}
auth::BackendType::Postgres(api, creds) => {
try_wake(api.as_ref(), extra, creds).await
}
// nothing to do?
auth::BackendType::Link(_) => return Err(err.into()),
};

if !wait_duration.is_zero() {
time::sleep(wait_duration).await;
}
match wake_res {
// there was an error communicating with the control plane
Err(e) => return Err(e.into()),
// failed to wake up but we can continue to retry
Ok(ControlFlow::Continue(_)) => {
state = ConnectionState::Invalid(config, err);
let wait_duration = retry_after(num_retries);
num_retries += 1;

// try wake the compute node if we have determined it's sensible to do so
if let Some(err) = should_wake_with_error.take() {
match try_wake(node_info, extra, creds).await {
// we can't wake up the compute node
Ok(None) => return Err(err),
// there was an error communicating with the control plane
Err(e) => return Err(io_error(e).into()),
// failed to wake up but we can continue to retry
Ok(Some(ControlFlow::Continue(()))) => {
wait_duration = retry_after(num_retries);
should_wake_with_error = Some(err);

num_retries += 1;
info!(num_retries, "retrying wake compute");
continue;
info!(num_retries, "retrying wake compute");
time::sleep(wait_duration).await;
continue;
}
// successfully woke up a compute node and can break the wakeup loop
Ok(ControlFlow::Break(mut node_info)) => {
node_info.config.reuse_password(&config);
mechanism.update_connect_config(&mut node_info.config);
state = ConnectionState::Cached(node_info)
}
}
// successfully woke up a compute node and can break the wakeup loop
Ok(Some(ControlFlow::Break(()))) => {}
}
}
ConnectionState::Cached(node_info) => {
match mechanism.connect_once(&node_info, CONNECT_TIMEOUT).await {
Ok(res) => return Ok(res),
Err(e) => {
error!(error = ?e, "could not connect to compute node");
if !e.should_retry(num_retries) {
return Err(e.into());
}

// Set a shorter timeout for the initial connection attempt.
//
// In case we try to connect to an outdated address that is no longer valid, the
// default behavior of Kubernetes is to drop the packets, causing us to wait for
// the entire timeout period. We want to fail fast in such cases.
//
// A specific case to consider is when we have cached compute node information
// with a 4-minute TTL (Time To Live), but the user has executed a `/suspend` API
// call, resulting in the nonexistence of the compute node.
//
// We only use caching in case of scram proxy backed by the console, so reduce
// the timeout only in that case.
let is_scram_proxy = matches!(creds, auth::BackendType::Console(_, _));
let timeout = if is_scram_proxy && num_retries == 0 {
time::Duration::from_secs(2)
} else {
time::Duration::from_secs(10)
};
// after the first connect failure,
// we should invalidate the cache and wake up a new compute node
if num_retries == 0 {
state = ConnectionState::Invalid(invalidate_cache(node_info), e);
} else {
state = ConnectionState::Cached(node_info);
}

// do this again to ensure we have username?
node_info.config.set_startup_params(params);
let wait_duration = retry_after(num_retries);
num_retries += 1;

match connect_to_compute_once(node_info, timeout).await {
Ok(res) => return Ok(res),
Err(e) => {
error!(error = ?e, "could not connect to compute node");
if !can_retry_error(&e, num_retries) {
return Err(e);
}
wait_duration = retry_after(num_retries);

// after the first connect failure,
// we should invalidate the cache and wake up a new compute node
if num_retries == 0 {
invalidate_cache(node_info);
should_wake_with_error = Some(e);
info!(num_retries, "retrying wake compute");
time::sleep(wait_duration).await;
}
}
}
}

num_retries += 1;
info!(num_retries, "retrying connect");
}
}
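
The new `connect_to_compute` body is hard to follow here because the diff view interleaves the old and new versions of the loop. Stripped to its essentials, the new version is a small state machine that alternates between `Cached` (try to connect with the node info we currently hold) and `Invalid` (ask the control plane to wake the compute node). Below is a simplified, self-contained model of that flow with stand-in types; it is not the real code.

use std::ops::ControlFlow;

// Stand-ins for the real types; everything here is illustrative only.
#[derive(Debug)]
struct NodeInfo {
    generation: u32,
}

#[derive(Debug)]
struct RetryableError;

enum State {
    // We hold (possibly cached) node info and can try to connect with it.
    Cached(NodeInfo),
    // The cached entry failed; ask the control plane to wake the compute node.
    Invalid(RetryableError),
}

fn connect(node: &NodeInfo) -> Result<&'static str, RetryableError> {
    // Pretend only a freshly woken node (generation >= 1) is reachable.
    if node.generation >= 1 {
        Ok("connected")
    } else {
        Err(RetryableError)
    }
}

fn wake_compute() -> ControlFlow<NodeInfo> {
    // Pretend the control plane always hands back fresh node info.
    ControlFlow::Break(NodeInfo { generation: 1 })
}

fn connect_with_retries(mut state: State, max_retries: u32) -> Result<&'static str, RetryableError> {
    let mut num_retries = 0;
    loop {
        state = match state {
            // Stale entry: wake the compute node, then go back to connecting.
            State::Invalid(err) => match wake_compute() {
                ControlFlow::Break(node) => State::Cached(node),
                ControlFlow::Continue(()) => {
                    if num_retries >= max_retries {
                        return Err(err);
                    }
                    num_retries += 1;
                    State::Invalid(err)
                }
            },
            State::Cached(node) => match connect(&node) {
                Ok(conn) => return Ok(conn),
                Err(err) => {
                    if num_retries >= max_retries {
                        return Err(err);
                    }
                    // Only the first failure invalidates the cache and triggers a
                    // wake-up; later failures retry with the node info already held.
                    let next = if num_retries == 0 {
                        State::Invalid(err)
                    } else {
                        State::Cached(node)
                    };
                    num_retries += 1;
                    next
                }
            },
        };
    }
}

fn main() {
    let stale = State::Cached(NodeInfo { generation: 0 });
    assert_eq!(connect_with_retries(stale, 10).unwrap(), "connected");
}
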
/// Attempts to wake up the compute node.
/// * Returns Ok(Some(true)) if there was an error waking but retries are acceptable
/// * Returns Ok(Some(false)) if the wakeup succeeded
/// * Returns Ok(None) or Err(e) if there was an error
/// * Returns Ok(Continue(e)) if there was an error waking but retries are acceptable
/// * Returns Ok(Break(node)) if the wakeup succeeded
/// * Returns Err(e) if there was an error
pub async fn try_wake(
node_info: &mut console::CachedNodeInfo,
api: &impl console::Api,
extra: &console::ConsoleReqExtra<'_>,
creds: &auth::BackendType<'_, auth::ClientCredentials<'_>>,
) -> Result<Option<ControlFlow<()>>, WakeComputeError> {
creds: &auth::ClientCredentials<'_>,
) -> Result<ControlFlow<console::CachedNodeInfo, WakeComputeError>, WakeComputeError> {
info!("compute node's state has likely changed; requesting a wake-up");
match creds.wake_compute(extra).await {
// retry wake if the compute was in an invalid state
Err(WakeComputeError::ApiError(ApiError::Console {
status: StatusCode::BAD_REQUEST,
..
})) => Ok(Some(ControlFlow::Continue(()))),
// Update `node_info` and try again.
Ok(Some(mut new)) => {
new.config.reuse_password(&node_info.config);
*node_info = new;
Ok(Some(ControlFlow::Break(())))
match api.wake_compute(extra, creds).await {
Err(err) => match &err {
WakeComputeError::ApiError(api) if api.could_retry() => Ok(ControlFlow::Continue(err)),
_ => Err(err),
},
// Ready to try again.
Ok(new) => Ok(ControlFlow::Break(new)),
}
}

pub trait ShouldRetry {
fn could_retry(&self) -> bool;
fn should_retry(&self, num_retries: u32) -> bool {
match self {
// retry all errors at least once
_ if num_retries == 0 => true,
_ if num_retries >= NUM_RETRIES_CONNECT => false,
err => err.could_retry(),
}
Err(e) => Err(e),
Ok(None) => Ok(None),
}
}

fn can_retry_error(err: &compute::ConnectionError, num_retries: u32) -> bool {
match err {
// retry all errors at least once
_ if num_retries == 0 => true,
_ if num_retries >= NUM_RETRIES_WAKE_COMPUTE => false,
compute::ConnectionError::Postgres(err) => can_retry_tokio_postgres_error(err),
compute::ConnectionError::CouldNotConnect(err) => is_io_connection_err(err),
_ => false,
impl ShouldRetry for io::Error {
fn could_retry(&self) -> bool {
use std::io::ErrorKind;
matches!(
self.kind(),
ErrorKind::ConnectionRefused | ErrorKind::AddrNotAvailable | ErrorKind::TimedOut
)
}
}

pub fn can_retry_tokio_postgres_error(err: &tokio_postgres::Error) -> bool {
if let Some(io_err) = err.source().and_then(|x| x.downcast_ref()) {
is_io_connection_err(io_err)
} else if let Some(db_err) = err.source().and_then(|x| x.downcast_ref()) {
is_sql_connection_err(db_err)
} else {
false
impl ShouldRetry for tokio_postgres::error::DbError {
fn could_retry(&self) -> bool {
use tokio_postgres::error::SqlState;
matches!(
self.code(),
&SqlState::CONNECTION_FAILURE
| &SqlState::CONNECTION_EXCEPTION
| &SqlState::CONNECTION_DOES_NOT_EXIST
| &SqlState::SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION,
)
}
}

fn is_sql_connection_err(err: &tokio_postgres::error::DbError) -> bool {
use tokio_postgres::error::SqlState;
matches!(
err.code(),
&SqlState::CONNECTION_FAILURE
| &SqlState::CONNECTION_EXCEPTION
| &SqlState::CONNECTION_DOES_NOT_EXIST
| &SqlState::SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION,
)
impl ShouldRetry for tokio_postgres::Error {
fn could_retry(&self) -> bool {
if let Some(io_err) = self.source().and_then(|x| x.downcast_ref()) {
io::Error::could_retry(io_err)
} else if let Some(db_err) = self.source().and_then(|x| x.downcast_ref()) {
tokio_postgres::error::DbError::could_retry(db_err)
} else {
false
}
}
}

fn is_io_connection_err(err: &std::io::Error) -> bool {
use std::io::ErrorKind;
matches!(
err.kind(),
ErrorKind::ConnectionRefused | ErrorKind::AddrNotAvailable | ErrorKind::TimedOut
)
impl ShouldRetry for compute::ConnectionError {
fn could_retry(&self) -> bool {
match self {
compute::ConnectionError::Postgres(err) => err.could_retry(),
compute::ConnectionError::CouldNotConnect(err) => err.could_retry(),
_ => false,
}
}
}

pub fn retry_after(num_retries: u32) -> time::Duration {
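
The body of `retry_after` is cut off by the hunk boundary above, so the actual backoff schedule is not visible in this diff. Purely as an illustration of the shape such a function usually has, a minimal exponential backoff could look like the sketch below. The constants are placeholders, chosen only so that the example's total lands near the 10-12 second window asserted by the `connect_compute_total_wait` test further down; they are not the values used in this codebase.

use std::time::Duration;

// Placeholder constants: the real values are not visible in this diff.
const BASE_DELAY: Duration = Duration::from_millis(100);
const BACKOFF_FACTOR: f64 = 1.5;

fn retry_after(num_retries: u32) -> Duration {
    // base * factor^num_retries: 100ms, 150ms, 225ms, ...
    BASE_DELAY.mul_f64(BACKOFF_FACTOR.powi(num_retries as i32))
}

fn main() {
    // With these placeholder constants the total over 10 retries is ~11.3s.
    let total: Duration = (0..10).map(retry_after).sum();
    println!("total wait over 10 retries: {total:?}");
}
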
@@ -637,7 +704,8 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {

node_info.allow_self_signed_compute = allow_self_signed_compute;

let mut node = connect_to_compute(&mut node_info, params, &extra, &creds)
let aux = node_info.aux.clone();
let mut node = connect_to_compute(&TcpMechanism { params }, node_info, &extra, &creds)
.or_else(|e| stream.throw_error(e))
.await?;

@@ -648,6 +716,6 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
// immediately after opening the connection.
let (stream, read_buf) = stream.into_inner();
node.stream.write_all(&read_buf).await?;
proxy_pass(stream, node.stream, &node_info.aux).await
proxy_pass(stream, node.stream, &aux).await
}
}

@@ -1,5 +1,9 @@
//! A group of high-level tests for connection establishing logic and auth.
use std::borrow::Cow;

use super::*;
use crate::auth::ClientCredentials;
use crate::console::{CachedNodeInfo, NodeInfo};
use crate::{auth, sasl, scram};
use async_trait::async_trait;
use rstest::rstest;
@@ -304,3 +308,148 @@ fn connect_compute_total_wait() {
assert!(total_wait < tokio::time::Duration::from_secs(12));
assert!(total_wait > tokio::time::Duration::from_secs(10));
}

#[derive(Clone, Copy)]
enum ConnectAction {
Connect,
Retry,
Fail,
}

struct TestConnectMechanism {
counter: Arc<std::sync::Mutex<usize>>,
sequence: Vec<ConnectAction>,
}

impl TestConnectMechanism {
fn new(sequence: Vec<ConnectAction>) -> Self {
Self {
counter: Arc::new(std::sync::Mutex::new(0)),
sequence,
}
}
}

#[derive(Debug)]
struct TestConnection;

#[derive(Debug)]
struct TestConnectError {
retryable: bool,
}

impl std::fmt::Display for TestConnectError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{:?}", self)
}
}

impl std::error::Error for TestConnectError {}

impl ShouldRetry for TestConnectError {
fn could_retry(&self) -> bool {
self.retryable
}
}

#[async_trait]
impl ConnectMechanism for TestConnectMechanism {
type Connection = TestConnection;
type ConnectError = TestConnectError;
type Error = anyhow::Error;

async fn connect_once(
&self,
_node_info: &console::CachedNodeInfo,
_timeout: time::Duration,
) -> Result<Self::Connection, Self::ConnectError> {
let mut counter = self.counter.lock().unwrap();
let action = self.sequence[*counter];
*counter += 1;
match action {
ConnectAction::Connect => Ok(TestConnection),
ConnectAction::Retry => Err(TestConnectError { retryable: true }),
ConnectAction::Fail => Err(TestConnectError { retryable: false }),
}
}

fn update_connect_config(&self, _conf: &mut compute::ConnCfg) {}
}

fn helper_create_connect_info() -> (
CachedNodeInfo,
console::ConsoleReqExtra<'static>,
auth::BackendType<'static, ClientCredentials<'static>>,
) {
let node = NodeInfo {
config: compute::ConnCfg::new(),
aux: Default::default(),
allow_self_signed_compute: false,
};
let cache = CachedNodeInfo::new_uncached(node);
let extra = console::ConsoleReqExtra {
session_id: uuid::Uuid::new_v4(),
application_name: Some("TEST"),
};
let url = "https://TEST_URL".parse().unwrap();
let api = console::provider::mock::Api::new(url);
let creds = auth::BackendType::Postgres(Cow::Owned(api), ClientCredentials::new_noop());
(cache, extra, creds)
}

#[tokio::test]
async fn connect_to_compute_success() {
use ConnectAction::*;
let mechanism = TestConnectMechanism::new(vec![Connect]);
let (cache, extra, creds) = helper_create_connect_info();
connect_to_compute(&mechanism, cache, &extra, &creds)
.await
.unwrap();
}

#[tokio::test]
async fn connect_to_compute_retry() {
use ConnectAction::*;
let mechanism = TestConnectMechanism::new(vec![Retry, Retry, Connect]);
let (cache, extra, creds) = helper_create_connect_info();
connect_to_compute(&mechanism, cache, &extra, &creds)
.await
.unwrap();
}

/// Test that we don't retry if the error is not retryable.
#[tokio::test]
async fn connect_to_compute_non_retry_1() {
use ConnectAction::*;
let mechanism = TestConnectMechanism::new(vec![Retry, Retry, Fail]);
let (cache, extra, creds) = helper_create_connect_info();
connect_to_compute(&mechanism, cache, &extra, &creds)
.await
.unwrap_err();
}

/// Even for non-retryable errors, we should retry at least once.
#[tokio::test]
async fn connect_to_compute_non_retry_2() {
use ConnectAction::*;
let mechanism = TestConnectMechanism::new(vec![Fail, Retry, Connect]);
let (cache, extra, creds) = helper_create_connect_info();
connect_to_compute(&mechanism, cache, &extra, &creds)
.await
.unwrap();
}

/// Retry for at most `NUM_RETRIES_CONNECT` times.
#[tokio::test]
async fn connect_to_compute_non_retry_3() {
assert_eq!(NUM_RETRIES_CONNECT, 10);
use ConnectAction::*;
let mechanism = TestConnectMechanism::new(vec![
Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry,
/* the 11th time */ Retry,
]);
let (cache, extra, creds) = helper_create_connect_info();
connect_to_compute(&mechanism, cache, &extra, &creds)
.await
.unwrap_err();
}

@@ -33,9 +33,10 @@ psutil = "^5.9.4"
types-psutil = "^5.9.5.12"
types-toml = "^0.10.8.6"
pytest-httpserver = "^1.0.8"
aiohttp = "3.7.4"
aiohttp = "3.8.5"
pytest-rerunfailures = "^11.1.2"
types-pytest-lazy-fixture = "^0.6.3.3"
pytest-split = "^0.8.1"

[tool.poetry.group.dev.dependencies]
black = "^23.3.0"
@@ -78,6 +79,7 @@ module = [
ignore_missing_imports = true

[tool.ruff]
target-version = "py39"
extend-exclude = ["vendor/"]
ignore = ["E501"]
select = [
@@ -85,4 +87,5 @@ select = [
"F", # Pyflakes
"I", # isort
"W", # pycodestyle
"B", # bugbear
]

@@ -37,7 +37,7 @@ use safekeeper::{http, WAL_REMOVER_RUNTIME};
use safekeeper::{remove_wal, WAL_BACKUP_RUNTIME};
use safekeeper::{wal_backup, HTTP_RUNTIME};
use storage_broker::DEFAULT_ENDPOINT;
use utils::auth::JwtAuth;
use utils::auth::{JwtAuth, Scope};
use utils::{
id::NodeId,
logging::{self, LogFormat},
@@ -72,6 +72,10 @@ struct Args {
/// Listen endpoint for receiving/sending WAL in the form host:port.
#[arg(short, long, default_value = DEFAULT_PG_LISTEN_ADDR)]
listen_pg: String,
/// Listen endpoint for receiving/sending WAL in the form host:port allowing
/// only tenant scoped auth tokens. Pointless if auth is disabled.
#[arg(long, default_value = None, verbatim_doc_comment)]
listen_pg_tenant_only: Option<String>,
/// Listen http endpoint for management and metrics in the form host:port.
#[arg(long, default_value = DEFAULT_HTTP_LISTEN_ADDR)]
listen_http: String,
@@ -94,7 +98,7 @@ struct Args {
broker_keepalive_interval: Duration,
/// Peer safekeeper is considered dead after not receiving heartbeats from
/// it during this period passed as a human readable duration.
#[arg(long, value_parser= humantime::parse_duration, default_value = DEFAULT_HEARTBEAT_TIMEOUT)]
#[arg(long, value_parser= humantime::parse_duration, default_value = DEFAULT_HEARTBEAT_TIMEOUT, verbatim_doc_comment)]
heartbeat_timeout: Duration,
/// Remote storage configuration for WAL backup (offloading to s3) as TOML
/// inline table, e.g.
@@ -179,6 +183,7 @@ async fn main() -> anyhow::Result<()> {
workdir,
my_id: id,
listen_pg_addr: args.listen_pg,
listen_pg_addr_tenant_only: args.listen_pg_tenant_only,
listen_http_addr: args.listen_http,
availability_zone: args.availability_zone,
no_sync: args.no_sync,
@@ -222,6 +227,21 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
e
})?;

let pg_listener_tenant_only =
if let Some(listen_pg_addr_tenant_only) = &conf.listen_pg_addr_tenant_only {
info!(
"starting safekeeper tenant scoped WAL service on {}",
listen_pg_addr_tenant_only
);
let listener = tcp_listener::bind(listen_pg_addr_tenant_only.clone()).map_err(|e| {
error!("failed to bind to address {}: {}", conf.listen_pg_addr, e);
e
})?;
Some(listener)
} else {
None
};

info!(
"starting safekeeper HTTP service on {}",
conf.listen_http_addr
@@ -253,14 +273,34 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
let current_thread_rt = conf
.current_thread_runtime
.then(|| Handle::try_current().expect("no runtime in main"));

let wal_service_handle = current_thread_rt
.as_ref()
.unwrap_or_else(|| WAL_SERVICE_RUNTIME.handle())
.spawn(wal_service::task_main(conf_, pg_listener))
.spawn(wal_service::task_main(
conf_,
pg_listener,
Some(Scope::SafekeeperData),
))
// wrap with task name for error reporting
.map(|res| ("WAL service main".to_owned(), res));
tasks_handles.push(Box::pin(wal_service_handle));

if let Some(pg_listener_tenant_only) = pg_listener_tenant_only {
let conf_ = conf.clone();
let wal_service_handle = current_thread_rt
.as_ref()
.unwrap_or_else(|| WAL_SERVICE_RUNTIME.handle())
.spawn(wal_service::task_main(
conf_,
pg_listener_tenant_only,
Some(Scope::Tenant),
))
// wrap with task name for error reporting
.map(|res| ("WAL service tenant only main".to_owned(), res));
tasks_handles.push(Box::pin(wal_service_handle));
}

let conf_ = conf.clone();
let http_handle = current_thread_rt
.as_ref()
Some files were not shown because too many files have changed in this diff.