mirror of
https://github.com/neondatabase/neon.git
synced 2026-02-07 04:30:36 +00:00
Compare commits
173 Commits
problame/p
...
fast-btree
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
69eb02363b | ||
|
|
ad5b02e175 | ||
|
|
b09a851705 | ||
|
|
85cd97af61 | ||
|
|
e6470ee92e | ||
|
|
dc72567288 | ||
|
|
6defa2b5d5 | ||
|
|
b3d3a2587d | ||
|
|
b85fc39bdb | ||
|
|
09b5954526 | ||
|
|
306c4f9967 | ||
|
|
5ceccdc7de | ||
|
|
cdcaa329bf | ||
|
|
27bdbf5e36 | ||
|
|
4c7fa12a2a | ||
|
|
367971a0e9 | ||
|
|
51570114ea | ||
|
|
098d3111a5 | ||
|
|
3737fe3a4b | ||
|
|
5650138532 | ||
|
|
2dca4c03fc | ||
|
|
0b790b6d00 | ||
|
|
e82d1ad6b8 | ||
|
|
4f0a8e92ad | ||
|
|
5952f350cb | ||
|
|
726c8e6730 | ||
|
|
f7067a38b7 | ||
|
|
896347f307 | ||
|
|
e5c81fef86 | ||
|
|
7ebe9ca1ac | ||
|
|
1588601503 | ||
|
|
9c35e1e6e5 | ||
|
|
d8c21ec70d | ||
|
|
ad99fa5f03 | ||
|
|
e675f4cec8 | ||
|
|
4db8efb2cf | ||
|
|
07c2b29895 | ||
|
|
9cdffd164a | ||
|
|
87db4b441c | ||
|
|
964c5c56b7 | ||
|
|
bd59349af3 | ||
|
|
2bd79906d9 | ||
|
|
493b47e1da | ||
|
|
c13e932c3b | ||
|
|
a5292f7e67 | ||
|
|
262348e41b | ||
|
|
68f15cf967 | ||
|
|
39f8fd6945 | ||
|
|
83567f9e4e | ||
|
|
71611f4ab3 | ||
|
|
7c16b5215e | ||
|
|
39b148b74e | ||
|
|
116c342cad | ||
|
|
ba4fe9e10f | ||
|
|
de90bf4663 | ||
|
|
8360307ea0 | ||
|
|
6129077d31 | ||
|
|
e0ebdfc7ce | ||
|
|
c508d3b5fa | ||
|
|
acda65d7d4 | ||
|
|
378daa358b | ||
|
|
85f4514e7d | ||
|
|
f70019797c | ||
|
|
325258413a | ||
|
|
4ddbc0e46d | ||
|
|
a673e4e7a9 | ||
|
|
c155cc0c3f | ||
|
|
32126d705b | ||
|
|
5683ae9eab | ||
|
|
4778b6a12e | ||
|
|
8b8be7bed4 | ||
|
|
a461c459d8 | ||
|
|
4ae2d1390d | ||
|
|
c5949e1fd6 | ||
|
|
127837abb0 | ||
|
|
b2c96047d0 | ||
|
|
44202eeb3b | ||
|
|
4bef977c56 | ||
|
|
a0b862a8bd | ||
|
|
767ef29390 | ||
|
|
a8a800af51 | ||
|
|
1e250cd90a | ||
|
|
eaaa18f6ed | ||
|
|
188f67e1df | ||
|
|
7e805200bb | ||
|
|
c6ca1d76d2 | ||
|
|
94b4e76e13 | ||
|
|
b514da90cb | ||
|
|
7d17f1719f | ||
|
|
41ee75bc71 | ||
|
|
11e523f503 | ||
|
|
b1a1126152 | ||
|
|
a8899e1e0f | ||
|
|
2fbd5ab075 | ||
|
|
702382e99a | ||
|
|
1b53b3e200 | ||
|
|
b332268cec | ||
|
|
76c702219c | ||
|
|
ba856140e7 | ||
|
|
2cf6a47cca | ||
|
|
5a8bcdccb0 | ||
|
|
2c8741a5ed | ||
|
|
893b7bac9a | ||
|
|
66f8f5f1c8 | ||
|
|
3a19da1066 | ||
|
|
572eda44ee | ||
|
|
b1d6af5ebe | ||
|
|
f842b22b90 | ||
|
|
d444d4dcea | ||
|
|
c8637f3736 | ||
|
|
ecf759be6d | ||
|
|
9a9d9eba42 | ||
|
|
1f4805baf8 | ||
|
|
5c88213eaf | ||
|
|
607d19f0e0 | ||
|
|
1fa0478980 | ||
|
|
9da67c4f19 | ||
|
|
16c87b5bda | ||
|
|
9fe5cc6a82 | ||
|
|
543b8153c6 | ||
|
|
3a8959a4c4 | ||
|
|
4a50483861 | ||
|
|
f775928dfc | ||
|
|
ea648cfbc6 | ||
|
|
093f8c5f45 | ||
|
|
00c71bb93a | ||
|
|
9256788273 | ||
|
|
9e1449353d | ||
|
|
b06dffe3dc | ||
|
|
b08a0ee186 | ||
|
|
3666df6342 | ||
|
|
0ca342260c | ||
|
|
ded7f48565 | ||
|
|
e09d5ada6a | ||
|
|
8c522ea034 | ||
|
|
44b1c4c456 | ||
|
|
99c15907c1 | ||
|
|
c3626e3432 | ||
|
|
dd6990567f | ||
|
|
21deb81acb | ||
|
|
dbb21d6592 | ||
|
|
ddceb9e6cd | ||
|
|
0fc3708de2 | ||
|
|
e0c8ad48d4 | ||
|
|
39e144696f | ||
|
|
653044f754 | ||
|
|
80dcdfa8bf | ||
|
|
685add2009 | ||
|
|
d4dc86f8e3 | ||
|
|
5158de70f3 | ||
|
|
aec9188d36 | ||
|
|
acefee9a32 | ||
|
|
bf065aabdf | ||
|
|
fe74fac276 | ||
|
|
b91ac670e1 | ||
|
|
b3195afd20 | ||
|
|
7eaa7a496b | ||
|
|
4772cd6c93 | ||
|
|
010b4d0d5c | ||
|
|
477cb3717b | ||
|
|
ea5a97e7b4 | ||
|
|
547914fe19 | ||
|
|
607b185a49 | ||
|
|
bfba5e3aca | ||
|
|
ecc7a9567b | ||
|
|
45f98dd018 | ||
|
|
bdfe27f3ac | ||
|
|
a15f9b3baa | ||
|
|
ce92638185 | ||
|
|
a3c82f19b8 | ||
|
|
8b15252f98 | ||
|
|
522aaca718 | ||
|
|
7cbb39063a |
2
.github/actionlint.yml
vendored
2
.github/actionlint.yml
vendored
@@ -5,4 +5,6 @@ self-hosted-runner:
|
||||
- small
|
||||
- us-east-2
|
||||
config-variables:
|
||||
- REMOTE_STORAGE_AZURE_CONTAINER
|
||||
- REMOTE_STORAGE_AZURE_REGION
|
||||
- SLACK_UPCOMING_RELEASE_CHANNEL_ID
|
||||
|
||||
@@ -203,6 +203,10 @@ runs:
|
||||
COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
|
||||
BASE_S3_URL: ${{ steps.generate-report.outputs.base-s3-url }}
|
||||
run: |
|
||||
if [ ! -d "${WORKDIR}/report/data/test-cases" ]; then
|
||||
exit 0
|
||||
fi
|
||||
|
||||
export DATABASE_URL=${REGRESS_TEST_RESULT_CONNSTR_NEW}
|
||||
|
||||
./scripts/pysync
|
||||
|
||||
23
.github/workflows/build_and_test.yml
vendored
23
.github/workflows/build_and_test.yml
vendored
@@ -320,6 +320,9 @@ jobs:
|
||||
- name: Build neon extensions
|
||||
run: mold -run make neon-pg-ext -j$(nproc)
|
||||
|
||||
- name: Build walproposer-lib
|
||||
run: mold -run make walproposer-lib -j$(nproc)
|
||||
|
||||
- name: Run cargo build
|
||||
run: |
|
||||
${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests
|
||||
@@ -335,6 +338,16 @@ jobs:
|
||||
# Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
|
||||
${cov_prefix} cargo test $CARGO_FLAGS --package remote_storage --test test_real_s3
|
||||
|
||||
# Run separate tests for real Azure Blob Storage
|
||||
# XXX: replace region with `eu-central-1`-like region
|
||||
export ENABLE_REAL_AZURE_REMOTE_STORAGE=y
|
||||
export AZURE_STORAGE_ACCOUNT="${{ secrets.AZURE_STORAGE_ACCOUNT_DEV }}"
|
||||
export AZURE_STORAGE_ACCESS_KEY="${{ secrets.AZURE_STORAGE_ACCESS_KEY_DEV }}"
|
||||
export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}"
|
||||
export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}"
|
||||
# Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
|
||||
${cov_prefix} cargo test $CARGO_FLAGS --package remote_storage --test test_real_azure
|
||||
|
||||
- name: Install rust binaries
|
||||
run: |
|
||||
# Install target binaries
|
||||
@@ -420,7 +433,7 @@ jobs:
|
||||
rerun_flaky: true
|
||||
pg_version: ${{ matrix.pg_version }}
|
||||
env:
|
||||
TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR }}
|
||||
TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
|
||||
CHECK_ONDISK_DATA_COMPATIBILITY: nonempty
|
||||
|
||||
- name: Merge and upload coverage data
|
||||
@@ -455,7 +468,7 @@ jobs:
|
||||
env:
|
||||
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
|
||||
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
|
||||
TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR }}"
|
||||
TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}"
|
||||
# XXX: no coverage data handling here, since benchmarks are run on release builds,
|
||||
# while coverage is currently collected for the debug ones
|
||||
|
||||
@@ -834,7 +847,7 @@ jobs:
|
||||
run:
|
||||
shell: sh -eu {0}
|
||||
env:
|
||||
VM_BUILDER_VERSION: v0.17.12
|
||||
VM_BUILDER_VERSION: v0.18.5
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
@@ -1092,8 +1105,10 @@ jobs:
|
||||
run: |
|
||||
if [[ "$GITHUB_REF_NAME" == "main" ]]; then
|
||||
gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=false
|
||||
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
|
||||
|
||||
# TODO: move deployPreprodRegion to release (`"$GITHUB_REF_NAME" == "release"` block), once Staging support different compute tag prefixes for different regions
|
||||
gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=true
|
||||
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
|
||||
gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f disclamerAcknowledged=true
|
||||
else
|
||||
echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
|
||||
|
||||
18
.github/workflows/neon_extra_builds.yml
vendored
18
.github/workflows/neon_extra_builds.yml
vendored
@@ -32,7 +32,7 @@ jobs:
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
submodules: true
|
||||
fetch-depth: 1
|
||||
@@ -90,18 +90,21 @@ jobs:
|
||||
|
||||
- name: Build postgres v14
|
||||
if: steps.cache_pg_14.outputs.cache-hit != 'true'
|
||||
run: make postgres-v14 -j$(nproc)
|
||||
run: make postgres-v14 -j$(sysctl -n hw.ncpu)
|
||||
|
||||
- name: Build postgres v15
|
||||
if: steps.cache_pg_15.outputs.cache-hit != 'true'
|
||||
run: make postgres-v15 -j$(nproc)
|
||||
run: make postgres-v15 -j$(sysctl -n hw.ncpu)
|
||||
|
||||
- name: Build postgres v16
|
||||
if: steps.cache_pg_16.outputs.cache-hit != 'true'
|
||||
run: make postgres-v16 -j$(nproc)
|
||||
run: make postgres-v16 -j$(sysctl -n hw.ncpu)
|
||||
|
||||
- name: Build neon extensions
|
||||
run: make neon-pg-ext -j$(nproc)
|
||||
run: make neon-pg-ext -j$(sysctl -n hw.ncpu)
|
||||
|
||||
- name: Build walproposer-lib
|
||||
run: make walproposer-lib -j$(sysctl -n hw.ncpu)
|
||||
|
||||
- name: Run cargo build
|
||||
run: cargo build --all --release
|
||||
@@ -126,7 +129,7 @@ jobs:
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
submodules: true
|
||||
fetch-depth: 1
|
||||
@@ -135,6 +138,9 @@ jobs:
|
||||
- name: Get postgres headers
|
||||
run: make postgres-headers -j$(nproc)
|
||||
|
||||
- name: Build walproposer-lib
|
||||
run: make walproposer-lib -j$(nproc)
|
||||
|
||||
- name: Produce the build stats
|
||||
run: cargo build --all --release --timings
|
||||
|
||||
|
||||
2
.github/workflows/release.yml
vendored
2
.github/workflows/release.yml
vendored
@@ -2,7 +2,7 @@ name: Create Release Branch
|
||||
|
||||
on:
|
||||
schedule:
|
||||
- cron: '0 7 * * 2'
|
||||
- cron: '0 7 * * 5'
|
||||
workflow_dispatch:
|
||||
|
||||
jobs:
|
||||
|
||||
1051
Cargo.lock
generated
1051
Cargo.lock
generated
File diff suppressed because it is too large
Load Diff
20
Cargo.toml
20
Cargo.toml
@@ -26,6 +26,7 @@ members = [
|
||||
"libs/tracing-utils",
|
||||
"libs/postgres_ffi/wal_craft",
|
||||
"libs/vm_monitor",
|
||||
"libs/walproposer",
|
||||
]
|
||||
|
||||
[workspace.package]
|
||||
@@ -36,6 +37,10 @@ license = "Apache-2.0"
|
||||
[workspace.dependencies]
|
||||
anyhow = { version = "1.0", features = ["backtrace"] }
|
||||
async-compression = { version = "0.4.0", features = ["tokio", "gzip"] }
|
||||
azure_core = "0.16"
|
||||
azure_identity = "0.16"
|
||||
azure_storage = "0.16"
|
||||
azure_storage_blobs = "0.16"
|
||||
flate2 = "1.0.26"
|
||||
async-stream = "0.3"
|
||||
async-trait = "0.1"
|
||||
@@ -76,6 +81,7 @@ hex = "0.4"
|
||||
hex-literal = "0.4"
|
||||
hmac = "0.12.1"
|
||||
hostname = "0.3.1"
|
||||
http-types = "2"
|
||||
humantime = "2.1"
|
||||
humantime-serde = "1.1.1"
|
||||
hyper = "0.14"
|
||||
@@ -118,6 +124,7 @@ sentry = { version = "0.31", default-features = false, features = ["backtrace",
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
serde_json = "1"
|
||||
serde_with = "2.0"
|
||||
serde_assert = "0.5.0"
|
||||
sha2 = "0.10.2"
|
||||
signal-hook = "0.3"
|
||||
smallvec = "1.11"
|
||||
@@ -155,11 +162,11 @@ env_logger = "0.10"
|
||||
log = "0.4"
|
||||
|
||||
## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
|
||||
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
|
||||
postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
|
||||
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
|
||||
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
|
||||
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
|
||||
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
|
||||
postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
|
||||
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
|
||||
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
|
||||
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
|
||||
|
||||
## Other git libraries
|
||||
heapless = { default-features=false, features=[], git = "https://github.com/japaric/heapless.git", rev = "644653bf3b831c6bb4963be2de24804acf5e5001" } # upstream release pending
|
||||
@@ -180,6 +187,7 @@ tenant_size_model = { version = "0.1", path = "./libs/tenant_size_model/" }
|
||||
tracing-utils = { version = "0.1", path = "./libs/tracing-utils/" }
|
||||
utils = { version = "0.1", path = "./libs/utils/" }
|
||||
vm_monitor = { version = "0.1", path = "./libs/vm_monitor/" }
|
||||
walproposer = { version = "0.1", path = "./libs/walproposer/" }
|
||||
|
||||
## Common library dependency
|
||||
workspace_hack = { version = "0.1", path = "./workspace_hack/" }
|
||||
@@ -195,7 +203,7 @@ tonic-build = "0.9"
|
||||
|
||||
# This is only needed for proxy's tests.
|
||||
# TODO: we should probably fork `tokio-postgres-rustls` instead.
|
||||
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" }
|
||||
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
|
||||
|
||||
################# Binary contents sections
|
||||
|
||||
|
||||
@@ -224,8 +224,8 @@ RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz -
|
||||
FROM build-deps AS vector-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.5.0.tar.gz -O pgvector.tar.gz && \
|
||||
echo "d8aa3504b215467ca528525a6de12c3f85f9891b091ce0e5864dd8a9b757f77b pgvector.tar.gz" | sha256sum --check && \
|
||||
RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.5.1.tar.gz -O pgvector.tar.gz && \
|
||||
echo "cc7a8e034a96e30a819911ac79d32f6bc47bdd1aa2de4d7d4904e26b83209dc8 pgvector.tar.gz" | sha256sum --check && \
|
||||
mkdir pgvector-src && cd pgvector-src && tar xvzf ../pgvector.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
|
||||
@@ -368,8 +368,8 @@ RUN wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.18.tar
|
||||
FROM build-deps AS plpgsql-check-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.4.0.tar.gz -O plpgsql_check.tar.gz && \
|
||||
echo "9ba58387a279b35a3bfa39ee611e5684e6cddb2ba046ddb2c5190b3bd2ca254a plpgsql_check.tar.gz" | sha256sum --check && \
|
||||
RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.5.3.tar.gz -O plpgsql_check.tar.gz && \
|
||||
echo "6631ec3e7fb3769eaaf56e3dfedb829aa761abf163d13dba354b4c218508e1c0 plpgsql_check.tar.gz" | sha256sum --check && \
|
||||
mkdir plpgsql_check-src && cd plpgsql_check-src && tar xvzf ../plpgsql_check.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
|
||||
|
||||
38
Makefile
38
Makefile
@@ -62,7 +62,7 @@ all: neon postgres neon-pg-ext
|
||||
#
|
||||
# The 'postgres_ffi' depends on the Postgres headers.
|
||||
.PHONY: neon
|
||||
neon: postgres-headers
|
||||
neon: postgres-headers walproposer-lib
|
||||
+@echo "Compiling Neon"
|
||||
$(CARGO_CMD_PREFIX) cargo build $(CARGO_BUILD_FLAGS)
|
||||
|
||||
@@ -168,6 +168,42 @@ neon-pg-ext-clean-%:
|
||||
-C $(POSTGRES_INSTALL_DIR)/build/neon-utils-$* \
|
||||
-f $(ROOT_PROJECT_DIR)/pgxn/neon_utils/Makefile clean
|
||||
|
||||
# Build walproposer as a static library. walproposer source code is located
|
||||
# in the pgxn/neon directory.
|
||||
#
|
||||
# We also need to include libpgport.a and libpgcommon.a, because walproposer
|
||||
# uses some functions from those libraries.
|
||||
#
|
||||
# Some object files are removed from libpgport.a and libpgcommon.a because
|
||||
# they depend on openssl and other libraries that are not included in our
|
||||
# Rust build.
|
||||
.PHONY: walproposer-lib
|
||||
walproposer-lib: neon-pg-ext-v16
|
||||
+@echo "Compiling walproposer-lib"
|
||||
mkdir -p $(POSTGRES_INSTALL_DIR)/build/walproposer-lib
|
||||
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v16/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
|
||||
-C $(POSTGRES_INSTALL_DIR)/build/walproposer-lib \
|
||||
-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile walproposer-lib
|
||||
cp $(POSTGRES_INSTALL_DIR)/v16/lib/libpgport.a $(POSTGRES_INSTALL_DIR)/build/walproposer-lib
|
||||
cp $(POSTGRES_INSTALL_DIR)/v16/lib/libpgcommon.a $(POSTGRES_INSTALL_DIR)/build/walproposer-lib
|
||||
ifeq ($(UNAME_S),Linux)
|
||||
$(AR) d $(POSTGRES_INSTALL_DIR)/build/walproposer-lib/libpgport.a \
|
||||
pg_strong_random.o
|
||||
$(AR) d $(POSTGRES_INSTALL_DIR)/build/walproposer-lib/libpgcommon.a \
|
||||
pg_crc32c.o \
|
||||
hmac_openssl.o \
|
||||
cryptohash_openssl.o \
|
||||
scram-common.o \
|
||||
md5_common.o \
|
||||
checksum_helper.o
|
||||
endif
|
||||
|
||||
.PHONY: walproposer-lib-clean
|
||||
walproposer-lib-clean:
|
||||
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v16/bin/pg_config \
|
||||
-C $(POSTGRES_INSTALL_DIR)/build/walproposer-lib \
|
||||
-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile clean
|
||||
|
||||
.PHONY: neon-pg-ext
|
||||
neon-pg-ext: \
|
||||
neon-pg-ext-v14 \
|
||||
|
||||
4
NOTICE
4
NOTICE
@@ -1,5 +1,5 @@
|
||||
Neon
|
||||
Copyright 2022 Neon Inc.
|
||||
|
||||
The PostgreSQL submodules in vendor/postgres-v14 and vendor/postgres-v15 are licensed under the
|
||||
PostgreSQL license. See vendor/postgres-v14/COPYRIGHT and vendor/postgres-v15/COPYRIGHT.
|
||||
The PostgreSQL submodules in vendor/ are licensed under the PostgreSQL license.
|
||||
See vendor/postgres-vX/COPYRIGHT for details.
|
||||
|
||||
@@ -156,6 +156,7 @@ fn main() -> Result<()> {
|
||||
let path = Path::new(sp);
|
||||
let file = File::open(path)?;
|
||||
spec = Some(serde_json::from_reader(file)?);
|
||||
live_config_allowed = true;
|
||||
} else if let Some(id) = compute_id {
|
||||
if let Some(cp_base) = control_plane_uri {
|
||||
live_config_allowed = true;
|
||||
@@ -277,32 +278,26 @@ fn main() -> Result<()> {
|
||||
if #[cfg(target_os = "linux")] {
|
||||
use std::env;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::warn;
|
||||
let vm_monitor_addr = matches.get_one::<String>("vm-monitor-addr");
|
||||
let vm_monitor_addr = matches
|
||||
.get_one::<String>("vm-monitor-addr")
|
||||
.expect("--vm-monitor-addr should always be set because it has a default arg");
|
||||
let file_cache_connstr = matches.get_one::<String>("filecache-connstr");
|
||||
let cgroup = matches.get_one::<String>("cgroup");
|
||||
let file_cache_on_disk = matches.get_flag("file-cache-on-disk");
|
||||
|
||||
// Only make a runtime if we need to.
|
||||
// Note: it seems like you can make a runtime in an inner scope and
|
||||
// if you start a task in it it won't be dropped. However, make it
|
||||
// in the outermost scope just to be safe.
|
||||
let rt = match (env::var_os("AUTOSCALING"), vm_monitor_addr) {
|
||||
(None, None) => None,
|
||||
(None, Some(_)) => {
|
||||
warn!("--vm-monitor-addr option set but AUTOSCALING env var not present");
|
||||
None
|
||||
}
|
||||
(Some(_), None) => {
|
||||
panic!("AUTOSCALING env var present but --vm-monitor-addr option not set")
|
||||
}
|
||||
(Some(_), Some(_)) => Some(
|
||||
let rt = if env::var_os("AUTOSCALING").is_some() {
|
||||
Some(
|
||||
tokio::runtime::Builder::new_multi_thread()
|
||||
.worker_threads(4)
|
||||
.enable_all()
|
||||
.build()
|
||||
.expect("failed to create tokio runtime for monitor"),
|
||||
),
|
||||
.expect("failed to create tokio runtime for monitor")
|
||||
)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
// This token is used internally by the monitor to clean up all threads
|
||||
@@ -313,8 +308,7 @@ fn main() -> Result<()> {
|
||||
Box::leak(Box::new(vm_monitor::Args {
|
||||
cgroup: cgroup.cloned(),
|
||||
pgconnstr: file_cache_connstr.cloned(),
|
||||
addr: vm_monitor_addr.cloned().unwrap(),
|
||||
file_cache_on_disk,
|
||||
addr: vm_monitor_addr.clone(),
|
||||
})),
|
||||
token.clone(),
|
||||
))
|
||||
@@ -486,6 +480,8 @@ fn cli() -> clap::Command {
|
||||
.value_name("FILECACHE_CONNSTR"),
|
||||
)
|
||||
.arg(
|
||||
// DEPRECATED, NO LONGER DOES ANYTHING.
|
||||
// See https://github.com/neondatabase/cloud/issues/7516
|
||||
Arg::new("file-cache-on-disk")
|
||||
.long("file-cache-on-disk")
|
||||
.action(clap::ArgAction::SetTrue),
|
||||
|
||||
@@ -252,7 +252,7 @@ fn create_neon_superuser(spec: &ComputeSpec, client: &mut Client) -> Result<()>
|
||||
IF NOT EXISTS (
|
||||
SELECT FROM pg_catalog.pg_roles WHERE rolname = 'neon_superuser')
|
||||
THEN
|
||||
CREATE ROLE neon_superuser CREATEDB CREATEROLE NOLOGIN IN ROLE pg_read_all_data, pg_write_all_data;
|
||||
CREATE ROLE neon_superuser CREATEDB CREATEROLE NOLOGIN REPLICATION IN ROLE pg_read_all_data, pg_write_all_data;
|
||||
IF array_length(roles, 1) IS NOT NULL THEN
|
||||
EXECUTE format('GRANT neon_superuser TO %s',
|
||||
array_to_string(ARRAY(SELECT quote_ident(x) FROM unnest(roles) as x), ', '));
|
||||
@@ -692,10 +692,11 @@ impl ComputeNode {
|
||||
// Proceed with post-startup configuration. Note, that order of operations is important.
|
||||
let spec = &compute_state.pspec.as_ref().expect("spec must be set").spec;
|
||||
create_neon_superuser(spec, &mut client)?;
|
||||
cleanup_instance(&mut client)?;
|
||||
handle_roles(spec, &mut client)?;
|
||||
handle_databases(spec, &mut client)?;
|
||||
handle_role_deletions(spec, self.connstr.as_str(), &mut client)?;
|
||||
handle_grants(spec, self.connstr.as_str())?;
|
||||
handle_grants(spec, &mut client, self.connstr.as_str())?;
|
||||
handle_extensions(spec, &mut client)?;
|
||||
create_availability_check_data(&mut client)?;
|
||||
|
||||
@@ -731,10 +732,11 @@ impl ComputeNode {
|
||||
// Disable DDL forwarding because control plane already knows about these roles/databases.
|
||||
if spec.mode == ComputeMode::Primary {
|
||||
client.simple_query("SET neon.forward_ddl = false")?;
|
||||
cleanup_instance(&mut client)?;
|
||||
handle_roles(&spec, &mut client)?;
|
||||
handle_databases(&spec, &mut client)?;
|
||||
handle_role_deletions(&spec, self.connstr.as_str(), &mut client)?;
|
||||
handle_grants(&spec, self.connstr.as_str())?;
|
||||
handle_grants(&spec, &mut client, self.connstr.as_str())?;
|
||||
handle_extensions(&spec, &mut client)?;
|
||||
}
|
||||
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
use std::collections::HashMap;
|
||||
use std::fmt::Write;
|
||||
use std::fs;
|
||||
use std::fs::File;
|
||||
@@ -192,11 +193,16 @@ impl Escaping for PgIdent {
|
||||
/// Build a list of existing Postgres roles
|
||||
pub fn get_existing_roles(xact: &mut Transaction<'_>) -> Result<Vec<Role>> {
|
||||
let postgres_roles = xact
|
||||
.query("SELECT rolname, rolpassword FROM pg_catalog.pg_authid", &[])?
|
||||
.query(
|
||||
"SELECT rolname, rolpassword, rolreplication, rolbypassrls FROM pg_catalog.pg_authid",
|
||||
&[],
|
||||
)?
|
||||
.iter()
|
||||
.map(|row| Role {
|
||||
name: row.get("rolname"),
|
||||
encrypted_password: row.get("rolpassword"),
|
||||
replication: Some(row.get("rolreplication")),
|
||||
bypassrls: Some(row.get("rolbypassrls")),
|
||||
options: None,
|
||||
})
|
||||
.collect();
|
||||
@@ -205,22 +211,37 @@ pub fn get_existing_roles(xact: &mut Transaction<'_>) -> Result<Vec<Role>> {
|
||||
}
|
||||
|
||||
/// Build a list of existing Postgres databases
|
||||
pub fn get_existing_dbs(client: &mut Client) -> Result<Vec<Database>> {
|
||||
let postgres_dbs = client
|
||||
pub fn get_existing_dbs(client: &mut Client) -> Result<HashMap<String, Database>> {
|
||||
// `pg_database.datconnlimit = -2` means that the database is in the
|
||||
// invalid state. See:
|
||||
// https://github.com/postgres/postgres/commit/a4b4cc1d60f7e8ccfcc8ff8cb80c28ee411ad9a9
|
||||
let postgres_dbs: Vec<Database> = client
|
||||
.query(
|
||||
"SELECT datname, datdba::regrole::text as owner
|
||||
FROM pg_catalog.pg_database;",
|
||||
"SELECT
|
||||
datname AS name,
|
||||
datdba::regrole::text AS owner,
|
||||
NOT datallowconn AS restrict_conn,
|
||||
datconnlimit = - 2 AS invalid
|
||||
FROM
|
||||
pg_catalog.pg_database;",
|
||||
&[],
|
||||
)?
|
||||
.iter()
|
||||
.map(|row| Database {
|
||||
name: row.get("datname"),
|
||||
name: row.get("name"),
|
||||
owner: row.get("owner"),
|
||||
restrict_conn: row.get("restrict_conn"),
|
||||
invalid: row.get("invalid"),
|
||||
options: None,
|
||||
})
|
||||
.collect();
|
||||
|
||||
Ok(postgres_dbs)
|
||||
let dbs_map = postgres_dbs
|
||||
.iter()
|
||||
.map(|db| (db.name.clone(), db.clone()))
|
||||
.collect::<HashMap<_, _>>();
|
||||
|
||||
Ok(dbs_map)
|
||||
}
|
||||
|
||||
/// Wait for Postgres to become ready to accept connections. It's ready to
|
||||
|
||||
@@ -13,7 +13,7 @@ use crate::params::PG_HBA_ALL_MD5;
|
||||
use crate::pg_helpers::*;
|
||||
|
||||
use compute_api::responses::{ControlPlaneComputeStatus, ControlPlaneSpecResponse};
|
||||
use compute_api::spec::{ComputeSpec, Database, PgIdent, Role};
|
||||
use compute_api::spec::{ComputeSpec, PgIdent, Role};
|
||||
|
||||
// Do control plane request and return response if any. In case of error it
|
||||
// returns a bool flag indicating whether it makes sense to retry the request
|
||||
@@ -24,7 +24,7 @@ fn do_control_plane_request(
|
||||
) -> Result<ControlPlaneSpecResponse, (bool, String)> {
|
||||
let resp = reqwest::blocking::Client::new()
|
||||
.get(uri)
|
||||
.header("Authorization", jwt)
|
||||
.header("Authorization", format!("Bearer {}", jwt))
|
||||
.send()
|
||||
.map_err(|e| {
|
||||
(
|
||||
@@ -68,7 +68,7 @@ pub fn get_spec_from_control_plane(
|
||||
base_uri: &str,
|
||||
compute_id: &str,
|
||||
) -> Result<Option<ComputeSpec>> {
|
||||
let cp_uri = format!("{base_uri}/management/api/v2/computes/{compute_id}/spec");
|
||||
let cp_uri = format!("{base_uri}/compute/api/v2/computes/{compute_id}/spec");
|
||||
let jwt: String = match std::env::var("NEON_CONTROL_PLANE_TOKEN") {
|
||||
Ok(v) => v,
|
||||
Err(_) => "".to_string(),
|
||||
@@ -161,6 +161,38 @@ pub fn add_standby_signal(pgdata_path: &Path) -> Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Compute could be unexpectedly shut down, for example, during the
|
||||
/// database dropping. This leaves the database in the invalid state,
|
||||
/// which prevents new db creation with the same name. This function
|
||||
/// will clean it up before proceeding with catalog updates. All
|
||||
/// possible future cleanup operations may go here too.
|
||||
#[instrument(skip_all)]
|
||||
pub fn cleanup_instance(client: &mut Client) -> Result<()> {
|
||||
let existing_dbs = get_existing_dbs(client)?;
|
||||
|
||||
for (_, db) in existing_dbs {
|
||||
if db.invalid {
|
||||
// After recent commit in Postgres, interrupted DROP DATABASE
|
||||
// leaves the database in the invalid state. According to the
|
||||
// commit message, the only option for user is to drop it again.
|
||||
// See:
|
||||
// https://github.com/postgres/postgres/commit/a4b4cc1d60f7e8ccfcc8ff8cb80c28ee411ad9a9
|
||||
//
|
||||
// Postgres Neon extension is done the way, that db is de-registered
|
||||
// in the control plane metadata only after it is dropped. So there is
|
||||
// a chance that it still thinks that db should exist. This means
|
||||
// that it will be re-created by `handle_databases()`. Yet, it's fine
|
||||
// as user can just repeat drop (in vanilla Postgres they would need
|
||||
// to do the same, btw).
|
||||
let query = format!("DROP DATABASE IF EXISTS {}", db.name.pg_quote());
|
||||
info!("dropping invalid database {}", db.name);
|
||||
client.execute(query.as_str(), &[])?;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Given a cluster spec json and open transaction it handles roles creation,
|
||||
/// deletion and update.
|
||||
#[instrument(skip_all)]
|
||||
@@ -233,6 +265,8 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
||||
let action = if let Some(r) = pg_role {
|
||||
if (r.encrypted_password.is_none() && role.encrypted_password.is_some())
|
||||
|| (r.encrypted_password.is_some() && role.encrypted_password.is_none())
|
||||
|| !r.bypassrls.unwrap_or(false)
|
||||
|| !r.replication.unwrap_or(false)
|
||||
{
|
||||
RoleAction::Update
|
||||
} else if let Some(pg_pwd) = &r.encrypted_password {
|
||||
@@ -264,13 +298,14 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
||||
match action {
|
||||
RoleAction::None => {}
|
||||
RoleAction::Update => {
|
||||
let mut query: String = format!("ALTER ROLE {} ", name.pg_quote());
|
||||
let mut query: String =
|
||||
format!("ALTER ROLE {} BYPASSRLS REPLICATION", name.pg_quote());
|
||||
query.push_str(&role.to_pg_options());
|
||||
xact.execute(query.as_str(), &[])?;
|
||||
}
|
||||
RoleAction::Create => {
|
||||
let mut query: String = format!(
|
||||
"CREATE ROLE {} CREATEROLE CREATEDB BYPASSRLS IN ROLE neon_superuser",
|
||||
"CREATE ROLE {} CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser",
|
||||
name.pg_quote()
|
||||
);
|
||||
info!("role create query: '{}'", &query);
|
||||
@@ -379,13 +414,13 @@ fn reassign_owned_objects(spec: &ComputeSpec, connstr: &str, role_name: &PgIdent
|
||||
/// which together provide us idempotency.
|
||||
#[instrument(skip_all)]
|
||||
pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
||||
let existing_dbs: Vec<Database> = get_existing_dbs(client)?;
|
||||
let existing_dbs = get_existing_dbs(client)?;
|
||||
|
||||
// Print a list of existing Postgres databases (only in debug mode)
|
||||
if span_enabled!(Level::INFO) {
|
||||
info!("postgres databases:");
|
||||
for r in &existing_dbs {
|
||||
info!(" {}:{}", r.name, r.owner);
|
||||
for (dbname, db) in &existing_dbs {
|
||||
info!(" {}:{}", dbname, db.owner);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -439,8 +474,7 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
||||
"rename_db" => {
|
||||
let new_name = op.new_name.as_ref().unwrap();
|
||||
|
||||
// XXX: with a limited number of roles it is fine, but consider making it a HashMap
|
||||
if existing_dbs.iter().any(|r| r.name == op.name) {
|
||||
if existing_dbs.get(&op.name).is_some() {
|
||||
let query: String = format!(
|
||||
"ALTER DATABASE {} RENAME TO {}",
|
||||
op.name.pg_quote(),
|
||||
@@ -457,14 +491,12 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
||||
}
|
||||
|
||||
// Refresh Postgres databases info to handle possible renames
|
||||
let existing_dbs: Vec<Database> = get_existing_dbs(client)?;
|
||||
let existing_dbs = get_existing_dbs(client)?;
|
||||
|
||||
info!("cluster spec databases:");
|
||||
for db in &spec.cluster.databases {
|
||||
let name = &db.name;
|
||||
|
||||
// XXX: with a limited number of databases it is fine, but consider making it a HashMap
|
||||
let pg_db = existing_dbs.iter().find(|r| r.name == *name);
|
||||
let pg_db = existing_dbs.get(name);
|
||||
|
||||
enum DatabaseAction {
|
||||
None,
|
||||
@@ -530,13 +562,32 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
||||
/// Grant CREATE ON DATABASE to the database owner and do some other alters and grants
|
||||
/// to allow users creating trusted extensions and re-creating `public` schema, for example.
|
||||
#[instrument(skip_all)]
|
||||
pub fn handle_grants(spec: &ComputeSpec, connstr: &str) -> Result<()> {
|
||||
info!("cluster spec grants:");
|
||||
pub fn handle_grants(spec: &ComputeSpec, client: &mut Client, connstr: &str) -> Result<()> {
|
||||
info!("modifying database permissions");
|
||||
let existing_dbs = get_existing_dbs(client)?;
|
||||
|
||||
// Do some per-database access adjustments. We'd better do this at db creation time,
|
||||
// but CREATE DATABASE isn't transactional. So we cannot create db + do some grants
|
||||
// atomically.
|
||||
for db in &spec.cluster.databases {
|
||||
match existing_dbs.get(&db.name) {
|
||||
Some(pg_db) => {
|
||||
if pg_db.restrict_conn || pg_db.invalid {
|
||||
info!(
|
||||
"skipping grants for db {} (invalid: {}, connections not allowed: {})",
|
||||
db.name, pg_db.invalid, pg_db.restrict_conn
|
||||
);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
None => {
|
||||
bail!(
|
||||
"database {} doesn't exist in Postgres after handle_databases()",
|
||||
db.name
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
let mut conf = Config::from_str(connstr)?;
|
||||
conf.dbname(&db.name);
|
||||
|
||||
@@ -575,6 +626,11 @@ pub fn handle_grants(spec: &ComputeSpec, connstr: &str) -> Result<()> {
|
||||
|
||||
// Explicitly grant CREATE ON SCHEMA PUBLIC to the web_access user.
|
||||
// This is needed because since postgres 15 this privilege is removed by default.
|
||||
// TODO: web_access isn't created for almost 1 year. It could be that we have
|
||||
// active users of 1 year old projects, but hopefully not, so check it and
|
||||
// remove this code if possible. The worst thing that could happen is that
|
||||
// user won't be able to use public schema in NEW databases created in the
|
||||
// very OLD project.
|
||||
let grant_query = "DO $$\n\
|
||||
BEGIN\n\
|
||||
IF EXISTS(\n\
|
||||
|
||||
@@ -28,7 +28,7 @@ mod pg_helpers_tests {
|
||||
assert_eq!(
|
||||
spec.cluster.settings.as_pg_settings(),
|
||||
r#"fsync = off
|
||||
wal_level = replica
|
||||
wal_level = logical
|
||||
hot_standby = on
|
||||
neon.safekeepers = '127.0.0.1:6502,127.0.0.1:6503,127.0.0.1:6501'
|
||||
wal_log_hints = on
|
||||
|
||||
@@ -2,7 +2,6 @@ use crate::{background_process, local_env::LocalEnv};
|
||||
use anyhow::anyhow;
|
||||
use camino::Utf8PathBuf;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_with::{serde_as, DisplayFromStr};
|
||||
use std::{path::PathBuf, process::Child};
|
||||
use utils::id::{NodeId, TenantId};
|
||||
|
||||
@@ -14,12 +13,10 @@ pub struct AttachmentService {
|
||||
|
||||
const COMMAND: &str = "attachment_service";
|
||||
|
||||
#[serde_as]
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct AttachHookRequest {
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub tenant_id: TenantId,
|
||||
pub pageserver_id: Option<NodeId>,
|
||||
pub node_id: Option<NodeId>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
@@ -85,7 +82,7 @@ impl AttachmentService {
|
||||
.control_plane_api
|
||||
.clone()
|
||||
.unwrap()
|
||||
.join("attach_hook")
|
||||
.join("attach-hook")
|
||||
.unwrap();
|
||||
let client = reqwest::blocking::ClientBuilder::new()
|
||||
.build()
|
||||
@@ -93,7 +90,7 @@ impl AttachmentService {
|
||||
|
||||
let request = AttachHookRequest {
|
||||
tenant_id,
|
||||
pageserver_id: Some(pageserver_id),
|
||||
node_id: Some(pageserver_id),
|
||||
};
|
||||
|
||||
let response = client.post(url).json(&request).send()?;
|
||||
|
||||
@@ -86,7 +86,7 @@ where
|
||||
.stdout(process_log_file)
|
||||
.stderr(same_file_for_stderr)
|
||||
.args(args);
|
||||
let filled_cmd = fill_aws_secrets_vars(fill_rust_env_vars(background_command));
|
||||
let filled_cmd = fill_remote_storage_secrets_vars(fill_rust_env_vars(background_command));
|
||||
filled_cmd.envs(envs);
|
||||
|
||||
let pid_file_to_check = match initial_pid_file {
|
||||
@@ -238,11 +238,13 @@ fn fill_rust_env_vars(cmd: &mut Command) -> &mut Command {
|
||||
filled_cmd
|
||||
}
|
||||
|
||||
fn fill_aws_secrets_vars(mut cmd: &mut Command) -> &mut Command {
|
||||
fn fill_remote_storage_secrets_vars(mut cmd: &mut Command) -> &mut Command {
|
||||
for env_key in [
|
||||
"AWS_ACCESS_KEY_ID",
|
||||
"AWS_SECRET_ACCESS_KEY",
|
||||
"AWS_SESSION_TOKEN",
|
||||
"AZURE_STORAGE_ACCOUNT",
|
||||
"AZURE_STORAGE_ACCESS_KEY",
|
||||
] {
|
||||
if let Ok(value) = std::env::var(env_key) {
|
||||
cmd = cmd.env(env_key, value);
|
||||
|
||||
@@ -12,7 +12,9 @@ use hyper::{Body, Request, Response};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::{collections::HashMap, sync::Arc};
|
||||
use utils::http::endpoint::request_span;
|
||||
use utils::logging::{self, LogFormat};
|
||||
use utils::signals::{ShutdownSignals, Signal};
|
||||
|
||||
use utils::{
|
||||
http::{
|
||||
@@ -170,7 +172,7 @@ async fn handle_re_attach(mut req: Request<Body>) -> Result<Response<Body>, ApiE
|
||||
state.generation += 1;
|
||||
response.tenants.push(ReAttachResponseTenant {
|
||||
id: *t,
|
||||
generation: state.generation,
|
||||
gen: state.generation,
|
||||
});
|
||||
}
|
||||
}
|
||||
@@ -216,14 +218,31 @@ async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, Ap
|
||||
.tenants
|
||||
.entry(attach_req.tenant_id)
|
||||
.or_insert_with(|| TenantState {
|
||||
pageserver: attach_req.pageserver_id,
|
||||
pageserver: attach_req.node_id,
|
||||
generation: 0,
|
||||
});
|
||||
|
||||
if attach_req.pageserver_id.is_some() {
|
||||
if let Some(attaching_pageserver) = attach_req.node_id.as_ref() {
|
||||
tenant_state.generation += 1;
|
||||
tracing::info!(
|
||||
tenant_id = %attach_req.tenant_id,
|
||||
ps_id = %attaching_pageserver,
|
||||
generation = %tenant_state.generation,
|
||||
"issuing",
|
||||
);
|
||||
} else if let Some(ps_id) = tenant_state.pageserver {
|
||||
tracing::info!(
|
||||
tenant_id = %attach_req.tenant_id,
|
||||
%ps_id,
|
||||
generation = %tenant_state.generation,
|
||||
"dropping",
|
||||
);
|
||||
} else {
|
||||
tracing::info!(
|
||||
tenant_id = %attach_req.tenant_id,
|
||||
"no-op: tenant already has no pageserver");
|
||||
}
|
||||
tenant_state.pageserver = attach_req.pageserver_id;
|
||||
tenant_state.pageserver = attach_req.node_id;
|
||||
let generation = tenant_state.generation;
|
||||
|
||||
locked.save().await.map_err(ApiError::InternalServerError)?;
|
||||
@@ -231,7 +250,7 @@ async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, Ap
|
||||
json_response(
|
||||
StatusCode::OK,
|
||||
AttachHookResponse {
|
||||
gen: attach_req.pageserver_id.map(|_| generation),
|
||||
gen: attach_req.node_id.map(|_| generation),
|
||||
},
|
||||
)
|
||||
}
|
||||
@@ -239,9 +258,9 @@ async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, Ap
|
||||
fn make_router(persistent_state: PersistentState) -> RouterBuilder<hyper::Body, ApiError> {
|
||||
endpoint::make_router()
|
||||
.data(Arc::new(State::new(persistent_state)))
|
||||
.post("/re-attach", handle_re_attach)
|
||||
.post("/validate", handle_validate)
|
||||
.post("/attach_hook", handle_attach_hook)
|
||||
.post("/re-attach", |r| request_span(r, handle_re_attach))
|
||||
.post("/validate", |r| request_span(r, handle_validate))
|
||||
.post("/attach-hook", |r| request_span(r, handle_attach_hook))
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
@@ -268,7 +287,16 @@ async fn main() -> anyhow::Result<()> {
|
||||
let server = hyper::Server::from_tcp(http_listener)?.serve(service);
|
||||
|
||||
tracing::info!("Serving on {0}", args.listen);
|
||||
server.await?;
|
||||
|
||||
tokio::task::spawn(server);
|
||||
|
||||
ShutdownSignals::handle(|signal| match signal {
|
||||
Signal::Interrupt | Signal::Terminate | Signal::Quit => {
|
||||
tracing::info!("Got {}. Terminating", signal.name());
|
||||
// We're just a test helper: no graceful shutdown.
|
||||
std::process::exit(0);
|
||||
}
|
||||
})?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -116,6 +116,7 @@ fn main() -> Result<()> {
|
||||
"attachment_service" => handle_attachment_service(sub_args, &env),
|
||||
"safekeeper" => handle_safekeeper(sub_args, &env),
|
||||
"endpoint" => handle_endpoint(sub_args, &env),
|
||||
"mappings" => handle_mappings(sub_args, &mut env),
|
||||
"pg" => bail!("'pg' subcommand has been renamed to 'endpoint'"),
|
||||
_ => bail!("unexpected subcommand {sub_name}"),
|
||||
};
|
||||
@@ -797,6 +798,24 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
|
||||
ep.start(&auth_token, safekeepers, remote_ext_config)?;
|
||||
}
|
||||
}
|
||||
"reconfigure" => {
|
||||
let endpoint_id = sub_args
|
||||
.get_one::<String>("endpoint_id")
|
||||
.ok_or_else(|| anyhow!("No endpoint ID provided to reconfigure"))?;
|
||||
let endpoint = cplane
|
||||
.endpoints
|
||||
.get(endpoint_id.as_str())
|
||||
.with_context(|| format!("postgres endpoint {endpoint_id} is not found"))?;
|
||||
let pageserver_id =
|
||||
if let Some(id_str) = sub_args.get_one::<String>("endpoint-pageserver-id") {
|
||||
Some(NodeId(
|
||||
id_str.parse().context("while parsing pageserver id")?,
|
||||
))
|
||||
} else {
|
||||
None
|
||||
};
|
||||
endpoint.reconfigure(pageserver_id)?;
|
||||
}
|
||||
"stop" => {
|
||||
let endpoint_id = sub_args
|
||||
.get_one::<String>("endpoint_id")
|
||||
@@ -816,6 +835,38 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn handle_mappings(sub_match: &ArgMatches, env: &mut local_env::LocalEnv) -> Result<()> {
|
||||
let (sub_name, sub_args) = match sub_match.subcommand() {
|
||||
Some(ep_subcommand_data) => ep_subcommand_data,
|
||||
None => bail!("no mappings subcommand provided"),
|
||||
};
|
||||
|
||||
match sub_name {
|
||||
"map" => {
|
||||
let branch_name = sub_args
|
||||
.get_one::<String>("branch-name")
|
||||
.expect("branch-name argument missing");
|
||||
|
||||
let tenant_id = sub_args
|
||||
.get_one::<String>("tenant-id")
|
||||
.map(|x| TenantId::from_str(x))
|
||||
.expect("tenant-id argument missing")
|
||||
.expect("malformed tenant-id arg");
|
||||
|
||||
let timeline_id = sub_args
|
||||
.get_one::<String>("timeline-id")
|
||||
.map(|x| TimelineId::from_str(x))
|
||||
.expect("timeline-id argument missing")
|
||||
.expect("malformed timeline-id arg");
|
||||
|
||||
env.register_branch_mapping(branch_name.to_owned(), tenant_id, timeline_id)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
other => unimplemented!("mappings subcommand {other}"),
|
||||
}
|
||||
}
|
||||
|
||||
fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
|
||||
fn get_pageserver(env: &local_env::LocalEnv, args: &ArgMatches) -> Result<PageServerNode> {
|
||||
let node_id = if let Some(id_str) = args.get_one::<String>("pageserver-id") {
|
||||
@@ -1084,6 +1135,7 @@ fn cli() -> Command {
|
||||
// --id, when using a pageserver command
|
||||
let pageserver_id_arg = Arg::new("pageserver-id")
|
||||
.long("id")
|
||||
.global(true)
|
||||
.help("pageserver id")
|
||||
.required(false);
|
||||
// --pageserver-id when using a non-pageserver command
|
||||
@@ -1254,17 +1306,20 @@ fn cli() -> Command {
|
||||
Command::new("pageserver")
|
||||
.arg_required_else_help(true)
|
||||
.about("Manage pageserver")
|
||||
.arg(pageserver_id_arg)
|
||||
.subcommand(Command::new("status"))
|
||||
.arg(pageserver_id_arg.clone())
|
||||
.subcommand(Command::new("start").about("Start local pageserver")
|
||||
.arg(pageserver_id_arg.clone())
|
||||
.arg(pageserver_config_args.clone()))
|
||||
.subcommand(Command::new("stop").about("Stop local pageserver")
|
||||
.arg(pageserver_id_arg.clone())
|
||||
.arg(stop_mode_arg.clone()))
|
||||
.subcommand(Command::new("restart").about("Restart local pageserver")
|
||||
.arg(pageserver_id_arg.clone())
|
||||
.arg(pageserver_config_args.clone()))
|
||||
.subcommand(Command::new("start")
|
||||
.about("Start local pageserver")
|
||||
.arg(pageserver_config_args.clone())
|
||||
)
|
||||
.subcommand(Command::new("stop")
|
||||
.about("Stop local pageserver")
|
||||
.arg(stop_mode_arg.clone())
|
||||
)
|
||||
.subcommand(Command::new("restart")
|
||||
.about("Restart local pageserver")
|
||||
.arg(pageserver_config_args.clone())
|
||||
)
|
||||
)
|
||||
.subcommand(
|
||||
Command::new("attachment_service")
|
||||
@@ -1321,8 +1376,8 @@ fn cli() -> Command {
|
||||
.about("Start postgres.\n If the endpoint doesn't exist yet, it is created.")
|
||||
.arg(endpoint_id_arg.clone())
|
||||
.arg(tenant_id_arg.clone())
|
||||
.arg(branch_name_arg)
|
||||
.arg(timeline_id_arg)
|
||||
.arg(branch_name_arg.clone())
|
||||
.arg(timeline_id_arg.clone())
|
||||
.arg(lsn_arg)
|
||||
.arg(pg_port_arg)
|
||||
.arg(http_port_arg)
|
||||
@@ -1332,10 +1387,16 @@ fn cli() -> Command {
|
||||
.arg(safekeepers_arg)
|
||||
.arg(remote_ext_config_args)
|
||||
)
|
||||
.subcommand(Command::new("reconfigure")
|
||||
.about("Reconfigure the endpoint")
|
||||
.arg(endpoint_pageserver_id_arg)
|
||||
.arg(endpoint_id_arg.clone())
|
||||
.arg(tenant_id_arg.clone())
|
||||
)
|
||||
.subcommand(
|
||||
Command::new("stop")
|
||||
.arg(endpoint_id_arg)
|
||||
.arg(tenant_id_arg)
|
||||
.arg(tenant_id_arg.clone())
|
||||
.arg(
|
||||
Arg::new("destroy")
|
||||
.help("Also delete data directory (now optional, should be default in future)")
|
||||
@@ -1346,6 +1407,18 @@ fn cli() -> Command {
|
||||
)
|
||||
|
||||
)
|
||||
.subcommand(
|
||||
Command::new("mappings")
|
||||
.arg_required_else_help(true)
|
||||
.about("Manage neon_local branch name mappings")
|
||||
.subcommand(
|
||||
Command::new("map")
|
||||
.about("Create new mapping which cannot exist already")
|
||||
.arg(branch_name_arg.clone())
|
||||
.arg(tenant_id_arg.clone())
|
||||
.arg(timeline_id_arg.clone())
|
||||
)
|
||||
)
|
||||
// Obsolete old name for 'endpoint'. We now just print an error if it's used.
|
||||
.subcommand(
|
||||
Command::new("pg")
|
||||
|
||||
@@ -46,7 +46,6 @@ use std::time::Duration;
|
||||
|
||||
use anyhow::{anyhow, bail, Context, Result};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_with::{serde_as, DisplayFromStr};
|
||||
use utils::id::{NodeId, TenantId, TimelineId};
|
||||
|
||||
use crate::local_env::LocalEnv;
|
||||
@@ -57,13 +56,10 @@ use compute_api::responses::{ComputeState, ComputeStatus};
|
||||
use compute_api::spec::{Cluster, ComputeMode, ComputeSpec};
|
||||
|
||||
// contents of a endpoint.json file
|
||||
#[serde_as]
|
||||
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
|
||||
pub struct EndpointConf {
|
||||
endpoint_id: String,
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
tenant_id: TenantId,
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
timeline_id: TimelineId,
|
||||
mode: ComputeMode,
|
||||
pg_port: u16,
|
||||
@@ -253,7 +249,7 @@ impl Endpoint {
|
||||
conf.append("shared_buffers", "1MB");
|
||||
conf.append("fsync", "off");
|
||||
conf.append("max_connections", "100");
|
||||
conf.append("wal_level", "replica");
|
||||
conf.append("wal_level", "logical");
|
||||
// wal_sender_timeout is the maximum time to wait for WAL replication.
|
||||
// It also defines how often the walreciever will send a feedback message to the wal sender.
|
||||
conf.append("wal_sender_timeout", "5s");
|
||||
@@ -414,18 +410,34 @@ impl Endpoint {
|
||||
);
|
||||
}
|
||||
|
||||
// Also wait for the compute_ctl process to die. It might have some cleanup
|
||||
// work to do after postgres stops, like syncing safekeepers, etc.
|
||||
//
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn wait_for_compute_ctl_to_exit(&self) -> Result<()> {
|
||||
// TODO use background_process::stop_process instead
|
||||
let pidfile_path = self.endpoint_path().join("compute_ctl.pid");
|
||||
let pid: u32 = std::fs::read_to_string(pidfile_path)?.parse()?;
|
||||
let pid = nix::unistd::Pid::from_raw(pid as i32);
|
||||
crate::background_process::wait_until_stopped("compute_ctl", pid)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn read_postgresql_conf(&self) -> Result<String> {
|
||||
// Slurp the endpoints/<endpoint id>/postgresql.conf file into
|
||||
// memory. We will include it in the spec file that we pass to
|
||||
// `compute_ctl`, and `compute_ctl` will write it to the postgresql.conf
|
||||
// in the data directory.
|
||||
let postgresql_conf_path = self.endpoint_path().join("postgresql.conf");
|
||||
match std::fs::read(&postgresql_conf_path) {
|
||||
Ok(content) => Ok(String::from_utf8(content)?),
|
||||
Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok("".to_string()),
|
||||
Err(e) => Err(anyhow::Error::new(e).context(format!(
|
||||
"failed to read config file in {}",
|
||||
postgresql_conf_path.to_str().unwrap()
|
||||
))),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn start(
|
||||
&self,
|
||||
auth_token: &Option<String>,
|
||||
@@ -436,21 +448,7 @@ impl Endpoint {
|
||||
anyhow::bail!("The endpoint is already running");
|
||||
}
|
||||
|
||||
// Slurp the endpoints/<endpoint id>/postgresql.conf file into
|
||||
// memory. We will include it in the spec file that we pass to
|
||||
// `compute_ctl`, and `compute_ctl` will write it to the postgresql.conf
|
||||
// in the data directory.
|
||||
let postgresql_conf_path = self.endpoint_path().join("postgresql.conf");
|
||||
let postgresql_conf = match std::fs::read(&postgresql_conf_path) {
|
||||
Ok(content) => String::from_utf8(content)?,
|
||||
Err(e) if e.kind() == std::io::ErrorKind::NotFound => "".to_string(),
|
||||
Err(e) => {
|
||||
return Err(anyhow::Error::new(e).context(format!(
|
||||
"failed to read config file in {}",
|
||||
postgresql_conf_path.to_str().unwrap()
|
||||
)))
|
||||
}
|
||||
};
|
||||
let postgresql_conf = self.read_postgresql_conf()?;
|
||||
|
||||
// We always start the compute node from scratch, so if the Postgres
|
||||
// data dir exists from a previous launch, remove it first.
|
||||
@@ -621,6 +619,61 @@ impl Endpoint {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn reconfigure(&self, pageserver_id: Option<NodeId>) -> Result<()> {
|
||||
let mut spec: ComputeSpec = {
|
||||
let spec_path = self.endpoint_path().join("spec.json");
|
||||
let file = std::fs::File::open(spec_path)?;
|
||||
serde_json::from_reader(file)?
|
||||
};
|
||||
|
||||
let postgresql_conf = self.read_postgresql_conf()?;
|
||||
spec.cluster.postgresql_conf = Some(postgresql_conf);
|
||||
|
||||
if let Some(pageserver_id) = pageserver_id {
|
||||
let endpoint_config_path = self.endpoint_path().join("endpoint.json");
|
||||
let mut endpoint_conf: EndpointConf = {
|
||||
let file = std::fs::File::open(&endpoint_config_path)?;
|
||||
serde_json::from_reader(file)?
|
||||
};
|
||||
endpoint_conf.pageserver_id = pageserver_id;
|
||||
std::fs::write(
|
||||
endpoint_config_path,
|
||||
serde_json::to_string_pretty(&endpoint_conf)?,
|
||||
)?;
|
||||
|
||||
let pageserver =
|
||||
PageServerNode::from_env(&self.env, self.env.get_pageserver_conf(pageserver_id)?);
|
||||
let ps_http_conf = &pageserver.pg_connection_config;
|
||||
let (host, port) = (ps_http_conf.host(), ps_http_conf.port());
|
||||
spec.pageserver_connstring = Some(format!("postgresql://no_user@{host}:{port}"));
|
||||
}
|
||||
|
||||
let client = reqwest::blocking::Client::new();
|
||||
let response = client
|
||||
.post(format!(
|
||||
"http://{}:{}/configure",
|
||||
self.http_address.ip(),
|
||||
self.http_address.port()
|
||||
))
|
||||
.body(format!(
|
||||
"{{\"spec\":{}}}",
|
||||
serde_json::to_string_pretty(&spec)?
|
||||
))
|
||||
.send()?;
|
||||
|
||||
let status = response.status();
|
||||
if !(status.is_client_error() || status.is_server_error()) {
|
||||
Ok(())
|
||||
} else {
|
||||
let url = response.url().to_owned();
|
||||
let msg = match response.text() {
|
||||
Ok(err_body) => format!("Error: {}", err_body),
|
||||
Err(_) => format!("Http error ({}) at {}.", status.as_u16(), url),
|
||||
};
|
||||
Err(anyhow::anyhow!(msg))
|
||||
}
|
||||
}
|
||||
|
||||
pub fn stop(&self, destroy: bool) -> Result<()> {
|
||||
// If we are going to destroy data directory,
|
||||
// use immediate shutdown mode, otherwise,
|
||||
@@ -629,15 +682,25 @@ impl Endpoint {
|
||||
// Postgres is always started from scratch, so stop
|
||||
// without destroy only used for testing and debugging.
|
||||
//
|
||||
self.pg_ctl(
|
||||
if destroy {
|
||||
&["-m", "immediate", "stop"]
|
||||
} else {
|
||||
&["stop"]
|
||||
},
|
||||
&None,
|
||||
)?;
|
||||
|
||||
// Also wait for the compute_ctl process to die. It might have some cleanup
|
||||
// work to do after postgres stops, like syncing safekeepers, etc.
|
||||
//
|
||||
self.wait_for_compute_ctl_to_exit()?;
|
||||
if destroy {
|
||||
self.pg_ctl(&["-m", "immediate", "stop"], &None)?;
|
||||
println!(
|
||||
"Destroying postgres data directory '{}'",
|
||||
self.pgdata().to_str().unwrap()
|
||||
);
|
||||
std::fs::remove_dir_all(self.endpoint_path())?;
|
||||
} else {
|
||||
self.pg_ctl(&["stop"], &None)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -8,7 +8,6 @@ use anyhow::{bail, ensure, Context};
|
||||
use postgres_backend::AuthType;
|
||||
use reqwest::Url;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_with::{serde_as, DisplayFromStr};
|
||||
use std::collections::HashMap;
|
||||
use std::env;
|
||||
use std::fs;
|
||||
@@ -33,7 +32,6 @@ pub const DEFAULT_PG_VERSION: u32 = 15;
|
||||
// to 'neon_local init --config=<path>' option. See control_plane/simple.conf for
|
||||
// an example.
|
||||
//
|
||||
#[serde_as]
|
||||
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
|
||||
pub struct LocalEnv {
|
||||
// Base directory for all the nodes (the pageserver, safekeepers and
|
||||
@@ -59,7 +57,6 @@ pub struct LocalEnv {
|
||||
// Default tenant ID to use with the 'neon_local' command line utility, when
|
||||
// --tenant_id is not explicitly specified.
|
||||
#[serde(default)]
|
||||
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||
pub default_tenant_id: Option<TenantId>,
|
||||
|
||||
// used to issue tokens during e.g pg start
|
||||
@@ -84,7 +81,6 @@ pub struct LocalEnv {
|
||||
// A `HashMap<String, HashMap<TenantId, TimelineId>>` would be more appropriate here,
|
||||
// but deserialization into a generic toml object as `toml::Value::try_from` fails with an error.
|
||||
// https://toml.io/en/v1.0.0 does not contain a concept of "a table inside another table".
|
||||
#[serde_as(as = "HashMap<_, Vec<(DisplayFromStr, DisplayFromStr)>>")]
|
||||
branch_name_mappings: HashMap<String, Vec<(TenantId, TimelineId)>>,
|
||||
}
|
||||
|
||||
|
||||
@@ -25,7 +25,7 @@
|
||||
},
|
||||
{
|
||||
"name": "wal_level",
|
||||
"value": "replica",
|
||||
"value": "logical",
|
||||
"vartype": "enum"
|
||||
},
|
||||
{
|
||||
|
||||
@@ -188,11 +188,60 @@ that.
|
||||
|
||||
## Error message style
|
||||
|
||||
### PostgreSQL extensions
|
||||
|
||||
PostgreSQL has a style guide for writing error messages:
|
||||
|
||||
https://www.postgresql.org/docs/current/error-style-guide.html
|
||||
|
||||
Follow that guide when writing error messages in the PostgreSQL
|
||||
extension. We don't follow it strictly in the pageserver and
|
||||
safekeeper, but the advice in the PostgreSQL style guide is generally
|
||||
good, and you can't go wrong by following it.
|
||||
extensions.
|
||||
|
||||
### Neon Rust code
|
||||
|
||||
#### Anyhow Context
|
||||
|
||||
When adding anyhow `context()`, use form `present-tense-verb+action`.
|
||||
|
||||
Example:
|
||||
- Bad: `file.metadata().context("could not get file metadata")?;`
|
||||
- Good: `file.metadata().context("get file metadata")?;`
|
||||
|
||||
#### Logging Errors
|
||||
|
||||
When logging any error `e`, use `could not {e:#}` or `failed to {e:#}`.
|
||||
|
||||
If `e` is an `anyhow` error and you want to log the backtrace that it contains,
|
||||
use `{e:?}` instead of `{e:#}`.
|
||||
|
||||
#### Rationale
|
||||
|
||||
The `{:#}` ("alternate Display") of an `anyhow` error chain is concatenation fo the contexts, using `: `.
|
||||
|
||||
For example, the following Rust code will result in output
|
||||
```
|
||||
ERROR failed to list users: load users from server: parse response: invalid json
|
||||
```
|
||||
|
||||
This is more concise / less noisy than what happens if you do `.context("could not ...")?` at each level, i.e.:
|
||||
|
||||
```
|
||||
ERROR could not list users: could not load users from server: could not parse response: invalid json
|
||||
```
|
||||
|
||||
|
||||
```rust
|
||||
fn main() {
|
||||
match list_users().context("list users") else {
|
||||
Ok(_) => ...,
|
||||
Err(e) => tracing::error!("failed to {e:#}"),
|
||||
}
|
||||
}
|
||||
fn list_users() {
|
||||
http_get_users().context("load users from server")?;
|
||||
}
|
||||
fn http_get_users() {
|
||||
let response = client....?;
|
||||
response.parse().context("parse response")?; // fails with serde error "invalid json"
|
||||
}
|
||||
```
|
||||
|
||||
@@ -96,6 +96,16 @@ prefix_in_bucket = '/test_prefix/'
|
||||
|
||||
`AWS_SECRET_ACCESS_KEY` and `AWS_ACCESS_KEY_ID` env variables can be used to specify the S3 credentials if needed.
|
||||
|
||||
or
|
||||
|
||||
```toml
|
||||
[remote_storage]
|
||||
container_name = 'some-container-name'
|
||||
container_region = 'us-east'
|
||||
prefix_in_container = '/test-prefix/'
|
||||
```
|
||||
|
||||
`AZURE_STORAGE_ACCOUNT` and `AZURE_STORAGE_ACCESS_KEY` env variables can be used to specify the azure credentials if needed.
|
||||
|
||||
## Repository background tasks
|
||||
|
||||
|
||||
108
docs/updating-postgres.md
Normal file
108
docs/updating-postgres.md
Normal file
@@ -0,0 +1,108 @@
|
||||
# Updating Postgres
|
||||
|
||||
## Minor Versions
|
||||
|
||||
When upgrading to a new minor version of Postgres, please follow these steps:
|
||||
|
||||
_Example: 15.4 is the new minor version to upgrade to from 15.3._
|
||||
|
||||
1. Clone the Neon Postgres repository if you have not done so already.
|
||||
|
||||
```shell
|
||||
git clone git@github.com:neondatabase/postgres.git
|
||||
```
|
||||
|
||||
1. Add the Postgres upstream remote.
|
||||
|
||||
```shell
|
||||
git remote add upstream https://git.postgresql.org/git/postgresql.git
|
||||
```
|
||||
|
||||
1. Create a new branch based on the stable branch you are updating.
|
||||
|
||||
```shell
|
||||
git checkout -b my-branch REL_15_STABLE_neon
|
||||
```
|
||||
|
||||
1. Tag the last commit on the stable branch you are updating.
|
||||
|
||||
```shell
|
||||
git tag REL_15_3_neon
|
||||
```
|
||||
|
||||
1. Push the new tag to the Neon Postgres repository.
|
||||
|
||||
```shell
|
||||
git push origin REL_15_3_neon
|
||||
```
|
||||
|
||||
1. Find the release tags you're looking for. They are of the form `REL_X_Y`.
|
||||
|
||||
1. Rebase the branch you created on the tag and resolve any conflicts.
|
||||
|
||||
```shell
|
||||
git fetch upstream REL_15_4
|
||||
git rebase REL_15_4
|
||||
```
|
||||
|
||||
1. Run the Postgres test suite to make sure our commits have not affected
|
||||
Postgres in a negative way.
|
||||
|
||||
```shell
|
||||
make check
|
||||
# OR
|
||||
meson test -C builddir
|
||||
```
|
||||
|
||||
1. Push your branch to the Neon Postgres repository.
|
||||
|
||||
```shell
|
||||
git push origin my-branch
|
||||
```
|
||||
|
||||
1. Clone the Neon repository if you have not done so already.
|
||||
|
||||
```shell
|
||||
git clone git@github.com:neondatabase/neon.git
|
||||
```
|
||||
|
||||
1. Create a new branch.
|
||||
|
||||
1. Change the `revisions.json` file to point at the HEAD of your Postgres
|
||||
branch.
|
||||
|
||||
1. Update the Git submodule.
|
||||
|
||||
```shell
|
||||
git submodule set-branch --branch my-branch vendor/postgres-v15
|
||||
git submodule update --remote vendor/postgres-v15
|
||||
```
|
||||
|
||||
1. Run the Neon test suite to make sure that Neon is still good to go on this
|
||||
minor Postgres release.
|
||||
|
||||
```shell
|
||||
./scripts/poetry -k pg15
|
||||
```
|
||||
|
||||
1. Commit your changes.
|
||||
|
||||
1. Create a pull request, and wait for CI to go green.
|
||||
|
||||
1. Force push the rebased Postgres branches into the Neon Postgres repository.
|
||||
|
||||
```shell
|
||||
git push --force origin my-branch:REL_15_STABLE_neon
|
||||
```
|
||||
|
||||
It may require disabling various branch protections.
|
||||
|
||||
1. Update your Neon PR to point at the branches.
|
||||
|
||||
```shell
|
||||
git submodule set-branch --branch REL_15_STABLE_neon vendor/postgres-v15
|
||||
git commit --amend --no-edit
|
||||
git push --force origin
|
||||
```
|
||||
|
||||
1. Merge the pull request after getting approval(s) and CI completion.
|
||||
@@ -6,7 +6,6 @@
|
||||
use std::collections::HashMap;
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_with::{serde_as, DisplayFromStr};
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
@@ -19,7 +18,6 @@ pub type PgIdent = String;
|
||||
|
||||
/// Cluster spec or configuration represented as an optional number of
|
||||
/// delta operations + final cluster state description.
|
||||
#[serde_as]
|
||||
#[derive(Clone, Debug, Default, Deserialize, Serialize)]
|
||||
pub struct ComputeSpec {
|
||||
pub format_version: f32,
|
||||
@@ -50,12 +48,12 @@ pub struct ComputeSpec {
|
||||
// these, and instead set the "neon.tenant_id", "neon.timeline_id",
|
||||
// etc. GUCs in cluster.settings. TODO: Once the control plane has been
|
||||
// updated to fill these fields, we can make these non optional.
|
||||
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||
pub tenant_id: Option<TenantId>,
|
||||
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||
|
||||
pub timeline_id: Option<TimelineId>,
|
||||
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||
|
||||
pub pageserver_connstring: Option<String>,
|
||||
|
||||
#[serde(default)]
|
||||
pub safekeeper_connstrings: Vec<String>,
|
||||
|
||||
@@ -140,14 +138,13 @@ impl RemoteExtSpec {
|
||||
}
|
||||
}
|
||||
|
||||
#[serde_as]
|
||||
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq, Deserialize, Serialize)]
|
||||
pub enum ComputeMode {
|
||||
/// A read-write node
|
||||
#[default]
|
||||
Primary,
|
||||
/// A read-only node, pinned at a particular LSN
|
||||
Static(#[serde_as(as = "DisplayFromStr")] Lsn),
|
||||
Static(Lsn),
|
||||
/// A read-only node that follows the tip of the branch in hot standby mode
|
||||
///
|
||||
/// Future versions may want to distinguish between replicas with hot standby
|
||||
@@ -190,6 +187,8 @@ pub struct DeltaOp {
|
||||
pub struct Role {
|
||||
pub name: PgIdent,
|
||||
pub encrypted_password: Option<String>,
|
||||
pub replication: Option<bool>,
|
||||
pub bypassrls: Option<bool>,
|
||||
pub options: GenericOptions,
|
||||
}
|
||||
|
||||
@@ -200,6 +199,12 @@ pub struct Database {
|
||||
pub name: PgIdent,
|
||||
pub owner: PgIdent,
|
||||
pub options: GenericOptions,
|
||||
// These are derived flags, not present in the spec file.
|
||||
// They are never set by the control plane.
|
||||
#[serde(skip_deserializing, default)]
|
||||
pub restrict_conn: bool,
|
||||
#[serde(skip_deserializing, default)]
|
||||
pub invalid: bool,
|
||||
}
|
||||
|
||||
/// Common type representing both SQL statement params with or without value,
|
||||
|
||||
@@ -76,7 +76,7 @@
|
||||
},
|
||||
{
|
||||
"name": "wal_level",
|
||||
"value": "replica",
|
||||
"value": "logical",
|
||||
"vartype": "enum"
|
||||
},
|
||||
{
|
||||
|
||||
@@ -89,14 +89,14 @@ pub const DISK_WRITE_SECONDS_BUCKETS: &[f64] = &[
|
||||
0.000_050, 0.000_100, 0.000_500, 0.001, 0.003, 0.005, 0.01, 0.05, 0.1, 0.3, 0.5,
|
||||
];
|
||||
|
||||
pub fn set_build_info_metric(revision: &str) {
|
||||
pub fn set_build_info_metric(revision: &str, build_tag: &str) {
|
||||
let metric = register_int_gauge_vec!(
|
||||
"libmetrics_build_info",
|
||||
"Build/version information",
|
||||
&["revision"]
|
||||
&["revision", "build_tag"]
|
||||
)
|
||||
.expect("Failed to register build info metric");
|
||||
metric.with_label_values(&[revision]).set(1);
|
||||
metric.with_label_values(&[revision, build_tag]).set(1);
|
||||
}
|
||||
|
||||
// Records I/O stats in a "cross-platform" way.
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use std::io::{Read, Result, Write};
|
||||
|
||||
/// A wrapper for an object implementing [Read](std::io::Read)
|
||||
/// A wrapper for an object implementing [Read]
|
||||
/// which allows a closure to observe the amount of bytes read.
|
||||
/// This is useful in conjunction with metrics (e.g. [IntCounter](crate::IntCounter)).
|
||||
///
|
||||
@@ -51,17 +51,17 @@ impl<'a, T> CountedReader<'a, T> {
|
||||
}
|
||||
}
|
||||
|
||||
/// Get an immutable reference to the underlying [Read](std::io::Read) implementor
|
||||
/// Get an immutable reference to the underlying [Read] implementor
|
||||
pub fn inner(&self) -> &T {
|
||||
&self.reader
|
||||
}
|
||||
|
||||
/// Get a mutable reference to the underlying [Read](std::io::Read) implementor
|
||||
/// Get a mutable reference to the underlying [Read] implementor
|
||||
pub fn inner_mut(&mut self) -> &mut T {
|
||||
&mut self.reader
|
||||
}
|
||||
|
||||
/// Consume the wrapper and return the underlying [Read](std::io::Read) implementor
|
||||
/// Consume the wrapper and return the underlying [Read] implementor
|
||||
pub fn into_inner(self) -> T {
|
||||
self.reader
|
||||
}
|
||||
@@ -75,7 +75,7 @@ impl<T: Read> Read for CountedReader<'_, T> {
|
||||
}
|
||||
}
|
||||
|
||||
/// A wrapper for an object implementing [Write](std::io::Write)
|
||||
/// A wrapper for an object implementing [Write]
|
||||
/// which allows a closure to observe the amount of bytes written.
|
||||
/// This is useful in conjunction with metrics (e.g. [IntCounter](crate::IntCounter)).
|
||||
///
|
||||
@@ -122,17 +122,17 @@ impl<'a, T> CountedWriter<'a, T> {
|
||||
}
|
||||
}
|
||||
|
||||
/// Get an immutable reference to the underlying [Write](std::io::Write) implementor
|
||||
/// Get an immutable reference to the underlying [Write] implementor
|
||||
pub fn inner(&self) -> &T {
|
||||
&self.writer
|
||||
}
|
||||
|
||||
/// Get a mutable reference to the underlying [Write](std::io::Write) implementor
|
||||
/// Get a mutable reference to the underlying [Write] implementor
|
||||
pub fn inner_mut(&mut self) -> &mut T {
|
||||
&mut self.writer
|
||||
}
|
||||
|
||||
/// Consume the wrapper and return the underlying [Write](std::io::Write) implementor
|
||||
/// Consume the wrapper and return the underlying [Write] implementor
|
||||
pub fn into_inner(self) -> T {
|
||||
self.writer
|
||||
}
|
||||
|
||||
@@ -4,7 +4,6 @@
|
||||
//! See docs/rfcs/025-generation-numbers.md
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_with::{serde_as, DisplayFromStr};
|
||||
use utils::id::{NodeId, TenantId};
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
@@ -12,12 +11,10 @@ pub struct ReAttachRequest {
|
||||
pub node_id: NodeId,
|
||||
}
|
||||
|
||||
#[serde_as]
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct ReAttachResponseTenant {
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub id: TenantId,
|
||||
pub generation: u32,
|
||||
pub gen: u32,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
@@ -25,10 +22,8 @@ pub struct ReAttachResponse {
|
||||
pub tenants: Vec<ReAttachResponseTenant>,
|
||||
}
|
||||
|
||||
#[serde_as]
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct ValidateRequestTenant {
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub id: TenantId,
|
||||
pub gen: u32,
|
||||
}
|
||||
@@ -43,10 +38,8 @@ pub struct ValidateResponse {
|
||||
pub tenants: Vec<ValidateResponseTenant>,
|
||||
}
|
||||
|
||||
#[serde_as]
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct ValidateResponseTenant {
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub id: TenantId,
|
||||
pub valid: bool,
|
||||
}
|
||||
|
||||
@@ -6,7 +6,7 @@ use std::{
|
||||
|
||||
use byteorder::{BigEndian, ReadBytesExt};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_with::{serde_as, DisplayFromStr};
|
||||
use serde_with::serde_as;
|
||||
use strum_macros;
|
||||
use utils::{
|
||||
completion,
|
||||
@@ -110,7 +110,6 @@ impl TenantState {
|
||||
// So, return `Maybe` while Attaching, making Console wait for the attach task to finish.
|
||||
Self::Attaching | Self::Activating(ActivatingFrom::Attaching) => Maybe,
|
||||
// tenant mgr startup distinguishes attaching from loading via marker file.
|
||||
// If it's loading, there is no attach marker file, i.e., attach had finished in the past.
|
||||
Self::Loading | Self::Activating(ActivatingFrom::Loading) => Attached,
|
||||
// We only reach Active after successful load / attach.
|
||||
// So, call atttachment status Attached.
|
||||
@@ -175,25 +174,19 @@ pub enum TimelineState {
|
||||
Broken { reason: String, backtrace: String },
|
||||
}
|
||||
|
||||
#[serde_as]
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct TimelineCreateRequest {
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub new_timeline_id: TimelineId,
|
||||
#[serde(default)]
|
||||
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||
pub ancestor_timeline_id: Option<TimelineId>,
|
||||
#[serde(default)]
|
||||
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||
pub ancestor_start_lsn: Option<Lsn>,
|
||||
pub pg_version: Option<u32>,
|
||||
}
|
||||
|
||||
#[serde_as]
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
#[serde(deny_unknown_fields)]
|
||||
pub struct TenantCreateRequest {
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub new_tenant_id: TenantId,
|
||||
#[serde(default)]
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
@@ -202,7 +195,6 @@ pub struct TenantCreateRequest {
|
||||
pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it
|
||||
}
|
||||
|
||||
#[serde_as]
|
||||
#[derive(Deserialize, Debug)]
|
||||
#[serde(deny_unknown_fields)]
|
||||
pub struct TenantLoadRequest {
|
||||
@@ -279,31 +271,26 @@ pub struct LocationConfig {
|
||||
pub tenant_conf: TenantConfig,
|
||||
}
|
||||
|
||||
#[serde_as]
|
||||
#[derive(Serialize, Deserialize)]
|
||||
#[serde(transparent)]
|
||||
pub struct TenantCreateResponse(#[serde_as(as = "DisplayFromStr")] pub TenantId);
|
||||
pub struct TenantCreateResponse(pub TenantId);
|
||||
|
||||
#[derive(Serialize)]
|
||||
pub struct StatusResponse {
|
||||
pub id: NodeId,
|
||||
}
|
||||
|
||||
#[serde_as]
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
#[serde(deny_unknown_fields)]
|
||||
pub struct TenantLocationConfigRequest {
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub tenant_id: TenantId,
|
||||
#[serde(flatten)]
|
||||
pub config: LocationConfig, // as we have a flattened field, we should reject all unknown fields in it
|
||||
}
|
||||
|
||||
#[serde_as]
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
#[serde(deny_unknown_fields)]
|
||||
pub struct TenantConfigRequest {
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub tenant_id: TenantId,
|
||||
#[serde(flatten)]
|
||||
pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it
|
||||
@@ -375,10 +362,8 @@ pub enum TenantAttachmentStatus {
|
||||
Failed { reason: String },
|
||||
}
|
||||
|
||||
#[serde_as]
|
||||
#[derive(Serialize, Deserialize, Clone)]
|
||||
pub struct TenantInfo {
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub id: TenantId,
|
||||
// NB: intentionally not part of OpenAPI, we don't want to commit to a specific set of TenantState's
|
||||
pub state: TenantState,
|
||||
@@ -389,33 +374,22 @@ pub struct TenantInfo {
|
||||
}
|
||||
|
||||
/// This represents the output of the "timeline_detail" and "timeline_list" API calls.
|
||||
#[serde_as]
|
||||
#[derive(Debug, Serialize, Deserialize, Clone)]
|
||||
pub struct TimelineInfo {
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub tenant_id: TenantId,
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub timeline_id: TimelineId,
|
||||
|
||||
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||
pub ancestor_timeline_id: Option<TimelineId>,
|
||||
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||
pub ancestor_lsn: Option<Lsn>,
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub last_record_lsn: Lsn,
|
||||
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||
pub prev_record_lsn: Option<Lsn>,
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub latest_gc_cutoff_lsn: Lsn,
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub disk_consistent_lsn: Lsn,
|
||||
|
||||
/// The LSN that we have succesfully uploaded to remote storage
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub remote_consistent_lsn: Lsn,
|
||||
|
||||
/// The LSN that we are advertizing to safekeepers
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub remote_consistent_lsn_visible: Lsn,
|
||||
|
||||
pub current_logical_size: Option<u64>, // is None when timeline is Unloaded
|
||||
@@ -427,7 +401,6 @@ pub struct TimelineInfo {
|
||||
pub timeline_dir_layer_file_size_sum: Option<u64>,
|
||||
|
||||
pub wal_source_connstr: Option<String>,
|
||||
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||
pub last_received_msg_lsn: Option<Lsn>,
|
||||
/// the timestamp (in microseconds) of the last received message
|
||||
pub last_received_msg_ts: Option<u128>,
|
||||
@@ -524,23 +497,13 @@ pub struct LayerAccessStats {
|
||||
pub residence_events_history: HistoryBufferWithDropCounter<LayerResidenceEvent, 16>,
|
||||
}
|
||||
|
||||
#[serde_as]
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
#[serde(tag = "kind")]
|
||||
pub enum InMemoryLayerInfo {
|
||||
Open {
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
lsn_start: Lsn,
|
||||
},
|
||||
Frozen {
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
lsn_start: Lsn,
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
lsn_end: Lsn,
|
||||
},
|
||||
Open { lsn_start: Lsn },
|
||||
Frozen { lsn_start: Lsn, lsn_end: Lsn },
|
||||
}
|
||||
|
||||
#[serde_as]
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
#[serde(tag = "kind")]
|
||||
pub enum HistoricLayerInfo {
|
||||
@@ -548,9 +511,7 @@ pub enum HistoricLayerInfo {
|
||||
layer_file_name: String,
|
||||
layer_file_size: u64,
|
||||
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
lsn_start: Lsn,
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
lsn_end: Lsn,
|
||||
remote: bool,
|
||||
access_stats: LayerAccessStats,
|
||||
@@ -559,7 +520,6 @@ pub enum HistoricLayerInfo {
|
||||
layer_file_name: String,
|
||||
layer_file_size: u64,
|
||||
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
lsn_start: Lsn,
|
||||
remote: bool,
|
||||
access_stats: LayerAccessStats,
|
||||
|
||||
@@ -22,9 +22,9 @@ use postgres_ffi::Oid;
|
||||
/// [See more related comments here](https:///github.com/postgres/postgres/blob/99c5852e20a0987eca1c38ba0c09329d4076b6a0/src/include/storage/relfilenode.h#L57).
|
||||
///
|
||||
// FIXME: should move 'forknum' as last field to keep this consistent with Postgres.
|
||||
// Then we could replace the custo Ord and PartialOrd implementations below with
|
||||
// deriving them.
|
||||
#[derive(Debug, PartialEq, Eq, Hash, Clone, Copy, Serialize, Deserialize)]
|
||||
// Then we could replace the custom Ord and PartialOrd implementations below with
|
||||
// deriving them. This will require changes in walredoproc.c.
|
||||
#[derive(Debug, PartialEq, Eq, Hash, Clone, Copy, Serialize)]
|
||||
pub struct RelTag {
|
||||
pub forknum: u8,
|
||||
pub spcnode: Oid,
|
||||
@@ -40,21 +40,9 @@ impl PartialOrd for RelTag {
|
||||
|
||||
impl Ord for RelTag {
|
||||
fn cmp(&self, other: &Self) -> Ordering {
|
||||
let mut cmp = self.spcnode.cmp(&other.spcnode);
|
||||
if cmp != Ordering::Equal {
|
||||
return cmp;
|
||||
}
|
||||
cmp = self.dbnode.cmp(&other.dbnode);
|
||||
if cmp != Ordering::Equal {
|
||||
return cmp;
|
||||
}
|
||||
cmp = self.relnode.cmp(&other.relnode);
|
||||
if cmp != Ordering::Equal {
|
||||
return cmp;
|
||||
}
|
||||
cmp = self.forknum.cmp(&other.forknum);
|
||||
|
||||
cmp
|
||||
// Custom ordering where we put forknum to the end of the list
|
||||
let other_tup = (other.spcnode, other.dbnode, other.relnode, other.forknum);
|
||||
(self.spcnode, self.dbnode, self.relnode, self.forknum).cmp(&other_tup)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -19,8 +19,8 @@ use tracing::{debug, error, info, trace};
|
||||
|
||||
use pq_proto::framed::{ConnectionError, Framed, FramedReader, FramedWriter};
|
||||
use pq_proto::{
|
||||
BeMessage, FeMessage, FeStartupPacket, ProtocolError, SQLSTATE_INTERNAL_ERROR,
|
||||
SQLSTATE_SUCCESSFUL_COMPLETION,
|
||||
BeMessage, FeMessage, FeStartupPacket, ProtocolError, SQLSTATE_ADMIN_SHUTDOWN,
|
||||
SQLSTATE_INTERNAL_ERROR, SQLSTATE_SUCCESSFUL_COMPLETION,
|
||||
};
|
||||
|
||||
/// An error, occurred during query processing:
|
||||
@@ -30,6 +30,9 @@ pub enum QueryError {
|
||||
/// The connection was lost while processing the query.
|
||||
#[error(transparent)]
|
||||
Disconnected(#[from] ConnectionError),
|
||||
/// We were instructed to shutdown while processing the query
|
||||
#[error("Shutting down")]
|
||||
Shutdown,
|
||||
/// Some other error
|
||||
#[error(transparent)]
|
||||
Other(#[from] anyhow::Error),
|
||||
@@ -44,7 +47,8 @@ impl From<io::Error> for QueryError {
|
||||
impl QueryError {
|
||||
pub fn pg_error_code(&self) -> &'static [u8; 5] {
|
||||
match self {
|
||||
Self::Disconnected(_) => b"08006", // connection failure
|
||||
Self::Disconnected(_) => b"08006", // connection failure
|
||||
Self::Shutdown => SQLSTATE_ADMIN_SHUTDOWN,
|
||||
Self::Other(_) => SQLSTATE_INTERNAL_ERROR, // internal error
|
||||
}
|
||||
}
|
||||
@@ -238,6 +242,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> MaybeWriteOnly<IO> {
|
||||
}
|
||||
}
|
||||
|
||||
/// Cancellation safe as long as the underlying IO is cancellation safe.
|
||||
async fn shutdown(&mut self) -> io::Result<()> {
|
||||
match self {
|
||||
MaybeWriteOnly::Full(framed) => framed.shutdown().await,
|
||||
@@ -389,14 +394,37 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
|
||||
shutdown_watcher: F,
|
||||
) -> Result<(), QueryError>
|
||||
where
|
||||
F: Fn() -> S,
|
||||
F: Fn() -> S + Clone,
|
||||
S: Future,
|
||||
{
|
||||
let ret = self.run_message_loop(handler, shutdown_watcher).await;
|
||||
// socket might be already closed, e.g. if previously received error,
|
||||
// so ignore result.
|
||||
self.framed.shutdown().await.ok();
|
||||
ret
|
||||
let ret = self
|
||||
.run_message_loop(handler, shutdown_watcher.clone())
|
||||
.await;
|
||||
|
||||
tokio::select! {
|
||||
_ = shutdown_watcher() => {
|
||||
// do nothing; we most likely got already stopped by shutdown and will log it next.
|
||||
}
|
||||
_ = self.framed.shutdown() => {
|
||||
// socket might be already closed, e.g. if previously received error,
|
||||
// so ignore result.
|
||||
},
|
||||
}
|
||||
|
||||
match ret {
|
||||
Ok(()) => Ok(()),
|
||||
Err(QueryError::Shutdown) => {
|
||||
info!("Stopped due to shutdown");
|
||||
Ok(())
|
||||
}
|
||||
Err(QueryError::Disconnected(e)) => {
|
||||
info!("Disconnected ({e:#})");
|
||||
// Disconnection is not an error: we just use it that way internally to drop
|
||||
// out of loops.
|
||||
Ok(())
|
||||
}
|
||||
e => e,
|
||||
}
|
||||
}
|
||||
|
||||
async fn run_message_loop<F, S>(
|
||||
@@ -416,15 +444,11 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
|
||||
_ = shutdown_watcher() => {
|
||||
// We were requested to shut down.
|
||||
tracing::info!("shutdown request received during handshake");
|
||||
return Ok(())
|
||||
return Err(QueryError::Shutdown)
|
||||
},
|
||||
|
||||
result = self.handshake(handler) => {
|
||||
// Handshake complete.
|
||||
result?;
|
||||
if self.state == ProtoState::Closed {
|
||||
return Ok(()); // EOF during handshake
|
||||
}
|
||||
handshake_r = self.handshake(handler) => {
|
||||
handshake_r?;
|
||||
}
|
||||
);
|
||||
|
||||
@@ -435,17 +459,34 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
|
||||
_ = shutdown_watcher() => {
|
||||
// We were requested to shut down.
|
||||
tracing::info!("shutdown request received in run_message_loop");
|
||||
Ok(None)
|
||||
return Err(QueryError::Shutdown)
|
||||
},
|
||||
msg = self.read_message() => { msg },
|
||||
)? {
|
||||
trace!("got message {:?}", msg);
|
||||
|
||||
let result = self.process_message(handler, msg, &mut query_string).await;
|
||||
self.flush().await?;
|
||||
tokio::select!(
|
||||
biased;
|
||||
_ = shutdown_watcher() => {
|
||||
// We were requested to shut down.
|
||||
tracing::info!("shutdown request received during response flush");
|
||||
|
||||
// If we exited process_message with a shutdown error, there may be
|
||||
// some valid response content on in our transmit buffer: permit sending
|
||||
// this within a short timeout. This is a best effort thing so we don't
|
||||
// care about the result.
|
||||
tokio::time::timeout(std::time::Duration::from_millis(500), self.flush()).await.ok();
|
||||
|
||||
return Err(QueryError::Shutdown)
|
||||
},
|
||||
flush_r = self.flush() => {
|
||||
flush_r?;
|
||||
}
|
||||
);
|
||||
|
||||
match result? {
|
||||
ProcessMsgResult::Continue => {
|
||||
self.flush().await?;
|
||||
continue;
|
||||
}
|
||||
ProcessMsgResult::Break => break,
|
||||
@@ -550,7 +591,9 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
|
||||
self.peer_addr
|
||||
);
|
||||
self.state = ProtoState::Closed;
|
||||
return Ok(());
|
||||
return Err(QueryError::Disconnected(ConnectionError::Protocol(
|
||||
ProtocolError::Protocol("EOF during handshake".to_string()),
|
||||
)));
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -589,7 +632,9 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
|
||||
self.peer_addr
|
||||
);
|
||||
self.state = ProtoState::Closed;
|
||||
return Ok(());
|
||||
return Err(QueryError::Disconnected(ConnectionError::Protocol(
|
||||
ProtocolError::Protocol("EOF during auth".to_string()),
|
||||
)));
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -683,12 +728,17 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
|
||||
|
||||
trace!("got query {query_string:?}");
|
||||
if let Err(e) = handler.process_query(self, query_string).await {
|
||||
log_query_error(query_string, &e);
|
||||
let short_error = short_error(&e);
|
||||
self.write_message_noflush(&BeMessage::ErrorResponse(
|
||||
&short_error,
|
||||
Some(e.pg_error_code()),
|
||||
))?;
|
||||
match e {
|
||||
QueryError::Shutdown => return Ok(ProcessMsgResult::Break),
|
||||
e => {
|
||||
log_query_error(query_string, &e);
|
||||
let short_error = short_error(&e);
|
||||
self.write_message_noflush(&BeMessage::ErrorResponse(
|
||||
&short_error,
|
||||
Some(e.pg_error_code()),
|
||||
))?;
|
||||
}
|
||||
}
|
||||
}
|
||||
self.write_message_noflush(&BeMessage::ReadyForQuery)?;
|
||||
}
|
||||
@@ -913,6 +963,7 @@ impl<'a, IO: AsyncRead + AsyncWrite + Unpin> AsyncWrite for CopyDataWriter<'a, I
|
||||
pub fn short_error(e: &QueryError) -> String {
|
||||
match e {
|
||||
QueryError::Disconnected(connection_error) => connection_error.to_string(),
|
||||
QueryError::Shutdown => "shutdown".to_string(),
|
||||
QueryError::Other(e) => format!("{e:#}"),
|
||||
}
|
||||
}
|
||||
@@ -929,6 +980,9 @@ fn log_query_error(query: &str, e: &QueryError) {
|
||||
QueryError::Disconnected(other_connection_error) => {
|
||||
error!("query handler for '{query}' failed with connection error: {other_connection_error:?}")
|
||||
}
|
||||
QueryError::Shutdown => {
|
||||
info!("query handler for '{query}' cancelled during tenant shutdown")
|
||||
}
|
||||
QueryError::Other(e) => {
|
||||
error!("query handler for '{query}' failed: {e:?}");
|
||||
}
|
||||
|
||||
@@ -131,6 +131,7 @@ pub const MAX_SEND_SIZE: usize = XLOG_BLCKSZ * 16;
|
||||
|
||||
// Export some version independent functions that are used outside of this mod
|
||||
pub use v14::xlog_utils::encode_logical_message;
|
||||
pub use v14::xlog_utils::from_pg_timestamp;
|
||||
pub use v14::xlog_utils::get_current_timestamp;
|
||||
pub use v14::xlog_utils::to_pg_timestamp;
|
||||
pub use v14::xlog_utils::XLogFileName;
|
||||
|
||||
@@ -220,6 +220,10 @@ pub const XLOG_CHECKPOINT_ONLINE: u8 = 0x10;
|
||||
pub const XLP_FIRST_IS_CONTRECORD: u16 = 0x0001;
|
||||
pub const XLP_LONG_HEADER: u16 = 0x0002;
|
||||
|
||||
/* From replication/slot.h */
|
||||
pub const REPL_SLOT_ON_DISK_OFFSETOF_RESTART_LSN: usize = 4*4 /* offset of `slotdata` in ReplicationSlotOnDisk */
|
||||
+ 64 /* NameData */ + 4*4;
|
||||
|
||||
/* From fsm_internals.h */
|
||||
const FSM_NODES_PER_PAGE: usize = BLCKSZ as usize - SIZEOF_PAGE_HEADER_DATA - 4;
|
||||
const FSM_NON_LEAF_NODES_PER_PAGE: usize = BLCKSZ as usize / 2 - 1;
|
||||
|
||||
@@ -136,21 +136,42 @@ pub fn get_current_timestamp() -> TimestampTz {
|
||||
to_pg_timestamp(SystemTime::now())
|
||||
}
|
||||
|
||||
pub fn to_pg_timestamp(time: SystemTime) -> TimestampTz {
|
||||
const UNIX_EPOCH_JDATE: u64 = 2440588; /* == date2j(1970, 1, 1) */
|
||||
const POSTGRES_EPOCH_JDATE: u64 = 2451545; /* == date2j(2000, 1, 1) */
|
||||
// Module to reduce the scope of the constants
|
||||
mod timestamp_conversions {
|
||||
use std::time::Duration;
|
||||
|
||||
use super::*;
|
||||
|
||||
const UNIX_EPOCH_JDATE: u64 = 2440588; // == date2j(1970, 1, 1)
|
||||
const POSTGRES_EPOCH_JDATE: u64 = 2451545; // == date2j(2000, 1, 1)
|
||||
const SECS_PER_DAY: u64 = 86400;
|
||||
const USECS_PER_SEC: u64 = 1000000;
|
||||
match time.duration_since(SystemTime::UNIX_EPOCH) {
|
||||
Ok(n) => {
|
||||
((n.as_secs() - ((POSTGRES_EPOCH_JDATE - UNIX_EPOCH_JDATE) * SECS_PER_DAY))
|
||||
* USECS_PER_SEC
|
||||
+ n.subsec_micros() as u64) as i64
|
||||
const SECS_DIFF_UNIX_TO_POSTGRES_EPOCH: u64 =
|
||||
(POSTGRES_EPOCH_JDATE - UNIX_EPOCH_JDATE) * SECS_PER_DAY;
|
||||
|
||||
pub fn to_pg_timestamp(time: SystemTime) -> TimestampTz {
|
||||
match time.duration_since(SystemTime::UNIX_EPOCH) {
|
||||
Ok(n) => {
|
||||
((n.as_secs() - SECS_DIFF_UNIX_TO_POSTGRES_EPOCH) * USECS_PER_SEC
|
||||
+ n.subsec_micros() as u64) as i64
|
||||
}
|
||||
Err(_) => panic!("SystemTime before UNIX EPOCH!"),
|
||||
}
|
||||
Err(_) => panic!("SystemTime before UNIX EPOCH!"),
|
||||
}
|
||||
|
||||
pub fn from_pg_timestamp(time: TimestampTz) -> SystemTime {
|
||||
let time: u64 = time
|
||||
.try_into()
|
||||
.expect("timestamp before millenium (postgres epoch)");
|
||||
let since_unix_epoch = time + SECS_DIFF_UNIX_TO_POSTGRES_EPOCH * USECS_PER_SEC;
|
||||
SystemTime::UNIX_EPOCH
|
||||
.checked_add(Duration::from_micros(since_unix_epoch))
|
||||
.expect("SystemTime overflow")
|
||||
}
|
||||
}
|
||||
|
||||
pub use timestamp_conversions::{from_pg_timestamp, to_pg_timestamp};
|
||||
|
||||
// Returns (aligned) end_lsn of the last record in data_dir with WAL segments.
|
||||
// start_lsn must point to some previously known record boundary (beginning of
|
||||
// the next record). If no valid record after is found, start_lsn is returned
|
||||
@@ -481,4 +502,24 @@ pub fn encode_logical_message(prefix: &str, message: &str) -> Vec<u8> {
|
||||
wal
|
||||
}
|
||||
|
||||
// If you need to craft WAL and write tests for this module, put it at wal_craft crate.
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_ts_conversion() {
|
||||
let now = SystemTime::now();
|
||||
let round_trip = from_pg_timestamp(to_pg_timestamp(now));
|
||||
|
||||
let now_since = now.duration_since(SystemTime::UNIX_EPOCH).unwrap();
|
||||
let round_trip_since = round_trip.duration_since(SystemTime::UNIX_EPOCH).unwrap();
|
||||
assert_eq!(now_since.as_micros(), round_trip_since.as_micros());
|
||||
|
||||
let now_pg = get_current_timestamp();
|
||||
let round_trip_pg = to_pg_timestamp(from_pg_timestamp(now_pg));
|
||||
|
||||
assert_eq!(now_pg, round_trip_pg);
|
||||
}
|
||||
|
||||
// If you need to craft WAL and write tests for this module, put it at wal_craft crate.
|
||||
}
|
||||
|
||||
@@ -14,6 +14,7 @@ macro_rules! xlog_utils_test {
|
||||
($version:ident) => {
|
||||
#[path = "."]
|
||||
mod $version {
|
||||
#[allow(unused_imports)]
|
||||
pub use postgres_ffi::$version::wal_craft_test_export::*;
|
||||
#[allow(clippy::duplicate_mod)]
|
||||
#[cfg(test)]
|
||||
|
||||
@@ -214,27 +214,24 @@ where
|
||||
}
|
||||
}
|
||||
|
||||
/// Cancellation safe as long as the AsyncWrite is cancellation safe.
|
||||
async fn flush<S: AsyncWrite + Unpin>(
|
||||
stream: &mut S,
|
||||
write_buf: &mut BytesMut,
|
||||
) -> Result<(), io::Error> {
|
||||
while write_buf.has_remaining() {
|
||||
let bytes_written = stream.write(write_buf.chunk()).await?;
|
||||
let bytes_written = stream.write_buf(write_buf).await?;
|
||||
if bytes_written == 0 {
|
||||
return Err(io::Error::new(
|
||||
ErrorKind::WriteZero,
|
||||
"failed to write message",
|
||||
));
|
||||
}
|
||||
// The advanced part will be garbage collected, likely during shifting
|
||||
// data left on next attempt to write to buffer when free space is not
|
||||
// enough.
|
||||
write_buf.advance(bytes_written);
|
||||
}
|
||||
write_buf.clear();
|
||||
stream.flush().await
|
||||
}
|
||||
|
||||
/// Cancellation safe as long as the AsyncWrite is cancellation safe.
|
||||
async fn shutdown<S: AsyncWrite + Unpin>(
|
||||
stream: &mut S,
|
||||
write_buf: &mut BytesMut,
|
||||
|
||||
@@ -670,6 +670,7 @@ pub fn read_cstr(buf: &mut Bytes) -> Result<Bytes, ProtocolError> {
|
||||
}
|
||||
|
||||
pub const SQLSTATE_INTERNAL_ERROR: &[u8; 5] = b"XX000";
|
||||
pub const SQLSTATE_ADMIN_SHUTDOWN: &[u8; 5] = b"57P01";
|
||||
pub const SQLSTATE_SUCCESSFUL_COMPLETION: &[u8; 5] = b"00000";
|
||||
|
||||
impl<'a> BeMessage<'a> {
|
||||
|
||||
@@ -13,6 +13,7 @@ aws-types.workspace = true
|
||||
aws-config.workspace = true
|
||||
aws-sdk-s3.workspace = true
|
||||
aws-credential-types.workspace = true
|
||||
bytes.workspace = true
|
||||
camino.workspace = true
|
||||
hyper = { workspace = true, features = ["stream"] }
|
||||
serde.workspace = true
|
||||
@@ -26,6 +27,13 @@ metrics.workspace = true
|
||||
utils.workspace = true
|
||||
pin-project-lite.workspace = true
|
||||
workspace_hack.workspace = true
|
||||
azure_core.workspace = true
|
||||
azure_identity.workspace = true
|
||||
azure_storage.workspace = true
|
||||
azure_storage_blobs.workspace = true
|
||||
futures-util.workspace = true
|
||||
http-types.workspace = true
|
||||
itertools.workspace = true
|
||||
|
||||
[dev-dependencies]
|
||||
camino-tempfile.workspace = true
|
||||
|
||||
314
libs/remote_storage/src/azure_blob.rs
Normal file
314
libs/remote_storage/src/azure_blob.rs
Normal file
@@ -0,0 +1,314 @@
|
||||
//! Azure Blob Storage wrapper
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::env;
|
||||
use std::num::NonZeroU32;
|
||||
use std::sync::Arc;
|
||||
use std::{borrow::Cow, io::Cursor};
|
||||
|
||||
use super::REMOTE_STORAGE_PREFIX_SEPARATOR;
|
||||
use anyhow::Result;
|
||||
use azure_core::request_options::{MaxResults, Metadata, Range};
|
||||
use azure_identity::DefaultAzureCredential;
|
||||
use azure_storage::StorageCredentials;
|
||||
use azure_storage_blobs::prelude::ClientBuilder;
|
||||
use azure_storage_blobs::{blob::operations::GetBlobBuilder, prelude::ContainerClient};
|
||||
use futures_util::StreamExt;
|
||||
use http_types::StatusCode;
|
||||
use tokio::io::AsyncRead;
|
||||
use tracing::debug;
|
||||
|
||||
use crate::s3_bucket::RequestKind;
|
||||
use crate::{
|
||||
AzureConfig, ConcurrencyLimiter, Download, DownloadError, Listing, ListingMode, RemotePath,
|
||||
RemoteStorage, StorageMetadata,
|
||||
};
|
||||
|
||||
pub struct AzureBlobStorage {
|
||||
client: ContainerClient,
|
||||
prefix_in_container: Option<String>,
|
||||
max_keys_per_list_response: Option<NonZeroU32>,
|
||||
concurrency_limiter: ConcurrencyLimiter,
|
||||
}
|
||||
|
||||
impl AzureBlobStorage {
|
||||
pub fn new(azure_config: &AzureConfig) -> Result<Self> {
|
||||
debug!(
|
||||
"Creating azure remote storage for azure container {}",
|
||||
azure_config.container_name
|
||||
);
|
||||
|
||||
let account = env::var("AZURE_STORAGE_ACCOUNT").expect("missing AZURE_STORAGE_ACCOUNT");
|
||||
|
||||
// If the `AZURE_STORAGE_ACCESS_KEY` env var has an access key, use that,
|
||||
// otherwise try the token based credentials.
|
||||
let credentials = if let Ok(access_key) = env::var("AZURE_STORAGE_ACCESS_KEY") {
|
||||
StorageCredentials::access_key(account.clone(), access_key)
|
||||
} else {
|
||||
let token_credential = DefaultAzureCredential::default();
|
||||
StorageCredentials::token_credential(Arc::new(token_credential))
|
||||
};
|
||||
|
||||
let builder = ClientBuilder::new(account, credentials);
|
||||
|
||||
let client = builder.container_client(azure_config.container_name.to_owned());
|
||||
|
||||
let max_keys_per_list_response =
|
||||
if let Some(limit) = azure_config.max_keys_per_list_response {
|
||||
Some(
|
||||
NonZeroU32::new(limit as u32)
|
||||
.ok_or_else(|| anyhow::anyhow!("max_keys_per_list_response can't be 0"))?,
|
||||
)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
Ok(AzureBlobStorage {
|
||||
client,
|
||||
prefix_in_container: azure_config.prefix_in_container.to_owned(),
|
||||
max_keys_per_list_response,
|
||||
concurrency_limiter: ConcurrencyLimiter::new(azure_config.concurrency_limit.get()),
|
||||
})
|
||||
}
|
||||
|
||||
pub fn relative_path_to_name(&self, path: &RemotePath) -> String {
|
||||
assert_eq!(std::path::MAIN_SEPARATOR, REMOTE_STORAGE_PREFIX_SEPARATOR);
|
||||
let path_string = path
|
||||
.get_path()
|
||||
.as_str()
|
||||
.trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR);
|
||||
match &self.prefix_in_container {
|
||||
Some(prefix) => {
|
||||
if prefix.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) {
|
||||
prefix.clone() + path_string
|
||||
} else {
|
||||
format!("{prefix}{REMOTE_STORAGE_PREFIX_SEPARATOR}{path_string}")
|
||||
}
|
||||
}
|
||||
None => path_string.to_string(),
|
||||
}
|
||||
}
|
||||
|
||||
fn name_to_relative_path(&self, key: &str) -> RemotePath {
|
||||
let relative_path =
|
||||
match key.strip_prefix(self.prefix_in_container.as_deref().unwrap_or_default()) {
|
||||
Some(stripped) => stripped,
|
||||
// we rely on Azure to return properly prefixed paths
|
||||
// for requests with a certain prefix
|
||||
None => panic!(
|
||||
"Key {key} does not start with container prefix {:?}",
|
||||
self.prefix_in_container
|
||||
),
|
||||
};
|
||||
RemotePath(
|
||||
relative_path
|
||||
.split(REMOTE_STORAGE_PREFIX_SEPARATOR)
|
||||
.collect(),
|
||||
)
|
||||
}
|
||||
|
||||
async fn download_for_builder(
|
||||
&self,
|
||||
builder: GetBlobBuilder,
|
||||
) -> Result<Download, DownloadError> {
|
||||
let mut response = builder.into_stream();
|
||||
|
||||
let mut metadata = HashMap::new();
|
||||
// TODO give proper streaming response instead of buffering into RAM
|
||||
// https://github.com/neondatabase/neon/issues/5563
|
||||
let mut buf = Vec::new();
|
||||
while let Some(part) = response.next().await {
|
||||
let part = part.map_err(to_download_error)?;
|
||||
if let Some(blob_meta) = part.blob.metadata {
|
||||
metadata.extend(blob_meta.iter().map(|(k, v)| (k.to_owned(), v.to_owned())));
|
||||
}
|
||||
let data = part
|
||||
.data
|
||||
.collect()
|
||||
.await
|
||||
.map_err(|e| DownloadError::Other(e.into()))?;
|
||||
buf.extend_from_slice(&data.slice(..));
|
||||
}
|
||||
Ok(Download {
|
||||
download_stream: Box::pin(Cursor::new(buf)),
|
||||
metadata: Some(StorageMetadata(metadata)),
|
||||
})
|
||||
}
|
||||
|
||||
async fn permit(&self, kind: RequestKind) -> tokio::sync::SemaphorePermit<'_> {
|
||||
self.concurrency_limiter
|
||||
.acquire(kind)
|
||||
.await
|
||||
.expect("semaphore is never closed")
|
||||
}
|
||||
}
|
||||
|
||||
fn to_azure_metadata(metadata: StorageMetadata) -> Metadata {
|
||||
let mut res = Metadata::new();
|
||||
for (k, v) in metadata.0.into_iter() {
|
||||
res.insert(k, v);
|
||||
}
|
||||
res
|
||||
}
|
||||
|
||||
fn to_download_error(error: azure_core::Error) -> DownloadError {
|
||||
if let Some(http_err) = error.as_http_error() {
|
||||
match http_err.status() {
|
||||
StatusCode::NotFound => DownloadError::NotFound,
|
||||
StatusCode::BadRequest => DownloadError::BadInput(anyhow::Error::new(error)),
|
||||
_ => DownloadError::Other(anyhow::Error::new(error)),
|
||||
}
|
||||
} else {
|
||||
DownloadError::Other(error.into())
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl RemoteStorage for AzureBlobStorage {
|
||||
async fn list(
|
||||
&self,
|
||||
prefix: Option<&RemotePath>,
|
||||
mode: ListingMode,
|
||||
) -> anyhow::Result<Listing, DownloadError> {
|
||||
// get the passed prefix or if it is not set use prefix_in_bucket value
|
||||
let list_prefix = prefix
|
||||
.map(|p| self.relative_path_to_name(p))
|
||||
.or_else(|| self.prefix_in_container.clone())
|
||||
.map(|mut p| {
|
||||
// required to end with a separator
|
||||
// otherwise request will return only the entry of a prefix
|
||||
if matches!(mode, ListingMode::WithDelimiter)
|
||||
&& !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR)
|
||||
{
|
||||
p.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
|
||||
}
|
||||
p
|
||||
});
|
||||
|
||||
let mut builder = self.client.list_blobs();
|
||||
|
||||
if let ListingMode::WithDelimiter = mode {
|
||||
builder = builder.delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string());
|
||||
}
|
||||
|
||||
if let Some(prefix) = list_prefix {
|
||||
builder = builder.prefix(Cow::from(prefix.to_owned()));
|
||||
}
|
||||
|
||||
if let Some(limit) = self.max_keys_per_list_response {
|
||||
builder = builder.max_results(MaxResults::new(limit));
|
||||
}
|
||||
|
||||
let mut response = builder.into_stream();
|
||||
let mut res = Listing::default();
|
||||
while let Some(l) = response.next().await {
|
||||
let entry = l.map_err(to_download_error)?;
|
||||
let prefix_iter = entry
|
||||
.blobs
|
||||
.prefixes()
|
||||
.map(|prefix| self.name_to_relative_path(&prefix.name));
|
||||
res.prefixes.extend(prefix_iter);
|
||||
|
||||
let blob_iter = entry
|
||||
.blobs
|
||||
.blobs()
|
||||
.map(|k| self.name_to_relative_path(&k.name));
|
||||
res.keys.extend(blob_iter);
|
||||
}
|
||||
Ok(res)
|
||||
}
|
||||
async fn upload(
|
||||
&self,
|
||||
mut from: impl AsyncRead + Unpin + Send + Sync + 'static,
|
||||
data_size_bytes: usize,
|
||||
to: &RemotePath,
|
||||
metadata: Option<StorageMetadata>,
|
||||
) -> anyhow::Result<()> {
|
||||
let _permit = self.permit(RequestKind::Put).await;
|
||||
let blob_client = self.client.blob_client(self.relative_path_to_name(to));
|
||||
|
||||
// TODO FIX THIS UGLY HACK and don't buffer the entire object
|
||||
// into RAM here, but use the streaming interface. For that,
|
||||
// we'd have to change the interface though...
|
||||
// https://github.com/neondatabase/neon/issues/5563
|
||||
let mut buf = Vec::with_capacity(data_size_bytes);
|
||||
tokio::io::copy(&mut from, &mut buf).await?;
|
||||
let body = azure_core::Body::Bytes(buf.into());
|
||||
|
||||
let mut builder = blob_client.put_block_blob(body);
|
||||
|
||||
if let Some(metadata) = metadata {
|
||||
builder = builder.metadata(to_azure_metadata(metadata));
|
||||
}
|
||||
|
||||
let _response = builder.into_future().await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn download(&self, from: &RemotePath) -> Result<Download, DownloadError> {
|
||||
let _permit = self.permit(RequestKind::Get).await;
|
||||
let blob_client = self.client.blob_client(self.relative_path_to_name(from));
|
||||
|
||||
let builder = blob_client.get();
|
||||
|
||||
self.download_for_builder(builder).await
|
||||
}
|
||||
|
||||
async fn download_byte_range(
|
||||
&self,
|
||||
from: &RemotePath,
|
||||
start_inclusive: u64,
|
||||
end_exclusive: Option<u64>,
|
||||
) -> Result<Download, DownloadError> {
|
||||
let _permit = self.permit(RequestKind::Get).await;
|
||||
let blob_client = self.client.blob_client(self.relative_path_to_name(from));
|
||||
|
||||
let mut builder = blob_client.get();
|
||||
|
||||
if let Some(end_exclusive) = end_exclusive {
|
||||
builder = builder.range(Range::new(start_inclusive, end_exclusive));
|
||||
} else {
|
||||
// Open ranges are not supported by the SDK so we work around
|
||||
// by setting the upper limit extremely high (but high enough
|
||||
// to still be representable by signed 64 bit integers).
|
||||
// TODO remove workaround once the SDK adds open range support
|
||||
// https://github.com/Azure/azure-sdk-for-rust/issues/1438
|
||||
let end_exclusive = u64::MAX / 4;
|
||||
builder = builder.range(Range::new(start_inclusive, end_exclusive));
|
||||
}
|
||||
|
||||
self.download_for_builder(builder).await
|
||||
}
|
||||
|
||||
async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
|
||||
let _permit = self.permit(RequestKind::Delete).await;
|
||||
let blob_client = self.client.blob_client(self.relative_path_to_name(path));
|
||||
|
||||
let builder = blob_client.delete();
|
||||
|
||||
match builder.into_future().await {
|
||||
Ok(_response) => Ok(()),
|
||||
Err(e) => {
|
||||
if let Some(http_err) = e.as_http_error() {
|
||||
if http_err.status() == StatusCode::NotFound {
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
Err(anyhow::Error::new(e))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> {
|
||||
// Permit is already obtained by inner delete function
|
||||
|
||||
// TODO batch requests are also not supported by the SDK
|
||||
// https://github.com/Azure/azure-sdk-for-rust/issues/1068
|
||||
// https://github.com/Azure/azure-sdk-for-rust/issues/1249
|
||||
for path in paths {
|
||||
self.delete(path).await?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
@@ -4,7 +4,10 @@
|
||||
//! [`RemoteStorage`] trait a CRUD-like generic abstraction to use for adapting external storages with a few implementations:
|
||||
//! * [`local_fs`] allows to use local file system as an external storage
|
||||
//! * [`s3_bucket`] uses AWS S3 bucket as an external storage
|
||||
//! * [`azure_blob`] allows to use Azure Blob storage as an external storage
|
||||
//!
|
||||
|
||||
mod azure_blob;
|
||||
mod local_fs;
|
||||
mod s3_bucket;
|
||||
mod simulate_failures;
|
||||
@@ -21,11 +24,15 @@ use anyhow::{bail, Context};
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tokio::io;
|
||||
use tokio::{io, sync::Semaphore};
|
||||
use toml_edit::Item;
|
||||
use tracing::info;
|
||||
|
||||
pub use self::{local_fs::LocalFs, s3_bucket::S3Bucket, simulate_failures::UnreliableWrapper};
|
||||
pub use self::{
|
||||
azure_blob::AzureBlobStorage, local_fs::LocalFs, s3_bucket::S3Bucket,
|
||||
simulate_failures::UnreliableWrapper,
|
||||
};
|
||||
use s3_bucket::RequestKind;
|
||||
|
||||
/// How many different timelines can be processed simultaneously when synchronizing layers with the remote storage.
|
||||
/// During regular work, pageserver produces one layer file per timeline checkpoint, with bursts of concurrency
|
||||
@@ -39,6 +46,11 @@ pub const DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS: u32 = 10;
|
||||
/// ~3500 PUT/COPY/POST/DELETE or 5500 GET/HEAD S3 requests
|
||||
/// <https://aws.amazon.com/premiumsupport/knowledge-center/s3-request-limit-avoid-throttling/>
|
||||
pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100;
|
||||
/// We set this a little bit low as we currently buffer the entire file into RAM
|
||||
///
|
||||
/// Here, a limit of max 20k concurrent connections was noted.
|
||||
/// <https://learn.microsoft.com/en-us/answers/questions/1301863/is-there-any-limitation-to-concurrent-connections>
|
||||
pub const DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT: usize = 30;
|
||||
/// No limits on the client side, which currenltly means 1000 for AWS S3.
|
||||
/// <https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax>
|
||||
pub const DEFAULT_MAX_KEYS_PER_LIST_RESPONSE: Option<i32> = None;
|
||||
@@ -117,6 +129,22 @@ impl RemotePath {
|
||||
}
|
||||
}
|
||||
|
||||
/// We don't need callers to be able to pass arbitrary delimiters: just control
|
||||
/// whether listings will use a '/' separator or not.
|
||||
///
|
||||
/// The WithDelimiter mode will populate `prefixes` and `keys` in the result. The
|
||||
/// NoDelimiter mode will only populate `keys`.
|
||||
pub enum ListingMode {
|
||||
WithDelimiter,
|
||||
NoDelimiter,
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct Listing {
|
||||
pub prefixes: Vec<RemotePath>,
|
||||
pub keys: Vec<RemotePath>,
|
||||
}
|
||||
|
||||
/// Storage (potentially remote) API to manage its state.
|
||||
/// This storage tries to be unaware of any layered repository context,
|
||||
/// providing basic CRUD operations for storage files.
|
||||
@@ -129,8 +157,13 @@ pub trait RemoteStorage: Send + Sync + 'static {
|
||||
async fn list_prefixes(
|
||||
&self,
|
||||
prefix: Option<&RemotePath>,
|
||||
) -> Result<Vec<RemotePath>, DownloadError>;
|
||||
|
||||
) -> Result<Vec<RemotePath>, DownloadError> {
|
||||
let result = self
|
||||
.list(prefix, ListingMode::WithDelimiter)
|
||||
.await?
|
||||
.prefixes;
|
||||
Ok(result)
|
||||
}
|
||||
/// Lists all files in directory "recursively"
|
||||
/// (not really recursively, because AWS has a flat namespace)
|
||||
/// Note: This is subtely different than list_prefixes,
|
||||
@@ -142,7 +175,16 @@ pub trait RemoteStorage: Send + Sync + 'static {
|
||||
/// whereas,
|
||||
/// list_prefixes("foo/bar/") = ["cat", "dog"]
|
||||
/// See `test_real_s3.rs` for more details.
|
||||
async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>>;
|
||||
async fn list_files(&self, prefix: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
|
||||
let result = self.list(prefix, ListingMode::NoDelimiter).await?.keys;
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
async fn list(
|
||||
&self,
|
||||
prefix: Option<&RemotePath>,
|
||||
_mode: ListingMode,
|
||||
) -> anyhow::Result<Listing, DownloadError>;
|
||||
|
||||
/// Streams the local file contents into remote into the remote storage entry.
|
||||
async fn upload(
|
||||
@@ -193,6 +235,9 @@ pub enum DownloadError {
|
||||
BadInput(anyhow::Error),
|
||||
/// The file was not found in the remote storage.
|
||||
NotFound,
|
||||
/// A cancellation token aborted the download, typically during
|
||||
/// tenant detach or process shutdown.
|
||||
Cancelled,
|
||||
/// The file was found in the remote storage, but the download failed.
|
||||
Other(anyhow::Error),
|
||||
}
|
||||
@@ -203,6 +248,7 @@ impl std::fmt::Display for DownloadError {
|
||||
DownloadError::BadInput(e) => {
|
||||
write!(f, "Failed to download a remote file due to user input: {e}")
|
||||
}
|
||||
DownloadError::Cancelled => write!(f, "Cancelled, shutting down"),
|
||||
DownloadError::NotFound => write!(f, "No file found for the remote object id given"),
|
||||
DownloadError::Other(e) => write!(f, "Failed to download a remote file: {e:?}"),
|
||||
}
|
||||
@@ -217,10 +263,24 @@ impl std::error::Error for DownloadError {}
|
||||
pub enum GenericRemoteStorage {
|
||||
LocalFs(LocalFs),
|
||||
AwsS3(Arc<S3Bucket>),
|
||||
AzureBlob(Arc<AzureBlobStorage>),
|
||||
Unreliable(Arc<UnreliableWrapper>),
|
||||
}
|
||||
|
||||
impl GenericRemoteStorage {
|
||||
pub async fn list(
|
||||
&self,
|
||||
prefix: Option<&RemotePath>,
|
||||
mode: ListingMode,
|
||||
) -> anyhow::Result<Listing, DownloadError> {
|
||||
match self {
|
||||
Self::LocalFs(s) => s.list(prefix, mode).await,
|
||||
Self::AwsS3(s) => s.list(prefix, mode).await,
|
||||
Self::AzureBlob(s) => s.list(prefix, mode).await,
|
||||
Self::Unreliable(s) => s.list(prefix, mode).await,
|
||||
}
|
||||
}
|
||||
|
||||
// A function for listing all the files in a "directory"
|
||||
// Example:
|
||||
// list_files("foo/bar") = ["foo/bar/a.txt", "foo/bar/b.txt"]
|
||||
@@ -228,6 +288,7 @@ impl GenericRemoteStorage {
|
||||
match self {
|
||||
Self::LocalFs(s) => s.list_files(folder).await,
|
||||
Self::AwsS3(s) => s.list_files(folder).await,
|
||||
Self::AzureBlob(s) => s.list_files(folder).await,
|
||||
Self::Unreliable(s) => s.list_files(folder).await,
|
||||
}
|
||||
}
|
||||
@@ -242,6 +303,7 @@ impl GenericRemoteStorage {
|
||||
match self {
|
||||
Self::LocalFs(s) => s.list_prefixes(prefix).await,
|
||||
Self::AwsS3(s) => s.list_prefixes(prefix).await,
|
||||
Self::AzureBlob(s) => s.list_prefixes(prefix).await,
|
||||
Self::Unreliable(s) => s.list_prefixes(prefix).await,
|
||||
}
|
||||
}
|
||||
@@ -256,6 +318,7 @@ impl GenericRemoteStorage {
|
||||
match self {
|
||||
Self::LocalFs(s) => s.upload(from, data_size_bytes, to, metadata).await,
|
||||
Self::AwsS3(s) => s.upload(from, data_size_bytes, to, metadata).await,
|
||||
Self::AzureBlob(s) => s.upload(from, data_size_bytes, to, metadata).await,
|
||||
Self::Unreliable(s) => s.upload(from, data_size_bytes, to, metadata).await,
|
||||
}
|
||||
}
|
||||
@@ -264,6 +327,7 @@ impl GenericRemoteStorage {
|
||||
match self {
|
||||
Self::LocalFs(s) => s.download(from).await,
|
||||
Self::AwsS3(s) => s.download(from).await,
|
||||
Self::AzureBlob(s) => s.download(from).await,
|
||||
Self::Unreliable(s) => s.download(from).await,
|
||||
}
|
||||
}
|
||||
@@ -283,6 +347,10 @@ impl GenericRemoteStorage {
|
||||
s.download_byte_range(from, start_inclusive, end_exclusive)
|
||||
.await
|
||||
}
|
||||
Self::AzureBlob(s) => {
|
||||
s.download_byte_range(from, start_inclusive, end_exclusive)
|
||||
.await
|
||||
}
|
||||
Self::Unreliable(s) => {
|
||||
s.download_byte_range(from, start_inclusive, end_exclusive)
|
||||
.await
|
||||
@@ -294,6 +362,7 @@ impl GenericRemoteStorage {
|
||||
match self {
|
||||
Self::LocalFs(s) => s.delete(path).await,
|
||||
Self::AwsS3(s) => s.delete(path).await,
|
||||
Self::AzureBlob(s) => s.delete(path).await,
|
||||
Self::Unreliable(s) => s.delete(path).await,
|
||||
}
|
||||
}
|
||||
@@ -302,6 +371,7 @@ impl GenericRemoteStorage {
|
||||
match self {
|
||||
Self::LocalFs(s) => s.delete_objects(paths).await,
|
||||
Self::AwsS3(s) => s.delete_objects(paths).await,
|
||||
Self::AzureBlob(s) => s.delete_objects(paths).await,
|
||||
Self::Unreliable(s) => s.delete_objects(paths).await,
|
||||
}
|
||||
}
|
||||
@@ -319,6 +389,11 @@ impl GenericRemoteStorage {
|
||||
s3_config.bucket_name, s3_config.bucket_region, s3_config.prefix_in_bucket, s3_config.endpoint);
|
||||
Self::AwsS3(Arc::new(S3Bucket::new(s3_config)?))
|
||||
}
|
||||
RemoteStorageKind::AzureContainer(azure_config) => {
|
||||
info!("Using azure container '{}' in region '{}' as a remote storage, prefix in container: '{:?}'",
|
||||
azure_config.container_name, azure_config.container_region, azure_config.prefix_in_container);
|
||||
Self::AzureBlob(Arc::new(AzureBlobStorage::new(azure_config)?))
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
@@ -383,6 +458,9 @@ pub enum RemoteStorageKind {
|
||||
/// AWS S3 based storage, storing all files in the S3 bucket
|
||||
/// specified by the config
|
||||
AwsS3(S3Config),
|
||||
/// Azure Blob based storage, storing all files in the container
|
||||
/// specified by the config
|
||||
AzureContainer(AzureConfig),
|
||||
}
|
||||
|
||||
/// AWS S3 bucket coordinates and access credentials to manage the bucket contents (read and write).
|
||||
@@ -422,11 +500,45 @@ impl Debug for S3Config {
|
||||
}
|
||||
}
|
||||
|
||||
/// Azure bucket coordinates and access credentials to manage the bucket contents (read and write).
|
||||
#[derive(Clone, PartialEq, Eq)]
|
||||
pub struct AzureConfig {
|
||||
/// Name of the container to connect to.
|
||||
pub container_name: String,
|
||||
/// The region where the bucket is located at.
|
||||
pub container_region: String,
|
||||
/// A "subfolder" in the container, to use the same container separately by multiple remote storage users at once.
|
||||
pub prefix_in_container: Option<String>,
|
||||
/// Azure has various limits on its API calls, we need not to exceed those.
|
||||
/// See [`DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT`] for more details.
|
||||
pub concurrency_limit: NonZeroUsize,
|
||||
pub max_keys_per_list_response: Option<i32>,
|
||||
}
|
||||
|
||||
impl Debug for AzureConfig {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.debug_struct("AzureConfig")
|
||||
.field("bucket_name", &self.container_name)
|
||||
.field("bucket_region", &self.container_region)
|
||||
.field("prefix_in_bucket", &self.prefix_in_container)
|
||||
.field("concurrency_limit", &self.concurrency_limit)
|
||||
.field(
|
||||
"max_keys_per_list_response",
|
||||
&self.max_keys_per_list_response,
|
||||
)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
impl RemoteStorageConfig {
|
||||
pub fn from_toml(toml: &toml_edit::Item) -> anyhow::Result<Option<RemoteStorageConfig>> {
|
||||
let local_path = toml.get("local_path");
|
||||
let bucket_name = toml.get("bucket_name");
|
||||
let bucket_region = toml.get("bucket_region");
|
||||
let container_name = toml.get("container_name");
|
||||
let container_region = toml.get("container_region");
|
||||
|
||||
let use_azure = container_name.is_some() && container_region.is_some();
|
||||
|
||||
let max_concurrent_syncs = NonZeroUsize::new(
|
||||
parse_optional_integer("max_concurrent_syncs", toml)?
|
||||
@@ -440,9 +552,13 @@ impl RemoteStorageConfig {
|
||||
)
|
||||
.context("Failed to parse 'max_sync_errors' as a positive integer")?;
|
||||
|
||||
let default_concurrency_limit = if use_azure {
|
||||
DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT
|
||||
} else {
|
||||
DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT
|
||||
};
|
||||
let concurrency_limit = NonZeroUsize::new(
|
||||
parse_optional_integer("concurrency_limit", toml)?
|
||||
.unwrap_or(DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT),
|
||||
parse_optional_integer("concurrency_limit", toml)?.unwrap_or(default_concurrency_limit),
|
||||
)
|
||||
.context("Failed to parse 'concurrency_limit' as a positive integer")?;
|
||||
|
||||
@@ -451,33 +567,70 @@ impl RemoteStorageConfig {
|
||||
.context("Failed to parse 'max_keys_per_list_response' as a positive integer")?
|
||||
.or(DEFAULT_MAX_KEYS_PER_LIST_RESPONSE);
|
||||
|
||||
let storage = match (local_path, bucket_name, bucket_region) {
|
||||
let endpoint = toml
|
||||
.get("endpoint")
|
||||
.map(|endpoint| parse_toml_string("endpoint", endpoint))
|
||||
.transpose()?;
|
||||
|
||||
let storage = match (
|
||||
local_path,
|
||||
bucket_name,
|
||||
bucket_region,
|
||||
container_name,
|
||||
container_region,
|
||||
) {
|
||||
// no 'local_path' nor 'bucket_name' options are provided, consider this remote storage disabled
|
||||
(None, None, None) => return Ok(None),
|
||||
(_, Some(_), None) => {
|
||||
(None, None, None, None, None) => return Ok(None),
|
||||
(_, Some(_), None, ..) => {
|
||||
bail!("'bucket_region' option is mandatory if 'bucket_name' is given ")
|
||||
}
|
||||
(_, None, Some(_)) => {
|
||||
(_, None, Some(_), ..) => {
|
||||
bail!("'bucket_name' option is mandatory if 'bucket_region' is given ")
|
||||
}
|
||||
(None, Some(bucket_name), Some(bucket_region)) => RemoteStorageKind::AwsS3(S3Config {
|
||||
bucket_name: parse_toml_string("bucket_name", bucket_name)?,
|
||||
bucket_region: parse_toml_string("bucket_region", bucket_region)?,
|
||||
prefix_in_bucket: toml
|
||||
.get("prefix_in_bucket")
|
||||
.map(|prefix_in_bucket| parse_toml_string("prefix_in_bucket", prefix_in_bucket))
|
||||
.transpose()?,
|
||||
endpoint: toml
|
||||
.get("endpoint")
|
||||
.map(|endpoint| parse_toml_string("endpoint", endpoint))
|
||||
.transpose()?,
|
||||
concurrency_limit,
|
||||
max_keys_per_list_response,
|
||||
}),
|
||||
(Some(local_path), None, None) => RemoteStorageKind::LocalFs(Utf8PathBuf::from(
|
||||
parse_toml_string("local_path", local_path)?,
|
||||
)),
|
||||
(Some(_), Some(_), _) => bail!("local_path and bucket_name are mutually exclusive"),
|
||||
(None, Some(bucket_name), Some(bucket_region), ..) => {
|
||||
RemoteStorageKind::AwsS3(S3Config {
|
||||
bucket_name: parse_toml_string("bucket_name", bucket_name)?,
|
||||
bucket_region: parse_toml_string("bucket_region", bucket_region)?,
|
||||
prefix_in_bucket: toml
|
||||
.get("prefix_in_bucket")
|
||||
.map(|prefix_in_bucket| {
|
||||
parse_toml_string("prefix_in_bucket", prefix_in_bucket)
|
||||
})
|
||||
.transpose()?,
|
||||
endpoint,
|
||||
concurrency_limit,
|
||||
max_keys_per_list_response,
|
||||
})
|
||||
}
|
||||
(_, _, _, Some(_), None) => {
|
||||
bail!("'container_name' option is mandatory if 'container_region' is given ")
|
||||
}
|
||||
(_, _, _, None, Some(_)) => {
|
||||
bail!("'container_name' option is mandatory if 'container_region' is given ")
|
||||
}
|
||||
(None, None, None, Some(container_name), Some(container_region)) => {
|
||||
RemoteStorageKind::AzureContainer(AzureConfig {
|
||||
container_name: parse_toml_string("container_name", container_name)?,
|
||||
container_region: parse_toml_string("container_region", container_region)?,
|
||||
prefix_in_container: toml
|
||||
.get("prefix_in_container")
|
||||
.map(|prefix_in_container| {
|
||||
parse_toml_string("prefix_in_container", prefix_in_container)
|
||||
})
|
||||
.transpose()?,
|
||||
concurrency_limit,
|
||||
max_keys_per_list_response,
|
||||
})
|
||||
}
|
||||
(Some(local_path), None, None, None, None) => RemoteStorageKind::LocalFs(
|
||||
Utf8PathBuf::from(parse_toml_string("local_path", local_path)?),
|
||||
),
|
||||
(Some(_), Some(_), ..) => {
|
||||
bail!("'local_path' and 'bucket_name' are mutually exclusive")
|
||||
}
|
||||
(Some(_), _, _, Some(_), Some(_)) => {
|
||||
bail!("local_path and 'container_name' are mutually exclusive")
|
||||
}
|
||||
};
|
||||
|
||||
Ok(Some(RemoteStorageConfig {
|
||||
@@ -513,6 +666,46 @@ fn parse_toml_string(name: &str, item: &Item) -> anyhow::Result<String> {
|
||||
Ok(s.to_string())
|
||||
}
|
||||
|
||||
struct ConcurrencyLimiter {
|
||||
// Every request to S3 can be throttled or cancelled, if a certain number of requests per second is exceeded.
|
||||
// Same goes to IAM, which is queried before every S3 request, if enabled. IAM has even lower RPS threshold.
|
||||
// The helps to ensure we don't exceed the thresholds.
|
||||
write: Arc<Semaphore>,
|
||||
read: Arc<Semaphore>,
|
||||
}
|
||||
|
||||
impl ConcurrencyLimiter {
|
||||
fn for_kind(&self, kind: RequestKind) -> &Arc<Semaphore> {
|
||||
match kind {
|
||||
RequestKind::Get => &self.read,
|
||||
RequestKind::Put => &self.write,
|
||||
RequestKind::List => &self.read,
|
||||
RequestKind::Delete => &self.write,
|
||||
}
|
||||
}
|
||||
|
||||
async fn acquire(
|
||||
&self,
|
||||
kind: RequestKind,
|
||||
) -> Result<tokio::sync::SemaphorePermit<'_>, tokio::sync::AcquireError> {
|
||||
self.for_kind(kind).acquire().await
|
||||
}
|
||||
|
||||
async fn acquire_owned(
|
||||
&self,
|
||||
kind: RequestKind,
|
||||
) -> Result<tokio::sync::OwnedSemaphorePermit, tokio::sync::AcquireError> {
|
||||
Arc::clone(self.for_kind(kind)).acquire_owned().await
|
||||
}
|
||||
|
||||
fn new(limit: usize) -> ConcurrencyLimiter {
|
||||
Self {
|
||||
read: Arc::new(Semaphore::new(limit)),
|
||||
write: Arc::new(Semaphore::new(limit)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
@@ -15,7 +15,7 @@ use tokio::{
|
||||
use tracing::*;
|
||||
use utils::{crashsafe::path_with_suffix_extension, fs_ext::is_directory_empty};
|
||||
|
||||
use crate::{Download, DownloadError, RemotePath};
|
||||
use crate::{Download, DownloadError, Listing, ListingMode, RemotePath};
|
||||
|
||||
use super::{RemoteStorage, StorageMetadata};
|
||||
|
||||
@@ -75,7 +75,7 @@ impl LocalFs {
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
async fn list(&self) -> anyhow::Result<Vec<RemotePath>> {
|
||||
async fn list_all(&self) -> anyhow::Result<Vec<RemotePath>> {
|
||||
Ok(get_all_files(&self.storage_root, true)
|
||||
.await?
|
||||
.into_iter()
|
||||
@@ -89,52 +89,10 @@ impl LocalFs {
|
||||
})
|
||||
.collect())
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl RemoteStorage for LocalFs {
|
||||
async fn list_prefixes(
|
||||
&self,
|
||||
prefix: Option<&RemotePath>,
|
||||
) -> Result<Vec<RemotePath>, DownloadError> {
|
||||
let path = match prefix {
|
||||
Some(prefix) => Cow::Owned(prefix.with_base(&self.storage_root)),
|
||||
None => Cow::Borrowed(&self.storage_root),
|
||||
};
|
||||
|
||||
let prefixes_to_filter = get_all_files(path.as_ref(), false)
|
||||
.await
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
let mut prefixes = Vec::with_capacity(prefixes_to_filter.len());
|
||||
|
||||
// filter out empty directories to mirror s3 behavior.
|
||||
for prefix in prefixes_to_filter {
|
||||
if prefix.is_dir()
|
||||
&& is_directory_empty(&prefix)
|
||||
.await
|
||||
.map_err(DownloadError::Other)?
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
prefixes.push(
|
||||
prefix
|
||||
.strip_prefix(&self.storage_root)
|
||||
.context("Failed to strip prefix")
|
||||
.and_then(RemotePath::new)
|
||||
.expect(
|
||||
"We list files for storage root, hence should be able to remote the prefix",
|
||||
),
|
||||
)
|
||||
}
|
||||
|
||||
Ok(prefixes)
|
||||
}
|
||||
|
||||
// recursively lists all files in a directory,
|
||||
// mirroring the `list_files` for `s3_bucket`
|
||||
async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
|
||||
async fn list_recursive(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
|
||||
let full_path = match folder {
|
||||
Some(folder) => folder.with_base(&self.storage_root),
|
||||
None => self.storage_root.clone(),
|
||||
@@ -186,6 +144,70 @@ impl RemoteStorage for LocalFs {
|
||||
|
||||
Ok(files)
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl RemoteStorage for LocalFs {
|
||||
async fn list(
|
||||
&self,
|
||||
prefix: Option<&RemotePath>,
|
||||
mode: ListingMode,
|
||||
) -> Result<Listing, DownloadError> {
|
||||
let mut result = Listing::default();
|
||||
|
||||
if let ListingMode::NoDelimiter = mode {
|
||||
let keys = self
|
||||
.list_recursive(prefix)
|
||||
.await
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
result.keys = keys
|
||||
.into_iter()
|
||||
.filter(|k| {
|
||||
let path = k.with_base(&self.storage_root);
|
||||
!path.is_dir()
|
||||
})
|
||||
.collect();
|
||||
|
||||
return Ok(result);
|
||||
}
|
||||
|
||||
let path = match prefix {
|
||||
Some(prefix) => Cow::Owned(prefix.with_base(&self.storage_root)),
|
||||
None => Cow::Borrowed(&self.storage_root),
|
||||
};
|
||||
|
||||
let prefixes_to_filter = get_all_files(path.as_ref(), false)
|
||||
.await
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
// filter out empty directories to mirror s3 behavior.
|
||||
for prefix in prefixes_to_filter {
|
||||
if prefix.is_dir()
|
||||
&& is_directory_empty(&prefix)
|
||||
.await
|
||||
.map_err(DownloadError::Other)?
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
let stripped = prefix
|
||||
.strip_prefix(&self.storage_root)
|
||||
.context("Failed to strip prefix")
|
||||
.and_then(RemotePath::new)
|
||||
.expect(
|
||||
"We list files for storage root, hence should be able to remote the prefix",
|
||||
);
|
||||
|
||||
if prefix.is_dir() {
|
||||
result.prefixes.push(stripped);
|
||||
} else {
|
||||
result.keys.push(stripped);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
async fn upload(
|
||||
&self,
|
||||
@@ -479,7 +501,7 @@ mod fs_tests {
|
||||
|
||||
let target_path_1 = upload_dummy_file(&storage, "upload_1", None).await?;
|
||||
assert_eq!(
|
||||
storage.list().await?,
|
||||
storage.list_all().await?,
|
||||
vec![target_path_1.clone()],
|
||||
"Should list a single file after first upload"
|
||||
);
|
||||
@@ -667,7 +689,7 @@ mod fs_tests {
|
||||
let upload_target = upload_dummy_file(&storage, upload_name, None).await?;
|
||||
|
||||
storage.delete(&upload_target).await?;
|
||||
assert!(storage.list().await?.is_empty());
|
||||
assert!(storage.list_all().await?.is_empty());
|
||||
|
||||
storage
|
||||
.delete(&upload_target)
|
||||
@@ -725,6 +747,43 @@ mod fs_tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn list() -> anyhow::Result<()> {
|
||||
// No delimiter: should recursively list everything
|
||||
let storage = create_storage()?;
|
||||
let child = upload_dummy_file(&storage, "grandparent/parent/child", None).await?;
|
||||
let uncle = upload_dummy_file(&storage, "grandparent/uncle", None).await?;
|
||||
|
||||
let listing = storage.list(None, ListingMode::NoDelimiter).await?;
|
||||
assert!(listing.prefixes.is_empty());
|
||||
assert_eq!(listing.keys, [uncle.clone(), child.clone()].to_vec());
|
||||
|
||||
// Delimiter: should only go one deep
|
||||
let listing = storage.list(None, ListingMode::WithDelimiter).await?;
|
||||
|
||||
assert_eq!(
|
||||
listing.prefixes,
|
||||
[RemotePath::from_string("timelines").unwrap()].to_vec()
|
||||
);
|
||||
assert!(listing.keys.is_empty());
|
||||
|
||||
// Delimiter & prefix
|
||||
let listing = storage
|
||||
.list(
|
||||
Some(&RemotePath::from_string("timelines/some_timeline/grandparent").unwrap()),
|
||||
ListingMode::WithDelimiter,
|
||||
)
|
||||
.await?;
|
||||
assert_eq!(
|
||||
listing.prefixes,
|
||||
[RemotePath::from_string("timelines/some_timeline/grandparent/parent").unwrap()]
|
||||
.to_vec()
|
||||
);
|
||||
assert_eq!(listing.keys, [uncle.clone()].to_vec());
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn upload_dummy_file(
|
||||
storage: &LocalFs,
|
||||
name: &str,
|
||||
@@ -777,7 +836,7 @@ mod fs_tests {
|
||||
}
|
||||
|
||||
async fn list_files_sorted(storage: &LocalFs) -> anyhow::Result<Vec<RemotePath>> {
|
||||
let mut files = storage.list().await?;
|
||||
let mut files = storage.list_all().await?;
|
||||
files.sort_by(|a, b| a.0.cmp(&b.0));
|
||||
Ok(files)
|
||||
}
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
//! allowing multiple api users to independently work with the same S3 bucket, if
|
||||
//! their bucket prefixes are both specified and different.
|
||||
|
||||
use std::sync::Arc;
|
||||
use std::borrow::Cow;
|
||||
|
||||
use anyhow::Context;
|
||||
use aws_config::{
|
||||
@@ -24,22 +24,20 @@ use aws_sdk_s3::{
|
||||
use aws_smithy_http::body::SdkBody;
|
||||
use hyper::Body;
|
||||
use scopeguard::ScopeGuard;
|
||||
use tokio::{
|
||||
io::{self, AsyncRead},
|
||||
sync::Semaphore,
|
||||
};
|
||||
use tokio::io::{self, AsyncRead};
|
||||
use tokio_util::io::ReaderStream;
|
||||
use tracing::debug;
|
||||
|
||||
use super::StorageMetadata;
|
||||
use crate::{
|
||||
Download, DownloadError, RemotePath, RemoteStorage, S3Config, MAX_KEYS_PER_DELETE,
|
||||
REMOTE_STORAGE_PREFIX_SEPARATOR,
|
||||
ConcurrencyLimiter, Download, DownloadError, Listing, ListingMode, RemotePath, RemoteStorage,
|
||||
S3Config, MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR,
|
||||
};
|
||||
|
||||
pub(super) mod metrics;
|
||||
|
||||
use self::metrics::{AttemptOutcome, RequestKind};
|
||||
use self::metrics::AttemptOutcome;
|
||||
pub(super) use self::metrics::RequestKind;
|
||||
|
||||
/// AWS S3 storage.
|
||||
pub struct S3Bucket {
|
||||
@@ -50,46 +48,6 @@ pub struct S3Bucket {
|
||||
concurrency_limiter: ConcurrencyLimiter,
|
||||
}
|
||||
|
||||
struct ConcurrencyLimiter {
|
||||
// Every request to S3 can be throttled or cancelled, if a certain number of requests per second is exceeded.
|
||||
// Same goes to IAM, which is queried before every S3 request, if enabled. IAM has even lower RPS threshold.
|
||||
// The helps to ensure we don't exceed the thresholds.
|
||||
write: Arc<Semaphore>,
|
||||
read: Arc<Semaphore>,
|
||||
}
|
||||
|
||||
impl ConcurrencyLimiter {
|
||||
fn for_kind(&self, kind: RequestKind) -> &Arc<Semaphore> {
|
||||
match kind {
|
||||
RequestKind::Get => &self.read,
|
||||
RequestKind::Put => &self.write,
|
||||
RequestKind::List => &self.read,
|
||||
RequestKind::Delete => &self.write,
|
||||
}
|
||||
}
|
||||
|
||||
async fn acquire(
|
||||
&self,
|
||||
kind: RequestKind,
|
||||
) -> Result<tokio::sync::SemaphorePermit<'_>, tokio::sync::AcquireError> {
|
||||
self.for_kind(kind).acquire().await
|
||||
}
|
||||
|
||||
async fn acquire_owned(
|
||||
&self,
|
||||
kind: RequestKind,
|
||||
) -> Result<tokio::sync::OwnedSemaphorePermit, tokio::sync::AcquireError> {
|
||||
Arc::clone(self.for_kind(kind)).acquire_owned().await
|
||||
}
|
||||
|
||||
fn new(limit: usize) -> ConcurrencyLimiter {
|
||||
Self {
|
||||
read: Arc::new(Semaphore::new(limit)),
|
||||
write: Arc::new(Semaphore::new(limit)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
struct GetObjectRequest {
|
||||
bucket: String,
|
||||
@@ -341,13 +299,13 @@ impl<S: AsyncRead> AsyncRead for TimedDownload<S> {
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl RemoteStorage for S3Bucket {
|
||||
/// See the doc for `RemoteStorage::list_prefixes`
|
||||
/// Note: it wont include empty "directories"
|
||||
async fn list_prefixes(
|
||||
async fn list(
|
||||
&self,
|
||||
prefix: Option<&RemotePath>,
|
||||
) -> Result<Vec<RemotePath>, DownloadError> {
|
||||
mode: ListingMode,
|
||||
) -> Result<Listing, DownloadError> {
|
||||
let kind = RequestKind::List;
|
||||
let mut result = Listing::default();
|
||||
|
||||
// get the passed prefix or if it is not set use prefix_in_bucket value
|
||||
let list_prefix = prefix
|
||||
@@ -356,28 +314,33 @@ impl RemoteStorage for S3Bucket {
|
||||
.map(|mut p| {
|
||||
// required to end with a separator
|
||||
// otherwise request will return only the entry of a prefix
|
||||
if !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR) {
|
||||
if matches!(mode, ListingMode::WithDelimiter)
|
||||
&& !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR)
|
||||
{
|
||||
p.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
|
||||
}
|
||||
p
|
||||
});
|
||||
|
||||
let mut document_keys = Vec::new();
|
||||
|
||||
let mut continuation_token = None;
|
||||
|
||||
loop {
|
||||
let _guard = self.permit(kind).await;
|
||||
let started_at = start_measuring_requests(kind);
|
||||
|
||||
let fetch_response = self
|
||||
let mut request = self
|
||||
.client
|
||||
.list_objects_v2()
|
||||
.bucket(self.bucket_name.clone())
|
||||
.set_prefix(list_prefix.clone())
|
||||
.set_continuation_token(continuation_token)
|
||||
.delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string())
|
||||
.set_max_keys(self.max_keys_per_list_response)
|
||||
.set_max_keys(self.max_keys_per_list_response);
|
||||
|
||||
if let ListingMode::WithDelimiter = mode {
|
||||
request = request.delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string());
|
||||
}
|
||||
|
||||
let response = request
|
||||
.send()
|
||||
.await
|
||||
.context("Failed to list S3 prefixes")
|
||||
@@ -387,71 +350,35 @@ impl RemoteStorage for S3Bucket {
|
||||
|
||||
metrics::BUCKET_METRICS
|
||||
.req_seconds
|
||||
.observe_elapsed(kind, &fetch_response, started_at);
|
||||
.observe_elapsed(kind, &response, started_at);
|
||||
|
||||
let fetch_response = fetch_response?;
|
||||
let response = response?;
|
||||
|
||||
document_keys.extend(
|
||||
fetch_response
|
||||
.common_prefixes
|
||||
.unwrap_or_default()
|
||||
.into_iter()
|
||||
let keys = response.contents().unwrap_or_default();
|
||||
let empty = Vec::new();
|
||||
let prefixes = response.common_prefixes.as_ref().unwrap_or(&empty);
|
||||
|
||||
tracing::info!("list: {} prefixes, {} keys", prefixes.len(), keys.len());
|
||||
|
||||
for object in keys {
|
||||
let object_path = object.key().expect("response does not contain a key");
|
||||
let remote_path = self.s3_object_to_relative_path(object_path);
|
||||
result.keys.push(remote_path);
|
||||
}
|
||||
|
||||
result.prefixes.extend(
|
||||
prefixes
|
||||
.iter()
|
||||
.filter_map(|o| Some(self.s3_object_to_relative_path(o.prefix()?))),
|
||||
);
|
||||
|
||||
continuation_token = match fetch_response.next_continuation_token {
|
||||
continuation_token = match response.next_continuation_token {
|
||||
Some(new_token) => Some(new_token),
|
||||
None => break,
|
||||
};
|
||||
}
|
||||
|
||||
Ok(document_keys)
|
||||
}
|
||||
|
||||
/// See the doc for `RemoteStorage::list_files`
|
||||
async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
|
||||
let kind = RequestKind::List;
|
||||
|
||||
let folder_name = folder
|
||||
.map(|p| self.relative_path_to_s3_object(p))
|
||||
.or_else(|| self.prefix_in_bucket.clone());
|
||||
|
||||
// AWS may need to break the response into several parts
|
||||
let mut continuation_token = None;
|
||||
let mut all_files = vec![];
|
||||
loop {
|
||||
let _guard = self.permit(kind).await;
|
||||
let started_at = start_measuring_requests(kind);
|
||||
|
||||
let response = self
|
||||
.client
|
||||
.list_objects_v2()
|
||||
.bucket(self.bucket_name.clone())
|
||||
.set_prefix(folder_name.clone())
|
||||
.set_continuation_token(continuation_token)
|
||||
.set_max_keys(self.max_keys_per_list_response)
|
||||
.send()
|
||||
.await
|
||||
.context("Failed to list files in S3 bucket");
|
||||
|
||||
let started_at = ScopeGuard::into_inner(started_at);
|
||||
metrics::BUCKET_METRICS
|
||||
.req_seconds
|
||||
.observe_elapsed(kind, &response, started_at);
|
||||
|
||||
let response = response?;
|
||||
|
||||
for object in response.contents().unwrap_or_default() {
|
||||
let object_path = object.key().expect("response does not contain a key");
|
||||
let remote_path = self.s3_object_to_relative_path(object_path);
|
||||
all_files.push(remote_path);
|
||||
}
|
||||
match response.next_continuation_token {
|
||||
Some(new_token) => continuation_token = Some(new_token),
|
||||
None => break,
|
||||
}
|
||||
}
|
||||
Ok(all_files)
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
async fn upload(
|
||||
@@ -556,6 +483,20 @@ impl RemoteStorage for S3Bucket {
|
||||
.deleted_objects_total
|
||||
.inc_by(chunk.len() as u64);
|
||||
if let Some(errors) = resp.errors {
|
||||
// Log a bounded number of the errors within the response:
|
||||
// these requests can carry 1000 keys so logging each one
|
||||
// would be too verbose, especially as errors may lead us
|
||||
// to retry repeatedly.
|
||||
const LOG_UP_TO_N_ERRORS: usize = 10;
|
||||
for e in errors.iter().take(LOG_UP_TO_N_ERRORS) {
|
||||
tracing::warn!(
|
||||
"DeleteObjects key {} failed: {}: {}",
|
||||
e.key.as_ref().map(Cow::from).unwrap_or("".into()),
|
||||
e.code.as_ref().map(Cow::from).unwrap_or("".into()),
|
||||
e.message.as_ref().map(Cow::from).unwrap_or("".into())
|
||||
);
|
||||
}
|
||||
|
||||
return Err(anyhow::format_err!(
|
||||
"Failed to delete {} objects",
|
||||
errors.len()
|
||||
|
||||
@@ -6,7 +6,7 @@ use once_cell::sync::Lazy;
|
||||
pub(super) static BUCKET_METRICS: Lazy<BucketMetrics> = Lazy::new(Default::default);
|
||||
|
||||
#[derive(Clone, Copy, Debug)]
|
||||
pub(super) enum RequestKind {
|
||||
pub(crate) enum RequestKind {
|
||||
Get = 0,
|
||||
Put = 1,
|
||||
Delete = 2,
|
||||
|
||||
@@ -5,7 +5,9 @@ use std::collections::hash_map::Entry;
|
||||
use std::collections::HashMap;
|
||||
use std::sync::Mutex;
|
||||
|
||||
use crate::{Download, DownloadError, RemotePath, RemoteStorage, StorageMetadata};
|
||||
use crate::{
|
||||
Download, DownloadError, Listing, ListingMode, RemotePath, RemoteStorage, StorageMetadata,
|
||||
};
|
||||
|
||||
pub struct UnreliableWrapper {
|
||||
inner: crate::GenericRemoteStorage,
|
||||
@@ -95,6 +97,15 @@ impl RemoteStorage for UnreliableWrapper {
|
||||
self.inner.list_files(folder).await
|
||||
}
|
||||
|
||||
async fn list(
|
||||
&self,
|
||||
prefix: Option<&RemotePath>,
|
||||
mode: ListingMode,
|
||||
) -> Result<Listing, DownloadError> {
|
||||
self.attempt(RemoteOp::ListPrefixes(prefix.cloned()))?;
|
||||
self.inner.list(prefix, mode).await
|
||||
}
|
||||
|
||||
async fn upload(
|
||||
&self,
|
||||
data: impl tokio::io::AsyncRead + Unpin + Send + Sync + 'static,
|
||||
|
||||
625
libs/remote_storage/tests/test_real_azure.rs
Normal file
625
libs/remote_storage/tests/test_real_azure.rs
Normal file
@@ -0,0 +1,625 @@
|
||||
use std::collections::HashSet;
|
||||
use std::env;
|
||||
use std::num::{NonZeroU32, NonZeroUsize};
|
||||
use std::ops::ControlFlow;
|
||||
use std::path::PathBuf;
|
||||
use std::sync::Arc;
|
||||
use std::time::UNIX_EPOCH;
|
||||
|
||||
use anyhow::Context;
|
||||
use camino::Utf8Path;
|
||||
use once_cell::sync::OnceCell;
|
||||
use remote_storage::{
|
||||
AzureConfig, Download, GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind,
|
||||
};
|
||||
use test_context::{test_context, AsyncTestContext};
|
||||
use tokio::task::JoinSet;
|
||||
use tracing::{debug, error, info};
|
||||
|
||||
static LOGGING_DONE: OnceCell<()> = OnceCell::new();
|
||||
|
||||
const ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_AZURE_REMOTE_STORAGE";
|
||||
|
||||
const BASE_PREFIX: &str = "test";
|
||||
|
||||
/// Tests that the Azure client can list all prefixes, even if the response comes paginated and requires multiple HTTP queries.
|
||||
/// Uses real Azure and requires [`ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME`] and related Azure cred env vars specified.
|
||||
/// See the client creation in [`create_azure_client`] for details on the required env vars.
|
||||
/// If real Azure tests are disabled, the test passes, skipping any real test run: currently, there's no way to mark the test ignored in runtime with the
|
||||
/// deafult test framework, see https://github.com/rust-lang/rust/issues/68007 for details.
|
||||
///
|
||||
/// First, the test creates a set of Azure blobs with keys `/${random_prefix_part}/${base_prefix_str}/sub_prefix_${i}/blob_${i}` in [`upload_azure_data`]
|
||||
/// where
|
||||
/// * `random_prefix_part` is set for the entire Azure client during the Azure client creation in [`create_azure_client`], to avoid multiple test runs interference
|
||||
/// * `base_prefix_str` is a common prefix to use in the client requests: we would want to ensure that the client is able to list nested prefixes inside the bucket
|
||||
///
|
||||
/// Then, verifies that the client does return correct prefixes when queried:
|
||||
/// * with no prefix, it lists everything after its `${random_prefix_part}/` — that should be `${base_prefix_str}` value only
|
||||
/// * with `${base_prefix_str}/` prefix, it lists every `sub_prefix_${i}`
|
||||
///
|
||||
/// With the real Azure enabled and `#[cfg(test)]` Rust configuration used, the Azure client test adds a `max-keys` param to limit the response keys.
|
||||
/// This way, we are able to test the pagination implicitly, by ensuring all results are returned from the remote storage and avoid uploading too many blobs to Azure.
|
||||
///
|
||||
/// Lastly, the test attempts to clean up and remove all uploaded Azure files.
|
||||
/// If any errors appear during the clean up, they get logged, but the test is not failed or stopped until clean up is finished.
|
||||
#[test_context(MaybeEnabledAzureWithTestBlobs)]
|
||||
#[tokio::test]
|
||||
async fn azure_pagination_should_work(
|
||||
ctx: &mut MaybeEnabledAzureWithTestBlobs,
|
||||
) -> anyhow::Result<()> {
|
||||
let ctx = match ctx {
|
||||
MaybeEnabledAzureWithTestBlobs::Enabled(ctx) => ctx,
|
||||
MaybeEnabledAzureWithTestBlobs::Disabled => return Ok(()),
|
||||
MaybeEnabledAzureWithTestBlobs::UploadsFailed(e, _) => {
|
||||
anyhow::bail!("Azure init failed: {e:?}")
|
||||
}
|
||||
};
|
||||
|
||||
let test_client = Arc::clone(&ctx.enabled.client);
|
||||
let expected_remote_prefixes = ctx.remote_prefixes.clone();
|
||||
|
||||
let base_prefix = RemotePath::new(Utf8Path::new(ctx.enabled.base_prefix))
|
||||
.context("common_prefix construction")?;
|
||||
let root_remote_prefixes = test_client
|
||||
.list_prefixes(None)
|
||||
.await
|
||||
.context("client list root prefixes failure")?
|
||||
.into_iter()
|
||||
.collect::<HashSet<_>>();
|
||||
assert_eq!(
|
||||
root_remote_prefixes, HashSet::from([base_prefix.clone()]),
|
||||
"remote storage root prefixes list mismatches with the uploads. Returned prefixes: {root_remote_prefixes:?}"
|
||||
);
|
||||
|
||||
let nested_remote_prefixes = test_client
|
||||
.list_prefixes(Some(&base_prefix))
|
||||
.await
|
||||
.context("client list nested prefixes failure")?
|
||||
.into_iter()
|
||||
.collect::<HashSet<_>>();
|
||||
let remote_only_prefixes = nested_remote_prefixes
|
||||
.difference(&expected_remote_prefixes)
|
||||
.collect::<HashSet<_>>();
|
||||
let missing_uploaded_prefixes = expected_remote_prefixes
|
||||
.difference(&nested_remote_prefixes)
|
||||
.collect::<HashSet<_>>();
|
||||
assert_eq!(
|
||||
remote_only_prefixes.len() + missing_uploaded_prefixes.len(), 0,
|
||||
"remote storage nested prefixes list mismatches with the uploads. Remote only prefixes: {remote_only_prefixes:?}, missing uploaded prefixes: {missing_uploaded_prefixes:?}",
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Tests that Azure client can list all files in a folder, even if the response comes paginated and requirees multiple Azure queries.
|
||||
/// Uses real Azure and requires [`ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME`] and related Azure cred env vars specified. Test will skip real code and pass if env vars not set.
|
||||
/// See `Azure_pagination_should_work` for more information.
|
||||
///
|
||||
/// First, create a set of Azure objects with keys `random_prefix/folder{j}/blob_{i}.txt` in [`upload_azure_data`]
|
||||
/// Then performs the following queries:
|
||||
/// 1. `list_files(None)`. This should return all files `random_prefix/folder{j}/blob_{i}.txt`
|
||||
/// 2. `list_files("folder1")`. This should return all files `random_prefix/folder1/blob_{i}.txt`
|
||||
#[test_context(MaybeEnabledAzureWithSimpleTestBlobs)]
|
||||
#[tokio::test]
|
||||
async fn azure_list_files_works(
|
||||
ctx: &mut MaybeEnabledAzureWithSimpleTestBlobs,
|
||||
) -> anyhow::Result<()> {
|
||||
let ctx = match ctx {
|
||||
MaybeEnabledAzureWithSimpleTestBlobs::Enabled(ctx) => ctx,
|
||||
MaybeEnabledAzureWithSimpleTestBlobs::Disabled => return Ok(()),
|
||||
MaybeEnabledAzureWithSimpleTestBlobs::UploadsFailed(e, _) => {
|
||||
anyhow::bail!("Azure init failed: {e:?}")
|
||||
}
|
||||
};
|
||||
let test_client = Arc::clone(&ctx.enabled.client);
|
||||
let base_prefix =
|
||||
RemotePath::new(Utf8Path::new("folder1")).context("common_prefix construction")?;
|
||||
let root_files = test_client
|
||||
.list_files(None)
|
||||
.await
|
||||
.context("client list root files failure")?
|
||||
.into_iter()
|
||||
.collect::<HashSet<_>>();
|
||||
assert_eq!(
|
||||
root_files,
|
||||
ctx.remote_blobs.clone(),
|
||||
"remote storage list_files on root mismatches with the uploads."
|
||||
);
|
||||
let nested_remote_files = test_client
|
||||
.list_files(Some(&base_prefix))
|
||||
.await
|
||||
.context("client list nested files failure")?
|
||||
.into_iter()
|
||||
.collect::<HashSet<_>>();
|
||||
let trim_remote_blobs: HashSet<_> = ctx
|
||||
.remote_blobs
|
||||
.iter()
|
||||
.map(|x| x.get_path())
|
||||
.filter(|x| x.starts_with("folder1"))
|
||||
.map(|x| RemotePath::new(x).expect("must be valid path"))
|
||||
.collect();
|
||||
assert_eq!(
|
||||
nested_remote_files, trim_remote_blobs,
|
||||
"remote storage list_files on subdirrectory mismatches with the uploads."
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test_context(MaybeEnabledAzure)]
|
||||
#[tokio::test]
|
||||
async fn azure_delete_non_exising_works(ctx: &mut MaybeEnabledAzure) -> anyhow::Result<()> {
|
||||
let ctx = match ctx {
|
||||
MaybeEnabledAzure::Enabled(ctx) => ctx,
|
||||
MaybeEnabledAzure::Disabled => return Ok(()),
|
||||
};
|
||||
|
||||
let path = RemotePath::new(Utf8Path::new(
|
||||
format!("{}/for_sure_there_is_nothing_there_really", ctx.base_prefix).as_str(),
|
||||
))
|
||||
.with_context(|| "RemotePath conversion")?;
|
||||
|
||||
ctx.client.delete(&path).await.expect("should succeed");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test_context(MaybeEnabledAzure)]
|
||||
#[tokio::test]
|
||||
async fn azure_delete_objects_works(ctx: &mut MaybeEnabledAzure) -> anyhow::Result<()> {
|
||||
let ctx = match ctx {
|
||||
MaybeEnabledAzure::Enabled(ctx) => ctx,
|
||||
MaybeEnabledAzure::Disabled => return Ok(()),
|
||||
};
|
||||
|
||||
let path1 = RemotePath::new(Utf8Path::new(format!("{}/path1", ctx.base_prefix).as_str()))
|
||||
.with_context(|| "RemotePath conversion")?;
|
||||
|
||||
let path2 = RemotePath::new(Utf8Path::new(format!("{}/path2", ctx.base_prefix).as_str()))
|
||||
.with_context(|| "RemotePath conversion")?;
|
||||
|
||||
let path3 = RemotePath::new(Utf8Path::new(format!("{}/path3", ctx.base_prefix).as_str()))
|
||||
.with_context(|| "RemotePath conversion")?;
|
||||
|
||||
let data1 = "remote blob data1".as_bytes();
|
||||
let data1_len = data1.len();
|
||||
let data2 = "remote blob data2".as_bytes();
|
||||
let data2_len = data2.len();
|
||||
let data3 = "remote blob data3".as_bytes();
|
||||
let data3_len = data3.len();
|
||||
ctx.client
|
||||
.upload(std::io::Cursor::new(data1), data1_len, &path1, None)
|
||||
.await?;
|
||||
|
||||
ctx.client
|
||||
.upload(std::io::Cursor::new(data2), data2_len, &path2, None)
|
||||
.await?;
|
||||
|
||||
ctx.client
|
||||
.upload(std::io::Cursor::new(data3), data3_len, &path3, None)
|
||||
.await?;
|
||||
|
||||
ctx.client.delete_objects(&[path1, path2]).await?;
|
||||
|
||||
let prefixes = ctx.client.list_prefixes(None).await?;
|
||||
|
||||
assert_eq!(prefixes.len(), 1);
|
||||
|
||||
ctx.client.delete_objects(&[path3]).await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test_context(MaybeEnabledAzure)]
|
||||
#[tokio::test]
|
||||
async fn azure_upload_download_works(ctx: &mut MaybeEnabledAzure) -> anyhow::Result<()> {
|
||||
let MaybeEnabledAzure::Enabled(ctx) = ctx else {
|
||||
return Ok(());
|
||||
};
|
||||
|
||||
let path = RemotePath::new(Utf8Path::new(format!("{}/file", ctx.base_prefix).as_str()))
|
||||
.with_context(|| "RemotePath conversion")?;
|
||||
|
||||
let data = "remote blob data here".as_bytes();
|
||||
let data_len = data.len() as u64;
|
||||
|
||||
ctx.client
|
||||
.upload(std::io::Cursor::new(data), data.len(), &path, None)
|
||||
.await?;
|
||||
|
||||
async fn download_and_compare(mut dl: Download) -> anyhow::Result<Vec<u8>> {
|
||||
let mut buf = Vec::new();
|
||||
tokio::io::copy(&mut dl.download_stream, &mut buf).await?;
|
||||
Ok(buf)
|
||||
}
|
||||
// Normal download request
|
||||
let dl = ctx.client.download(&path).await?;
|
||||
let buf = download_and_compare(dl).await?;
|
||||
assert_eq!(buf, data);
|
||||
|
||||
// Full range (end specified)
|
||||
let dl = ctx
|
||||
.client
|
||||
.download_byte_range(&path, 0, Some(data_len))
|
||||
.await?;
|
||||
let buf = download_and_compare(dl).await?;
|
||||
assert_eq!(buf, data);
|
||||
|
||||
// partial range (end specified)
|
||||
let dl = ctx.client.download_byte_range(&path, 4, Some(10)).await?;
|
||||
let buf = download_and_compare(dl).await?;
|
||||
assert_eq!(buf, data[4..10]);
|
||||
|
||||
// partial range (end beyond real end)
|
||||
let dl = ctx
|
||||
.client
|
||||
.download_byte_range(&path, 8, Some(data_len * 100))
|
||||
.await?;
|
||||
let buf = download_and_compare(dl).await?;
|
||||
assert_eq!(buf, data[8..]);
|
||||
|
||||
// Partial range (end unspecified)
|
||||
let dl = ctx.client.download_byte_range(&path, 4, None).await?;
|
||||
let buf = download_and_compare(dl).await?;
|
||||
assert_eq!(buf, data[4..]);
|
||||
|
||||
// Full range (end unspecified)
|
||||
let dl = ctx.client.download_byte_range(&path, 0, None).await?;
|
||||
let buf = download_and_compare(dl).await?;
|
||||
assert_eq!(buf, data);
|
||||
|
||||
debug!("Cleanup: deleting file at path {path:?}");
|
||||
ctx.client
|
||||
.delete(&path)
|
||||
.await
|
||||
.with_context(|| format!("{path:?} removal"))?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn ensure_logging_ready() {
|
||||
LOGGING_DONE.get_or_init(|| {
|
||||
utils::logging::init(
|
||||
utils::logging::LogFormat::Test,
|
||||
utils::logging::TracingErrorLayerEnablement::Disabled,
|
||||
)
|
||||
.expect("logging init failed");
|
||||
});
|
||||
}
|
||||
|
||||
struct EnabledAzure {
|
||||
client: Arc<GenericRemoteStorage>,
|
||||
base_prefix: &'static str,
|
||||
}
|
||||
|
||||
impl EnabledAzure {
|
||||
async fn setup(max_keys_in_list_response: Option<i32>) -> Self {
|
||||
let client = create_azure_client(max_keys_in_list_response)
|
||||
.context("Azure client creation")
|
||||
.expect("Azure client creation failed");
|
||||
|
||||
EnabledAzure {
|
||||
client,
|
||||
base_prefix: BASE_PREFIX,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
enum MaybeEnabledAzure {
|
||||
Enabled(EnabledAzure),
|
||||
Disabled,
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl AsyncTestContext for MaybeEnabledAzure {
|
||||
async fn setup() -> Self {
|
||||
ensure_logging_ready();
|
||||
|
||||
if env::var(ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME).is_err() {
|
||||
info!(
|
||||
"`{}` env variable is not set, skipping the test",
|
||||
ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME
|
||||
);
|
||||
return Self::Disabled;
|
||||
}
|
||||
|
||||
Self::Enabled(EnabledAzure::setup(None).await)
|
||||
}
|
||||
}
|
||||
|
||||
enum MaybeEnabledAzureWithTestBlobs {
|
||||
Enabled(AzureWithTestBlobs),
|
||||
Disabled,
|
||||
UploadsFailed(anyhow::Error, AzureWithTestBlobs),
|
||||
}
|
||||
|
||||
struct AzureWithTestBlobs {
|
||||
enabled: EnabledAzure,
|
||||
remote_prefixes: HashSet<RemotePath>,
|
||||
remote_blobs: HashSet<RemotePath>,
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl AsyncTestContext for MaybeEnabledAzureWithTestBlobs {
|
||||
async fn setup() -> Self {
|
||||
ensure_logging_ready();
|
||||
if env::var(ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME).is_err() {
|
||||
info!(
|
||||
"`{}` env variable is not set, skipping the test",
|
||||
ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME
|
||||
);
|
||||
return Self::Disabled;
|
||||
}
|
||||
|
||||
let max_keys_in_list_response = 10;
|
||||
let upload_tasks_count = 1 + (2 * usize::try_from(max_keys_in_list_response).unwrap());
|
||||
|
||||
let enabled = EnabledAzure::setup(Some(max_keys_in_list_response)).await;
|
||||
|
||||
match upload_azure_data(&enabled.client, enabled.base_prefix, upload_tasks_count).await {
|
||||
ControlFlow::Continue(uploads) => {
|
||||
info!("Remote objects created successfully");
|
||||
|
||||
Self::Enabled(AzureWithTestBlobs {
|
||||
enabled,
|
||||
remote_prefixes: uploads.prefixes,
|
||||
remote_blobs: uploads.blobs,
|
||||
})
|
||||
}
|
||||
ControlFlow::Break(uploads) => Self::UploadsFailed(
|
||||
anyhow::anyhow!("One or multiple blobs failed to upload to Azure"),
|
||||
AzureWithTestBlobs {
|
||||
enabled,
|
||||
remote_prefixes: uploads.prefixes,
|
||||
remote_blobs: uploads.blobs,
|
||||
},
|
||||
),
|
||||
}
|
||||
}
|
||||
|
||||
async fn teardown(self) {
|
||||
match self {
|
||||
Self::Disabled => {}
|
||||
Self::Enabled(ctx) | Self::UploadsFailed(_, ctx) => {
|
||||
cleanup(&ctx.enabled.client, ctx.remote_blobs).await;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// NOTE: the setups for the list_prefixes test and the list_files test are very similar
|
||||
// However, they are not idential. The list_prefixes function is concerned with listing prefixes,
|
||||
// whereas the list_files function is concerned with listing files.
|
||||
// See `RemoteStorage::list_files` documentation for more details
|
||||
enum MaybeEnabledAzureWithSimpleTestBlobs {
|
||||
Enabled(AzureWithSimpleTestBlobs),
|
||||
Disabled,
|
||||
UploadsFailed(anyhow::Error, AzureWithSimpleTestBlobs),
|
||||
}
|
||||
struct AzureWithSimpleTestBlobs {
|
||||
enabled: EnabledAzure,
|
||||
remote_blobs: HashSet<RemotePath>,
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl AsyncTestContext for MaybeEnabledAzureWithSimpleTestBlobs {
|
||||
async fn setup() -> Self {
|
||||
ensure_logging_ready();
|
||||
if env::var(ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME).is_err() {
|
||||
info!(
|
||||
"`{}` env variable is not set, skipping the test",
|
||||
ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME
|
||||
);
|
||||
return Self::Disabled;
|
||||
}
|
||||
|
||||
let max_keys_in_list_response = 10;
|
||||
let upload_tasks_count = 1 + (2 * usize::try_from(max_keys_in_list_response).unwrap());
|
||||
|
||||
let enabled = EnabledAzure::setup(Some(max_keys_in_list_response)).await;
|
||||
|
||||
match upload_simple_azure_data(&enabled.client, upload_tasks_count).await {
|
||||
ControlFlow::Continue(uploads) => {
|
||||
info!("Remote objects created successfully");
|
||||
|
||||
Self::Enabled(AzureWithSimpleTestBlobs {
|
||||
enabled,
|
||||
remote_blobs: uploads,
|
||||
})
|
||||
}
|
||||
ControlFlow::Break(uploads) => Self::UploadsFailed(
|
||||
anyhow::anyhow!("One or multiple blobs failed to upload to Azure"),
|
||||
AzureWithSimpleTestBlobs {
|
||||
enabled,
|
||||
remote_blobs: uploads,
|
||||
},
|
||||
),
|
||||
}
|
||||
}
|
||||
|
||||
async fn teardown(self) {
|
||||
match self {
|
||||
Self::Disabled => {}
|
||||
Self::Enabled(ctx) | Self::UploadsFailed(_, ctx) => {
|
||||
cleanup(&ctx.enabled.client, ctx.remote_blobs).await;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn create_azure_client(
|
||||
max_keys_per_list_response: Option<i32>,
|
||||
) -> anyhow::Result<Arc<GenericRemoteStorage>> {
|
||||
use rand::Rng;
|
||||
|
||||
let remote_storage_azure_container = env::var("REMOTE_STORAGE_AZURE_CONTAINER").context(
|
||||
"`REMOTE_STORAGE_AZURE_CONTAINER` env var is not set, but real Azure tests are enabled",
|
||||
)?;
|
||||
let remote_storage_azure_region = env::var("REMOTE_STORAGE_AZURE_REGION").context(
|
||||
"`REMOTE_STORAGE_AZURE_REGION` env var is not set, but real Azure tests are enabled",
|
||||
)?;
|
||||
|
||||
// due to how time works, we've had test runners use the same nanos as bucket prefixes.
|
||||
// millis is just a debugging aid for easier finding the prefix later.
|
||||
let millis = std::time::SystemTime::now()
|
||||
.duration_since(UNIX_EPOCH)
|
||||
.context("random Azure test prefix part calculation")?
|
||||
.as_millis();
|
||||
|
||||
// because nanos can be the same for two threads so can millis, add randomness
|
||||
let random = rand::thread_rng().gen::<u32>();
|
||||
|
||||
let remote_storage_config = RemoteStorageConfig {
|
||||
max_concurrent_syncs: NonZeroUsize::new(100).unwrap(),
|
||||
max_sync_errors: NonZeroU32::new(5).unwrap(),
|
||||
storage: RemoteStorageKind::AzureContainer(AzureConfig {
|
||||
container_name: remote_storage_azure_container,
|
||||
container_region: remote_storage_azure_region,
|
||||
prefix_in_container: Some(format!("test_{millis}_{random:08x}/")),
|
||||
concurrency_limit: NonZeroUsize::new(100).unwrap(),
|
||||
max_keys_per_list_response,
|
||||
}),
|
||||
};
|
||||
Ok(Arc::new(
|
||||
GenericRemoteStorage::from_config(&remote_storage_config).context("remote storage init")?,
|
||||
))
|
||||
}
|
||||
|
||||
struct Uploads {
|
||||
prefixes: HashSet<RemotePath>,
|
||||
blobs: HashSet<RemotePath>,
|
||||
}
|
||||
|
||||
async fn upload_azure_data(
|
||||
client: &Arc<GenericRemoteStorage>,
|
||||
base_prefix_str: &'static str,
|
||||
upload_tasks_count: usize,
|
||||
) -> ControlFlow<Uploads, Uploads> {
|
||||
info!("Creating {upload_tasks_count} Azure files");
|
||||
let mut upload_tasks = JoinSet::new();
|
||||
for i in 1..upload_tasks_count + 1 {
|
||||
let task_client = Arc::clone(client);
|
||||
upload_tasks.spawn(async move {
|
||||
let prefix = format!("{base_prefix_str}/sub_prefix_{i}/");
|
||||
let blob_prefix = RemotePath::new(Utf8Path::new(&prefix))
|
||||
.with_context(|| format!("{prefix:?} to RemotePath conversion"))?;
|
||||
let blob_path = blob_prefix.join(Utf8Path::new(&format!("blob_{i}")));
|
||||
debug!("Creating remote item {i} at path {blob_path:?}");
|
||||
|
||||
let data = format!("remote blob data {i}").into_bytes();
|
||||
let data_len = data.len();
|
||||
task_client
|
||||
.upload(std::io::Cursor::new(data), data_len, &blob_path, None)
|
||||
.await?;
|
||||
|
||||
Ok::<_, anyhow::Error>((blob_prefix, blob_path))
|
||||
});
|
||||
}
|
||||
|
||||
let mut upload_tasks_failed = false;
|
||||
let mut uploaded_prefixes = HashSet::with_capacity(upload_tasks_count);
|
||||
let mut uploaded_blobs = HashSet::with_capacity(upload_tasks_count);
|
||||
while let Some(task_run_result) = upload_tasks.join_next().await {
|
||||
match task_run_result
|
||||
.context("task join failed")
|
||||
.and_then(|task_result| task_result.context("upload task failed"))
|
||||
{
|
||||
Ok((upload_prefix, upload_path)) => {
|
||||
uploaded_prefixes.insert(upload_prefix);
|
||||
uploaded_blobs.insert(upload_path);
|
||||
}
|
||||
Err(e) => {
|
||||
error!("Upload task failed: {e:?}");
|
||||
upload_tasks_failed = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let uploads = Uploads {
|
||||
prefixes: uploaded_prefixes,
|
||||
blobs: uploaded_blobs,
|
||||
};
|
||||
if upload_tasks_failed {
|
||||
ControlFlow::Break(uploads)
|
||||
} else {
|
||||
ControlFlow::Continue(uploads)
|
||||
}
|
||||
}
|
||||
|
||||
async fn cleanup(client: &Arc<GenericRemoteStorage>, objects_to_delete: HashSet<RemotePath>) {
|
||||
info!(
|
||||
"Removing {} objects from the remote storage during cleanup",
|
||||
objects_to_delete.len()
|
||||
);
|
||||
let mut delete_tasks = JoinSet::new();
|
||||
for object_to_delete in objects_to_delete {
|
||||
let task_client = Arc::clone(client);
|
||||
delete_tasks.spawn(async move {
|
||||
debug!("Deleting remote item at path {object_to_delete:?}");
|
||||
task_client
|
||||
.delete(&object_to_delete)
|
||||
.await
|
||||
.with_context(|| format!("{object_to_delete:?} removal"))
|
||||
});
|
||||
}
|
||||
|
||||
while let Some(task_run_result) = delete_tasks.join_next().await {
|
||||
match task_run_result {
|
||||
Ok(task_result) => match task_result {
|
||||
Ok(()) => {}
|
||||
Err(e) => error!("Delete task failed: {e:?}"),
|
||||
},
|
||||
Err(join_err) => error!("Delete task did not finish correctly: {join_err}"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Uploads files `folder{j}/blob{i}.txt`. See test description for more details.
|
||||
async fn upload_simple_azure_data(
|
||||
client: &Arc<GenericRemoteStorage>,
|
||||
upload_tasks_count: usize,
|
||||
) -> ControlFlow<HashSet<RemotePath>, HashSet<RemotePath>> {
|
||||
info!("Creating {upload_tasks_count} Azure files");
|
||||
let mut upload_tasks = JoinSet::new();
|
||||
for i in 1..upload_tasks_count + 1 {
|
||||
let task_client = Arc::clone(client);
|
||||
upload_tasks.spawn(async move {
|
||||
let blob_path = PathBuf::from(format!("folder{}/blob_{}.txt", i / 7, i));
|
||||
let blob_path = RemotePath::new(
|
||||
Utf8Path::from_path(blob_path.as_path()).expect("must be valid blob path"),
|
||||
)
|
||||
.with_context(|| format!("{blob_path:?} to RemotePath conversion"))?;
|
||||
debug!("Creating remote item {i} at path {blob_path:?}");
|
||||
|
||||
let data = format!("remote blob data {i}").into_bytes();
|
||||
let data_len = data.len();
|
||||
task_client
|
||||
.upload(std::io::Cursor::new(data), data_len, &blob_path, None)
|
||||
.await?;
|
||||
|
||||
Ok::<_, anyhow::Error>(blob_path)
|
||||
});
|
||||
}
|
||||
|
||||
let mut upload_tasks_failed = false;
|
||||
let mut uploaded_blobs = HashSet::with_capacity(upload_tasks_count);
|
||||
while let Some(task_run_result) = upload_tasks.join_next().await {
|
||||
match task_run_result
|
||||
.context("task join failed")
|
||||
.and_then(|task_result| task_result.context("upload task failed"))
|
||||
{
|
||||
Ok(upload_path) => {
|
||||
uploaded_blobs.insert(upload_path);
|
||||
}
|
||||
Err(e) => {
|
||||
error!("Upload task failed: {e:?}");
|
||||
upload_tasks_failed = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if upload_tasks_failed {
|
||||
ControlFlow::Break(uploaded_blobs)
|
||||
} else {
|
||||
ControlFlow::Continue(uploaded_blobs)
|
||||
}
|
||||
}
|
||||
@@ -1,23 +1,18 @@
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_with::{serde_as, DisplayFromStr};
|
||||
|
||||
use utils::{
|
||||
id::{NodeId, TenantId, TimelineId},
|
||||
lsn::Lsn,
|
||||
};
|
||||
|
||||
#[serde_as]
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct TimelineCreateRequest {
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub tenant_id: TenantId,
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub timeline_id: TimelineId,
|
||||
pub peer_ids: Option<Vec<NodeId>>,
|
||||
pub pg_version: u32,
|
||||
pub system_id: Option<u64>,
|
||||
pub wal_seg_size: Option<u32>,
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub commit_lsn: Lsn,
|
||||
// If not passed, it is assigned to the beginning of commit_lsn segment.
|
||||
pub local_start_lsn: Option<Lsn>,
|
||||
@@ -28,7 +23,6 @@ fn lsn_invalid() -> Lsn {
|
||||
}
|
||||
|
||||
/// Data about safekeeper's timeline, mirrors broker.proto.
|
||||
#[serde_as]
|
||||
#[derive(Debug, Clone, Deserialize, Serialize)]
|
||||
pub struct SkTimelineInfo {
|
||||
/// Term.
|
||||
@@ -36,25 +30,19 @@ pub struct SkTimelineInfo {
|
||||
/// Term of the last entry.
|
||||
pub last_log_term: Option<u64>,
|
||||
/// LSN of the last record.
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
#[serde(default = "lsn_invalid")]
|
||||
pub flush_lsn: Lsn,
|
||||
/// Up to which LSN safekeeper regards its WAL as committed.
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
#[serde(default = "lsn_invalid")]
|
||||
pub commit_lsn: Lsn,
|
||||
/// LSN up to which safekeeper has backed WAL.
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
#[serde(default = "lsn_invalid")]
|
||||
pub backup_lsn: Lsn,
|
||||
/// LSN of last checkpoint uploaded by pageserver.
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
#[serde(default = "lsn_invalid")]
|
||||
pub remote_consistent_lsn: Lsn,
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
#[serde(default = "lsn_invalid")]
|
||||
pub peer_horizon_lsn: Lsn,
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
#[serde(default = "lsn_invalid")]
|
||||
pub local_start_lsn: Lsn,
|
||||
/// A connection string to use for WAL receiving.
|
||||
|
||||
@@ -55,6 +55,7 @@ bytes.workspace = true
|
||||
criterion.workspace = true
|
||||
hex-literal.workspace = true
|
||||
camino-tempfile.workspace = true
|
||||
serde_assert.workspace = true
|
||||
|
||||
[[bench]]
|
||||
name = "benchmarks"
|
||||
|
||||
@@ -9,7 +9,6 @@ use jsonwebtoken::{
|
||||
decode, encode, Algorithm, DecodingKey, EncodingKey, Header, TokenData, Validation,
|
||||
};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_with::{serde_as, DisplayFromStr};
|
||||
|
||||
use crate::id::TenantId;
|
||||
|
||||
@@ -32,11 +31,9 @@ pub enum Scope {
|
||||
}
|
||||
|
||||
/// JWT payload. See docs/authentication.md for the format
|
||||
#[serde_as]
|
||||
#[derive(Debug, Serialize, Deserialize, Clone, PartialEq)]
|
||||
pub struct Claims {
|
||||
#[serde(default)]
|
||||
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||
pub tenant_id: Option<TenantId>,
|
||||
pub scope: Scope,
|
||||
}
|
||||
|
||||
@@ -7,7 +7,7 @@ use serde::{Deserialize, Serialize};
|
||||
///
|
||||
/// See docs/rfcs/025-generation-numbers.md for detail on how generation
|
||||
/// numbers are used.
|
||||
#[derive(Copy, Clone, Eq, PartialEq, PartialOrd, Ord)]
|
||||
#[derive(Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)]
|
||||
pub enum Generation {
|
||||
// Generations with this magic value will not add a suffix to S3 keys, and will not
|
||||
// be included in persisted index_part.json. This value is only to be used
|
||||
|
||||
41
libs/utils/src/hex.rs
Normal file
41
libs/utils/src/hex.rs
Normal file
@@ -0,0 +1,41 @@
|
||||
/// Useful type for asserting that expected bytes match reporting the bytes more readable
|
||||
/// array-syntax compatible hex bytes.
|
||||
///
|
||||
/// # Usage
|
||||
///
|
||||
/// ```
|
||||
/// use utils::Hex;
|
||||
///
|
||||
/// let actual = serialize_something();
|
||||
/// let expected = [0x68, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x77, 0x6f, 0x72, 0x6c, 0x64];
|
||||
///
|
||||
/// // the type implements PartialEq and on mismatch, both sides are printed in 16 wide multiline
|
||||
/// // output suffixed with an array style length for easier comparisons.
|
||||
/// assert_eq!(Hex(&actual), Hex(&expected));
|
||||
///
|
||||
/// // with `let expected = [0x68];` the error would had been:
|
||||
/// // assertion `left == right` failed
|
||||
/// // left: [0x68, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x77, 0x6f, 0x72, 0x6c, 0x64; 11]
|
||||
/// // right: [0x68; 1]
|
||||
/// # fn serialize_something() -> Vec<u8> { "hello world".as_bytes().to_vec() }
|
||||
/// ```
|
||||
#[derive(PartialEq)]
|
||||
pub struct Hex<'a>(pub &'a [u8]);
|
||||
|
||||
impl std::fmt::Debug for Hex<'_> {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(f, "[")?;
|
||||
for (i, c) in self.0.chunks(16).enumerate() {
|
||||
if i > 0 && !c.is_empty() {
|
||||
writeln!(f, ", ")?;
|
||||
}
|
||||
for (j, b) in c.iter().enumerate() {
|
||||
if j > 0 {
|
||||
write!(f, ", ")?;
|
||||
}
|
||||
write!(f, "0x{b:02x}")?;
|
||||
}
|
||||
}
|
||||
write!(f, "; {}]", self.0.len())
|
||||
}
|
||||
}
|
||||
@@ -14,6 +14,11 @@ use tracing::{self, debug, info, info_span, warn, Instrument};
|
||||
use std::future::Future;
|
||||
use std::str::FromStr;
|
||||
|
||||
use bytes::{Bytes, BytesMut};
|
||||
use std::io::Write as _;
|
||||
use tokio::sync::mpsc;
|
||||
use tokio_stream::wrappers::ReceiverStream;
|
||||
|
||||
static SERVE_METRICS_COUNT: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"libmetrics_metric_handler_requests_total",
|
||||
@@ -146,94 +151,89 @@ impl Drop for RequestCancelled {
|
||||
}
|
||||
}
|
||||
|
||||
/// An [`std::io::Write`] implementation on top of a channel sending [`bytes::Bytes`] chunks.
|
||||
pub struct ChannelWriter {
|
||||
buffer: BytesMut,
|
||||
pub tx: mpsc::Sender<std::io::Result<Bytes>>,
|
||||
written: usize,
|
||||
}
|
||||
|
||||
impl ChannelWriter {
|
||||
pub fn new(buf_len: usize, tx: mpsc::Sender<std::io::Result<Bytes>>) -> Self {
|
||||
assert_ne!(buf_len, 0);
|
||||
ChannelWriter {
|
||||
// split about half off the buffer from the start, because we flush depending on
|
||||
// capacity. first flush will come sooner than without this, but now resizes will
|
||||
// have better chance of picking up the "other" half. not guaranteed of course.
|
||||
buffer: BytesMut::with_capacity(buf_len).split_off(buf_len / 2),
|
||||
tx,
|
||||
written: 0,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn flush0(&mut self) -> std::io::Result<usize> {
|
||||
let n = self.buffer.len();
|
||||
if n == 0 {
|
||||
return Ok(0);
|
||||
}
|
||||
|
||||
tracing::trace!(n, "flushing");
|
||||
let ready = self.buffer.split().freeze();
|
||||
|
||||
// not ideal to call from blocking code to block_on, but we are sure that this
|
||||
// operation does not spawn_blocking other tasks
|
||||
let res: Result<(), ()> = tokio::runtime::Handle::current().block_on(async {
|
||||
self.tx.send(Ok(ready)).await.map_err(|_| ())?;
|
||||
|
||||
// throttle sending to allow reuse of our buffer in `write`.
|
||||
self.tx.reserve().await.map_err(|_| ())?;
|
||||
|
||||
// now the response task has picked up the buffer and hopefully started
|
||||
// sending it to the client.
|
||||
Ok(())
|
||||
});
|
||||
if res.is_err() {
|
||||
return Err(std::io::ErrorKind::BrokenPipe.into());
|
||||
}
|
||||
self.written += n;
|
||||
Ok(n)
|
||||
}
|
||||
|
||||
pub fn flushed_bytes(&self) -> usize {
|
||||
self.written
|
||||
}
|
||||
}
|
||||
|
||||
impl std::io::Write for ChannelWriter {
|
||||
fn write(&mut self, mut buf: &[u8]) -> std::io::Result<usize> {
|
||||
let remaining = self.buffer.capacity() - self.buffer.len();
|
||||
|
||||
let out_of_space = remaining < buf.len();
|
||||
|
||||
let original_len = buf.len();
|
||||
|
||||
if out_of_space {
|
||||
let can_still_fit = buf.len() - remaining;
|
||||
self.buffer.extend_from_slice(&buf[..can_still_fit]);
|
||||
buf = &buf[can_still_fit..];
|
||||
self.flush0()?;
|
||||
}
|
||||
|
||||
// assume that this will often under normal operation just move the pointer back to the
|
||||
// beginning of allocation, because previous split off parts are already sent and
|
||||
// dropped.
|
||||
self.buffer.extend_from_slice(buf);
|
||||
Ok(original_len)
|
||||
}
|
||||
|
||||
fn flush(&mut self) -> std::io::Result<()> {
|
||||
self.flush0().map(|_| ())
|
||||
}
|
||||
}
|
||||
|
||||
async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
use bytes::{Bytes, BytesMut};
|
||||
use std::io::Write as _;
|
||||
use tokio::sync::mpsc;
|
||||
use tokio_stream::wrappers::ReceiverStream;
|
||||
|
||||
SERVE_METRICS_COUNT.inc();
|
||||
|
||||
/// An [`std::io::Write`] implementation on top of a channel sending [`bytes::Bytes`] chunks.
|
||||
struct ChannelWriter {
|
||||
buffer: BytesMut,
|
||||
tx: mpsc::Sender<std::io::Result<Bytes>>,
|
||||
written: usize,
|
||||
}
|
||||
|
||||
impl ChannelWriter {
|
||||
fn new(buf_len: usize, tx: mpsc::Sender<std::io::Result<Bytes>>) -> Self {
|
||||
assert_ne!(buf_len, 0);
|
||||
ChannelWriter {
|
||||
// split about half off the buffer from the start, because we flush depending on
|
||||
// capacity. first flush will come sooner than without this, but now resizes will
|
||||
// have better chance of picking up the "other" half. not guaranteed of course.
|
||||
buffer: BytesMut::with_capacity(buf_len).split_off(buf_len / 2),
|
||||
tx,
|
||||
written: 0,
|
||||
}
|
||||
}
|
||||
|
||||
fn flush0(&mut self) -> std::io::Result<usize> {
|
||||
let n = self.buffer.len();
|
||||
if n == 0 {
|
||||
return Ok(0);
|
||||
}
|
||||
|
||||
tracing::trace!(n, "flushing");
|
||||
let ready = self.buffer.split().freeze();
|
||||
|
||||
// not ideal to call from blocking code to block_on, but we are sure that this
|
||||
// operation does not spawn_blocking other tasks
|
||||
let res: Result<(), ()> = tokio::runtime::Handle::current().block_on(async {
|
||||
self.tx.send(Ok(ready)).await.map_err(|_| ())?;
|
||||
|
||||
// throttle sending to allow reuse of our buffer in `write`.
|
||||
self.tx.reserve().await.map_err(|_| ())?;
|
||||
|
||||
// now the response task has picked up the buffer and hopefully started
|
||||
// sending it to the client.
|
||||
Ok(())
|
||||
});
|
||||
if res.is_err() {
|
||||
return Err(std::io::ErrorKind::BrokenPipe.into());
|
||||
}
|
||||
self.written += n;
|
||||
Ok(n)
|
||||
}
|
||||
|
||||
fn flushed_bytes(&self) -> usize {
|
||||
self.written
|
||||
}
|
||||
}
|
||||
|
||||
impl std::io::Write for ChannelWriter {
|
||||
fn write(&mut self, mut buf: &[u8]) -> std::io::Result<usize> {
|
||||
let remaining = self.buffer.capacity() - self.buffer.len();
|
||||
|
||||
let out_of_space = remaining < buf.len();
|
||||
|
||||
let original_len = buf.len();
|
||||
|
||||
if out_of_space {
|
||||
let can_still_fit = buf.len() - remaining;
|
||||
self.buffer.extend_from_slice(&buf[..can_still_fit]);
|
||||
buf = &buf[can_still_fit..];
|
||||
self.flush0()?;
|
||||
}
|
||||
|
||||
// assume that this will often under normal operation just move the pointer back to the
|
||||
// beginning of allocation, because previous split off parts are already sent and
|
||||
// dropped.
|
||||
self.buffer.extend_from_slice(buf);
|
||||
Ok(original_len)
|
||||
}
|
||||
|
||||
fn flush(&mut self) -> std::io::Result<()> {
|
||||
self.flush0().map(|_| ())
|
||||
}
|
||||
}
|
||||
|
||||
let started_at = std::time::Instant::now();
|
||||
|
||||
let (tx, rx) = mpsc::channel(1);
|
||||
|
||||
@@ -1,8 +1,9 @@
|
||||
use hyper::{header, Body, Response, StatusCode};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::borrow::Cow;
|
||||
use std::error::Error as StdError;
|
||||
use thiserror::Error;
|
||||
use tracing::error;
|
||||
use tracing::{error, info};
|
||||
|
||||
#[derive(Debug, Error)]
|
||||
pub enum ApiError {
|
||||
@@ -25,7 +26,7 @@ pub enum ApiError {
|
||||
PreconditionFailed(Box<str>),
|
||||
|
||||
#[error("Resource temporarily unavailable: {0}")]
|
||||
ResourceUnavailable(String),
|
||||
ResourceUnavailable(Cow<'static, str>),
|
||||
|
||||
#[error("Shutting down")]
|
||||
ShuttingDown,
|
||||
@@ -115,10 +116,12 @@ pub async fn route_error_handler(err: routerify::RouteError) -> Response<Body> {
|
||||
|
||||
pub fn api_error_handler(api_error: ApiError) -> Response<Body> {
|
||||
// Print a stack trace for Internal Server errors
|
||||
if let ApiError::InternalServerError(_) = api_error {
|
||||
error!("Error processing HTTP request: {api_error:?}");
|
||||
} else {
|
||||
error!("Error processing HTTP request: {api_error:#}");
|
||||
|
||||
match api_error {
|
||||
ApiError::ResourceUnavailable(_) => info!("Error processing HTTP request: {api_error:#}"),
|
||||
ApiError::NotFound(_) => info!("Error processing HTTP request: {api_error:#}"),
|
||||
ApiError::InternalServerError(_) => error!("Error processing HTTP request: {api_error:?}"),
|
||||
_ => error!("Error processing HTTP request: {api_error:#}"),
|
||||
}
|
||||
|
||||
api_error.into_response()
|
||||
|
||||
@@ -3,6 +3,7 @@ use std::{fmt, str::FromStr};
|
||||
use anyhow::Context;
|
||||
use hex::FromHex;
|
||||
use rand::Rng;
|
||||
use serde::de::Visitor;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use thiserror::Error;
|
||||
|
||||
@@ -17,12 +18,74 @@ pub enum IdError {
|
||||
///
|
||||
/// NOTE: It (de)serializes as an array of hex bytes, so the string representation would look
|
||||
/// like `[173,80,132,115,129,226,72,254,170,201,135,108,199,26,228,24]`.
|
||||
///
|
||||
/// Use `#[serde_as(as = "DisplayFromStr")]` to (de)serialize it as hex string instead: `ad50847381e248feaac9876cc71ae418`.
|
||||
/// Check the `serde_with::serde_as` documentation for options for more complex types.
|
||||
#[derive(Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, PartialOrd, Ord)]
|
||||
#[derive(Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)]
|
||||
struct Id([u8; 16]);
|
||||
|
||||
impl Serialize for Id {
|
||||
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
|
||||
where
|
||||
S: serde::Serializer,
|
||||
{
|
||||
if serializer.is_human_readable() {
|
||||
serializer.collect_str(self)
|
||||
} else {
|
||||
self.0.serialize(serializer)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'de> Deserialize<'de> for Id {
|
||||
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
|
||||
where
|
||||
D: serde::Deserializer<'de>,
|
||||
{
|
||||
struct IdVisitor {
|
||||
is_human_readable_deserializer: bool,
|
||||
}
|
||||
|
||||
impl<'de> Visitor<'de> for IdVisitor {
|
||||
type Value = Id;
|
||||
|
||||
fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
|
||||
if self.is_human_readable_deserializer {
|
||||
formatter.write_str("value in form of hex string")
|
||||
} else {
|
||||
formatter.write_str("value in form of integer array([u8; 16])")
|
||||
}
|
||||
}
|
||||
|
||||
fn visit_seq<A>(self, seq: A) -> Result<Self::Value, A::Error>
|
||||
where
|
||||
A: serde::de::SeqAccess<'de>,
|
||||
{
|
||||
let s = serde::de::value::SeqAccessDeserializer::new(seq);
|
||||
let id: [u8; 16] = Deserialize::deserialize(s)?;
|
||||
Ok(Id::from(id))
|
||||
}
|
||||
|
||||
fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
|
||||
where
|
||||
E: serde::de::Error,
|
||||
{
|
||||
Id::from_str(v).map_err(E::custom)
|
||||
}
|
||||
}
|
||||
|
||||
if deserializer.is_human_readable() {
|
||||
deserializer.deserialize_str(IdVisitor {
|
||||
is_human_readable_deserializer: true,
|
||||
})
|
||||
} else {
|
||||
deserializer.deserialize_tuple(
|
||||
16,
|
||||
IdVisitor {
|
||||
is_human_readable_deserializer: false,
|
||||
},
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Id {
|
||||
pub fn get_from_buf(buf: &mut impl bytes::Buf) -> Id {
|
||||
let mut arr = [0u8; 16];
|
||||
@@ -308,3 +371,112 @@ impl fmt::Display for NodeId {
|
||||
write!(f, "{}", self.0)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use serde_assert::{Deserializer, Serializer, Token, Tokens};
|
||||
|
||||
use crate::bin_ser::BeSer;
|
||||
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_id_serde_non_human_readable() {
|
||||
let original_id = Id([
|
||||
173, 80, 132, 115, 129, 226, 72, 254, 170, 201, 135, 108, 199, 26, 228, 24,
|
||||
]);
|
||||
let expected_tokens = Tokens(vec![
|
||||
Token::Tuple { len: 16 },
|
||||
Token::U8(173),
|
||||
Token::U8(80),
|
||||
Token::U8(132),
|
||||
Token::U8(115),
|
||||
Token::U8(129),
|
||||
Token::U8(226),
|
||||
Token::U8(72),
|
||||
Token::U8(254),
|
||||
Token::U8(170),
|
||||
Token::U8(201),
|
||||
Token::U8(135),
|
||||
Token::U8(108),
|
||||
Token::U8(199),
|
||||
Token::U8(26),
|
||||
Token::U8(228),
|
||||
Token::U8(24),
|
||||
Token::TupleEnd,
|
||||
]);
|
||||
|
||||
let serializer = Serializer::builder().is_human_readable(false).build();
|
||||
let serialized_tokens = original_id.serialize(&serializer).unwrap();
|
||||
assert_eq!(serialized_tokens, expected_tokens);
|
||||
|
||||
let mut deserializer = Deserializer::builder()
|
||||
.is_human_readable(false)
|
||||
.tokens(serialized_tokens)
|
||||
.build();
|
||||
let deserialized_id = Id::deserialize(&mut deserializer).unwrap();
|
||||
assert_eq!(deserialized_id, original_id);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_id_serde_human_readable() {
|
||||
let original_id = Id([
|
||||
173, 80, 132, 115, 129, 226, 72, 254, 170, 201, 135, 108, 199, 26, 228, 24,
|
||||
]);
|
||||
let expected_tokens = Tokens(vec![Token::Str(String::from(
|
||||
"ad50847381e248feaac9876cc71ae418",
|
||||
))]);
|
||||
|
||||
let serializer = Serializer::builder().is_human_readable(true).build();
|
||||
let serialized_tokens = original_id.serialize(&serializer).unwrap();
|
||||
assert_eq!(serialized_tokens, expected_tokens);
|
||||
|
||||
let mut deserializer = Deserializer::builder()
|
||||
.is_human_readable(true)
|
||||
.tokens(Tokens(vec![Token::Str(String::from(
|
||||
"ad50847381e248feaac9876cc71ae418",
|
||||
))]))
|
||||
.build();
|
||||
assert_eq!(Id::deserialize(&mut deserializer).unwrap(), original_id);
|
||||
}
|
||||
|
||||
macro_rules! roundtrip_type {
|
||||
($type:ty, $expected_bytes:expr) => {{
|
||||
let expected_bytes: [u8; 16] = $expected_bytes;
|
||||
let original_id = <$type>::from(expected_bytes);
|
||||
|
||||
let ser_bytes = original_id.ser().unwrap();
|
||||
assert_eq!(ser_bytes, expected_bytes);
|
||||
|
||||
let des_id = <$type>::des(&ser_bytes).unwrap();
|
||||
assert_eq!(des_id, original_id);
|
||||
}};
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_id_bincode_serde() {
|
||||
let expected_bytes = [
|
||||
173, 80, 132, 115, 129, 226, 72, 254, 170, 201, 135, 108, 199, 26, 228, 24,
|
||||
];
|
||||
|
||||
roundtrip_type!(Id, expected_bytes);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_tenant_id_bincode_serde() {
|
||||
let expected_bytes = [
|
||||
173, 80, 132, 115, 129, 226, 72, 254, 170, 201, 135, 108, 199, 26, 228, 24,
|
||||
];
|
||||
|
||||
roundtrip_type!(TenantId, expected_bytes);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_timeline_id_bincode_serde() {
|
||||
let expected_bytes = [
|
||||
173, 80, 132, 115, 129, 226, 72, 254, 170, 201, 135, 108, 199, 26, 228, 24,
|
||||
];
|
||||
|
||||
roundtrip_type!(TimelineId, expected_bytes);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -24,6 +24,10 @@ pub mod auth;
|
||||
|
||||
// utility functions and helper traits for unified unique id generation/serialization etc.
|
||||
pub mod id;
|
||||
|
||||
mod hex;
|
||||
pub use hex::Hex;
|
||||
|
||||
// http endpoint utils
|
||||
pub mod http;
|
||||
|
||||
@@ -73,6 +77,11 @@ pub mod completion;
|
||||
/// Reporting utilities
|
||||
pub mod error;
|
||||
|
||||
/// async timeout helper
|
||||
pub mod timeout;
|
||||
|
||||
pub mod sync;
|
||||
|
||||
/// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
|
||||
///
|
||||
/// we have several cases:
|
||||
@@ -128,6 +137,21 @@ macro_rules! project_git_version {
|
||||
};
|
||||
}
|
||||
|
||||
/// This is a shortcut to embed build tag into binaries and avoid copying the same build script to all packages
|
||||
#[macro_export]
|
||||
macro_rules! project_build_tag {
|
||||
($const_identifier:ident) => {
|
||||
const $const_identifier: &::core::primitive::str = {
|
||||
const __ARG: &[&::core::primitive::str; 2] = &match ::core::option_env!("BUILD_TAG") {
|
||||
::core::option::Option::Some(x) => ["build_tag-env:", x],
|
||||
::core::option::Option::None => ["build_tag:", ""],
|
||||
};
|
||||
|
||||
$crate::__const_format::concatcp!(__ARG[0], __ARG[1])
|
||||
};
|
||||
};
|
||||
}
|
||||
|
||||
/// Re-export for `project_git_version` macro
|
||||
#[doc(hidden)]
|
||||
pub use const_format as __const_format;
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
#![warn(missing_docs)]
|
||||
|
||||
use camino::Utf8Path;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde::{de::Visitor, Deserialize, Serialize};
|
||||
use std::fmt;
|
||||
use std::ops::{Add, AddAssign};
|
||||
use std::str::FromStr;
|
||||
@@ -13,10 +13,114 @@ use crate::seqwait::MonotonicCounter;
|
||||
pub const XLOG_BLCKSZ: u32 = 8192;
|
||||
|
||||
/// A Postgres LSN (Log Sequence Number), also known as an XLogRecPtr
|
||||
#[derive(Clone, Copy, Eq, Ord, PartialEq, PartialOrd, Hash, Serialize, Deserialize)]
|
||||
#[serde(transparent)]
|
||||
#[derive(Clone, Copy, Eq, Ord, PartialEq, PartialOrd, Hash)]
|
||||
pub struct Lsn(pub u64);
|
||||
|
||||
impl Serialize for Lsn {
|
||||
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
|
||||
where
|
||||
S: serde::Serializer,
|
||||
{
|
||||
if serializer.is_human_readable() {
|
||||
serializer.collect_str(self)
|
||||
} else {
|
||||
self.0.serialize(serializer)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'de> Deserialize<'de> for Lsn {
|
||||
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
|
||||
where
|
||||
D: serde::Deserializer<'de>,
|
||||
{
|
||||
struct LsnVisitor {
|
||||
is_human_readable_deserializer: bool,
|
||||
}
|
||||
|
||||
impl<'de> Visitor<'de> for LsnVisitor {
|
||||
type Value = Lsn;
|
||||
|
||||
fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
|
||||
if self.is_human_readable_deserializer {
|
||||
formatter.write_str(
|
||||
"value in form of hex string({upper_u32_hex}/{lower_u32_hex}) representing u64 integer",
|
||||
)
|
||||
} else {
|
||||
formatter.write_str("value in form of integer(u64)")
|
||||
}
|
||||
}
|
||||
|
||||
fn visit_u64<E>(self, v: u64) -> Result<Self::Value, E>
|
||||
where
|
||||
E: serde::de::Error,
|
||||
{
|
||||
Ok(Lsn(v))
|
||||
}
|
||||
|
||||
fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
|
||||
where
|
||||
E: serde::de::Error,
|
||||
{
|
||||
Lsn::from_str(v).map_err(|e| E::custom(e))
|
||||
}
|
||||
}
|
||||
|
||||
if deserializer.is_human_readable() {
|
||||
deserializer.deserialize_str(LsnVisitor {
|
||||
is_human_readable_deserializer: true,
|
||||
})
|
||||
} else {
|
||||
deserializer.deserialize_u64(LsnVisitor {
|
||||
is_human_readable_deserializer: false,
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Allows (de)serialization of an `Lsn` always as `u64`.
|
||||
///
|
||||
/// ### Example
|
||||
///
|
||||
/// ```rust
|
||||
/// # use serde::{Serialize, Deserialize};
|
||||
/// use utils::lsn::Lsn;
|
||||
///
|
||||
/// #[derive(PartialEq, Serialize, Deserialize, Debug)]
|
||||
/// struct Foo {
|
||||
/// #[serde(with = "utils::lsn::serde_as_u64")]
|
||||
/// always_u64: Lsn,
|
||||
/// }
|
||||
///
|
||||
/// let orig = Foo { always_u64: Lsn(1234) };
|
||||
///
|
||||
/// let res = serde_json::to_string(&orig).unwrap();
|
||||
/// assert_eq!(res, r#"{"always_u64":1234}"#);
|
||||
///
|
||||
/// let foo = serde_json::from_str::<Foo>(&res).unwrap();
|
||||
/// assert_eq!(foo, orig);
|
||||
/// ```
|
||||
///
|
||||
pub mod serde_as_u64 {
|
||||
use super::Lsn;
|
||||
|
||||
/// Serializes the Lsn as u64 disregarding the human readability of the format.
|
||||
///
|
||||
/// Meant to be used via `#[serde(with = "...")]` or `#[serde(serialize_with = "...")]`.
|
||||
pub fn serialize<S: serde::Serializer>(lsn: &Lsn, serializer: S) -> Result<S::Ok, S::Error> {
|
||||
use serde::Serialize;
|
||||
lsn.0.serialize(serializer)
|
||||
}
|
||||
|
||||
/// Deserializes the Lsn as u64 disregarding the human readability of the format.
|
||||
///
|
||||
/// Meant to be used via `#[serde(with = "...")]` or `#[serde(deserialize_with = "...")]`.
|
||||
pub fn deserialize<'de, D: serde::Deserializer<'de>>(deserializer: D) -> Result<Lsn, D::Error> {
|
||||
use serde::Deserialize;
|
||||
u64::deserialize(deserializer).map(Lsn)
|
||||
}
|
||||
}
|
||||
|
||||
/// We tried to parse an LSN from a string, but failed
|
||||
#[derive(Debug, PartialEq, Eq, thiserror::Error)]
|
||||
#[error("LsnParseError")]
|
||||
@@ -264,8 +368,13 @@ impl MonotonicCounter<Lsn> for RecordLsn {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::bin_ser::BeSer;
|
||||
|
||||
use super::*;
|
||||
|
||||
use serde::ser::Serialize;
|
||||
use serde_assert::{Deserializer, Serializer, Token, Tokens};
|
||||
|
||||
#[test]
|
||||
fn test_lsn_strings() {
|
||||
assert_eq!("12345678/AAAA5555".parse(), Ok(Lsn(0x12345678AAAA5555)));
|
||||
@@ -341,4 +450,95 @@ mod tests {
|
||||
assert_eq!(lsn.fetch_max(Lsn(6000)), Lsn(5678));
|
||||
assert_eq!(lsn.fetch_max(Lsn(5000)), Lsn(6000));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_lsn_serde() {
|
||||
let original_lsn = Lsn(0x0123456789abcdef);
|
||||
let expected_readable_tokens = Tokens(vec![Token::U64(0x0123456789abcdef)]);
|
||||
let expected_non_readable_tokens =
|
||||
Tokens(vec![Token::Str(String::from("1234567/89ABCDEF"))]);
|
||||
|
||||
// Testing human_readable ser/de
|
||||
let serializer = Serializer::builder().is_human_readable(false).build();
|
||||
let readable_ser_tokens = original_lsn.serialize(&serializer).unwrap();
|
||||
assert_eq!(readable_ser_tokens, expected_readable_tokens);
|
||||
|
||||
let mut deserializer = Deserializer::builder()
|
||||
.is_human_readable(false)
|
||||
.tokens(readable_ser_tokens)
|
||||
.build();
|
||||
let des_lsn = Lsn::deserialize(&mut deserializer).unwrap();
|
||||
assert_eq!(des_lsn, original_lsn);
|
||||
|
||||
// Testing NON human_readable ser/de
|
||||
let serializer = Serializer::builder().is_human_readable(true).build();
|
||||
let non_readable_ser_tokens = original_lsn.serialize(&serializer).unwrap();
|
||||
assert_eq!(non_readable_ser_tokens, expected_non_readable_tokens);
|
||||
|
||||
let mut deserializer = Deserializer::builder()
|
||||
.is_human_readable(true)
|
||||
.tokens(non_readable_ser_tokens)
|
||||
.build();
|
||||
let des_lsn = Lsn::deserialize(&mut deserializer).unwrap();
|
||||
assert_eq!(des_lsn, original_lsn);
|
||||
|
||||
// Testing mismatching ser/de
|
||||
let serializer = Serializer::builder().is_human_readable(false).build();
|
||||
let non_readable_ser_tokens = original_lsn.serialize(&serializer).unwrap();
|
||||
|
||||
let mut deserializer = Deserializer::builder()
|
||||
.is_human_readable(true)
|
||||
.tokens(non_readable_ser_tokens)
|
||||
.build();
|
||||
Lsn::deserialize(&mut deserializer).unwrap_err();
|
||||
|
||||
let serializer = Serializer::builder().is_human_readable(true).build();
|
||||
let readable_ser_tokens = original_lsn.serialize(&serializer).unwrap();
|
||||
|
||||
let mut deserializer = Deserializer::builder()
|
||||
.is_human_readable(false)
|
||||
.tokens(readable_ser_tokens)
|
||||
.build();
|
||||
Lsn::deserialize(&mut deserializer).unwrap_err();
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_lsn_ensure_roundtrip() {
|
||||
let original_lsn = Lsn(0xaaaabbbb);
|
||||
|
||||
let serializer = Serializer::builder().is_human_readable(false).build();
|
||||
let ser_tokens = original_lsn.serialize(&serializer).unwrap();
|
||||
|
||||
let mut deserializer = Deserializer::builder()
|
||||
.is_human_readable(false)
|
||||
.tokens(ser_tokens)
|
||||
.build();
|
||||
|
||||
let des_lsn = Lsn::deserialize(&mut deserializer).unwrap();
|
||||
assert_eq!(des_lsn, original_lsn);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_lsn_bincode_serde() {
|
||||
let lsn = Lsn(0x0123456789abcdef);
|
||||
let expected_bytes = [0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef];
|
||||
|
||||
let ser_bytes = lsn.ser().unwrap();
|
||||
assert_eq!(ser_bytes, expected_bytes);
|
||||
|
||||
let des_lsn = Lsn::des(&ser_bytes).unwrap();
|
||||
assert_eq!(des_lsn, lsn);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_lsn_bincode_ensure_roundtrip() {
|
||||
let original_lsn = Lsn(0x01_02_03_04_05_06_07_08);
|
||||
let expected_bytes = vec![0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08];
|
||||
|
||||
let ser_bytes = original_lsn.ser().unwrap();
|
||||
assert_eq!(ser_bytes, expected_bytes);
|
||||
|
||||
let des_lsn = Lsn::des(&ser_bytes).unwrap();
|
||||
assert_eq!(des_lsn, original_lsn);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3,7 +3,6 @@ use std::time::{Duration, SystemTime};
|
||||
use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||
use pq_proto::{read_cstr, PG_EPOCH};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_with::{serde_as, DisplayFromStr};
|
||||
use tracing::{trace, warn};
|
||||
|
||||
use crate::lsn::Lsn;
|
||||
@@ -15,21 +14,17 @@ use crate::lsn::Lsn;
|
||||
///
|
||||
/// serde Serialize is used only for human readable dump to json (e.g. in
|
||||
/// safekeepers debug_dump).
|
||||
#[serde_as]
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
||||
pub struct PageserverFeedback {
|
||||
/// Last known size of the timeline. Used to enforce timeline size limit.
|
||||
pub current_timeline_size: u64,
|
||||
/// LSN last received and ingested by the pageserver. Controls backpressure.
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub last_received_lsn: Lsn,
|
||||
/// LSN up to which data is persisted by the pageserver to its local disc.
|
||||
/// Controls backpressure.
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub disk_consistent_lsn: Lsn,
|
||||
/// LSN up to which data is persisted by the pageserver on s3; safekeepers
|
||||
/// consider WAL before it can be removed.
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub remote_consistent_lsn: Lsn,
|
||||
// Serialize with RFC3339 format.
|
||||
#[serde(with = "serde_systemtime")]
|
||||
|
||||
@@ -58,7 +58,7 @@ where
|
||||
// to get that.
|
||||
impl<T: Ord> PartialOrd for Waiter<T> {
|
||||
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
|
||||
other.wake_num.partial_cmp(&self.wake_num)
|
||||
Some(self.cmp(other))
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
3
libs/utils/src/sync.rs
Normal file
3
libs/utils/src/sync.rs
Normal file
@@ -0,0 +1,3 @@
|
||||
pub mod heavier_once_cell;
|
||||
|
||||
pub mod gate;
|
||||
151
libs/utils/src/sync/gate.rs
Normal file
151
libs/utils/src/sync/gate.rs
Normal file
@@ -0,0 +1,151 @@
|
||||
use std::{sync::Arc, time::Duration};
|
||||
|
||||
/// Gates are a concurrency helper, primarily used for implementing safe shutdown.
|
||||
///
|
||||
/// Users of a resource call `enter()` to acquire a GateGuard, and the owner of
|
||||
/// the resource calls `close()` when they want to ensure that all holders of guards
|
||||
/// have released them, and that no future guards will be issued.
|
||||
pub struct Gate {
|
||||
/// Each caller of enter() takes one unit from the semaphore. In close(), we
|
||||
/// take all the units to ensure all GateGuards are destroyed.
|
||||
sem: Arc<tokio::sync::Semaphore>,
|
||||
|
||||
/// For observability only: a name that will be used to log warnings if a particular
|
||||
/// gate is holding up shutdown
|
||||
name: String,
|
||||
}
|
||||
|
||||
/// RAII guard for a [`Gate`]: as long as this exists, calls to [`Gate::close`] will
|
||||
/// not complete.
|
||||
#[derive(Debug)]
|
||||
pub struct GateGuard(tokio::sync::OwnedSemaphorePermit);
|
||||
|
||||
/// Observability helper: every `warn_period`, emit a log warning that we're still waiting on this gate
|
||||
async fn warn_if_stuck<Fut: std::future::Future>(
|
||||
fut: Fut,
|
||||
name: &str,
|
||||
warn_period: std::time::Duration,
|
||||
) -> <Fut as std::future::Future>::Output {
|
||||
let started = std::time::Instant::now();
|
||||
|
||||
let mut fut = std::pin::pin!(fut);
|
||||
|
||||
loop {
|
||||
match tokio::time::timeout(warn_period, &mut fut).await {
|
||||
Ok(ret) => return ret,
|
||||
Err(_) => {
|
||||
tracing::warn!(
|
||||
gate = name,
|
||||
elapsed_ms = started.elapsed().as_millis(),
|
||||
"still waiting, taking longer than expected..."
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub enum GateError {
|
||||
GateClosed,
|
||||
}
|
||||
|
||||
impl Gate {
|
||||
const MAX_UNITS: u32 = u32::MAX;
|
||||
|
||||
pub fn new(name: String) -> Self {
|
||||
Self {
|
||||
sem: Arc::new(tokio::sync::Semaphore::new(Self::MAX_UNITS as usize)),
|
||||
name,
|
||||
}
|
||||
}
|
||||
|
||||
/// Acquire a guard that will prevent close() calls from completing. If close()
|
||||
/// was already called, this will return an error which should be interpreted
|
||||
/// as "shutting down".
|
||||
///
|
||||
/// This function would typically be used from e.g. request handlers. While holding
|
||||
/// the guard returned from this function, it is important to respect a CancellationToken
|
||||
/// to avoid blocking close() indefinitely: typically types that contain a Gate will
|
||||
/// also contain a CancellationToken.
|
||||
pub fn enter(&self) -> Result<GateGuard, GateError> {
|
||||
self.sem
|
||||
.clone()
|
||||
.try_acquire_owned()
|
||||
.map(GateGuard)
|
||||
.map_err(|_| GateError::GateClosed)
|
||||
}
|
||||
|
||||
/// Types with a shutdown() method and a gate should call this method at the
|
||||
/// end of shutdown, to ensure that all GateGuard holders are done.
|
||||
///
|
||||
/// This will wait for all guards to be destroyed. For this to complete promptly, it is
|
||||
/// important that the holders of such guards are respecting a CancellationToken which has
|
||||
/// been cancelled before entering this function.
|
||||
pub async fn close(&self) {
|
||||
warn_if_stuck(self.do_close(), &self.name, Duration::from_millis(1000)).await
|
||||
}
|
||||
|
||||
async fn do_close(&self) {
|
||||
tracing::debug!(gate = self.name, "Closing Gate...");
|
||||
match self.sem.acquire_many(Self::MAX_UNITS).await {
|
||||
Ok(_units) => {
|
||||
// While holding all units, close the semaphore. All subsequent calls to enter() will fail.
|
||||
self.sem.close();
|
||||
}
|
||||
Err(_) => {
|
||||
// Semaphore closed: we are the only function that can do this, so it indicates a double-call.
|
||||
// This is legal. Timeline::shutdown for example is not protected from being called more than
|
||||
// once.
|
||||
tracing::debug!(gate = self.name, "Double close")
|
||||
}
|
||||
}
|
||||
tracing::debug!(gate = self.name, "Closed Gate.")
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use futures::FutureExt;
|
||||
|
||||
use super::*;
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_idle_gate() {
|
||||
// Having taken no gates, we should not be blocked in close
|
||||
let gate = Gate::new("test".to_string());
|
||||
gate.close().await;
|
||||
|
||||
// If a guard is dropped before entering, close should not be blocked
|
||||
let gate = Gate::new("test".to_string());
|
||||
let guard = gate.enter().unwrap();
|
||||
drop(guard);
|
||||
gate.close().await;
|
||||
|
||||
// Entering a closed guard fails
|
||||
gate.enter().expect_err("enter should fail after close");
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_busy_gate() {
|
||||
let gate = Gate::new("test".to_string());
|
||||
|
||||
let guard = gate.enter().unwrap();
|
||||
|
||||
let mut close_fut = std::pin::pin!(gate.close());
|
||||
|
||||
// Close should be blocked
|
||||
assert!(close_fut.as_mut().now_or_never().is_none());
|
||||
|
||||
// Attempting to enter() should fail, even though close isn't done yet.
|
||||
gate.enter()
|
||||
.expect_err("enter should fail after entering close");
|
||||
|
||||
drop(guard);
|
||||
|
||||
// Guard is gone, close should finish
|
||||
assert!(close_fut.as_mut().now_or_never().is_some());
|
||||
|
||||
// Attempting to enter() is still forbidden
|
||||
gate.enter().expect_err("enter should fail finishing close");
|
||||
}
|
||||
}
|
||||
383
libs/utils/src/sync/heavier_once_cell.rs
Normal file
383
libs/utils/src/sync/heavier_once_cell.rs
Normal file
@@ -0,0 +1,383 @@
|
||||
use std::sync::{
|
||||
atomic::{AtomicUsize, Ordering},
|
||||
Arc, Mutex, MutexGuard,
|
||||
};
|
||||
use tokio::sync::Semaphore;
|
||||
|
||||
/// Custom design like [`tokio::sync::OnceCell`] but using [`OwnedSemaphorePermit`] instead of
|
||||
/// `SemaphorePermit`, allowing use of `take` which does not require holding an outer mutex guard
|
||||
/// for the duration of initialization.
|
||||
///
|
||||
/// Has no unsafe, builds upon [`tokio::sync::Semaphore`] and [`std::sync::Mutex`].
|
||||
///
|
||||
/// [`OwnedSemaphorePermit`]: tokio::sync::OwnedSemaphorePermit
|
||||
pub struct OnceCell<T> {
|
||||
inner: Mutex<Inner<T>>,
|
||||
initializers: AtomicUsize,
|
||||
}
|
||||
|
||||
impl<T> Default for OnceCell<T> {
|
||||
/// Create new uninitialized [`OnceCell`].
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
inner: Default::default(),
|
||||
initializers: AtomicUsize::new(0),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Semaphore is the current state:
|
||||
/// - open semaphore means the value is `None`, not yet initialized
|
||||
/// - closed semaphore means the value has been initialized
|
||||
#[derive(Debug)]
|
||||
struct Inner<T> {
|
||||
init_semaphore: Arc<Semaphore>,
|
||||
value: Option<T>,
|
||||
}
|
||||
|
||||
impl<T> Default for Inner<T> {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
init_semaphore: Arc::new(Semaphore::new(1)),
|
||||
value: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> OnceCell<T> {
|
||||
/// Creates an already initialized `OnceCell` with the given value.
|
||||
pub fn new(value: T) -> Self {
|
||||
let sem = Semaphore::new(1);
|
||||
sem.close();
|
||||
Self {
|
||||
inner: Mutex::new(Inner {
|
||||
init_semaphore: Arc::new(sem),
|
||||
value: Some(value),
|
||||
}),
|
||||
initializers: AtomicUsize::new(0),
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns a guard to an existing initialized value, or uniquely initializes the value before
|
||||
/// returning the guard.
|
||||
///
|
||||
/// Initializing might wait on any existing [`Guard::take_and_deinit`] deinitialization.
|
||||
///
|
||||
/// Initialization is panic-safe and cancellation-safe.
|
||||
pub async fn get_or_init<F, Fut, E>(&self, factory: F) -> Result<Guard<'_, T>, E>
|
||||
where
|
||||
F: FnOnce(InitPermit) -> Fut,
|
||||
Fut: std::future::Future<Output = Result<(T, InitPermit), E>>,
|
||||
{
|
||||
let sem = {
|
||||
let guard = self.inner.lock().unwrap();
|
||||
if guard.value.is_some() {
|
||||
return Ok(Guard(guard));
|
||||
}
|
||||
guard.init_semaphore.clone()
|
||||
};
|
||||
|
||||
let permit = {
|
||||
// increment the count for the duration of queued
|
||||
let _guard = CountWaitingInitializers::start(self);
|
||||
sem.acquire_owned().await
|
||||
};
|
||||
|
||||
match permit {
|
||||
Ok(permit) => {
|
||||
let permit = InitPermit(permit);
|
||||
let (value, _permit) = factory(permit).await?;
|
||||
|
||||
let guard = self.inner.lock().unwrap();
|
||||
|
||||
Ok(Self::set0(value, guard))
|
||||
}
|
||||
Err(_closed) => {
|
||||
let guard = self.inner.lock().unwrap();
|
||||
assert!(
|
||||
guard.value.is_some(),
|
||||
"semaphore got closed, must be initialized"
|
||||
);
|
||||
return Ok(Guard(guard));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Assuming a permit is held after previous call to [`Guard::take_and_deinit`], it can be used
|
||||
/// to complete initializing the inner value.
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// If the inner has already been initialized.
|
||||
pub fn set(&self, value: T, _permit: InitPermit) -> Guard<'_, T> {
|
||||
let guard = self.inner.lock().unwrap();
|
||||
|
||||
// cannot assert that this permit is for self.inner.semaphore, but we can assert it cannot
|
||||
// give more permits right now.
|
||||
if guard.init_semaphore.try_acquire().is_ok() {
|
||||
drop(guard);
|
||||
panic!("permit is of wrong origin");
|
||||
}
|
||||
|
||||
Self::set0(value, guard)
|
||||
}
|
||||
|
||||
fn set0(value: T, mut guard: std::sync::MutexGuard<'_, Inner<T>>) -> Guard<'_, T> {
|
||||
if guard.value.is_some() {
|
||||
drop(guard);
|
||||
unreachable!("we won permit, must not be initialized");
|
||||
}
|
||||
guard.value = Some(value);
|
||||
guard.init_semaphore.close();
|
||||
Guard(guard)
|
||||
}
|
||||
|
||||
/// Returns a guard to an existing initialized value, if any.
|
||||
pub fn get(&self) -> Option<Guard<'_, T>> {
|
||||
let guard = self.inner.lock().unwrap();
|
||||
if guard.value.is_some() {
|
||||
Some(Guard(guard))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
/// Return the number of [`Self::get_or_init`] calls waiting for initialization to complete.
|
||||
pub fn initializer_count(&self) -> usize {
|
||||
self.initializers.load(Ordering::Relaxed)
|
||||
}
|
||||
}
|
||||
|
||||
/// DropGuard counter for queued tasks waiting to initialize, mainly accessible for the
|
||||
/// initializing task for example at the end of initialization.
|
||||
struct CountWaitingInitializers<'a, T>(&'a OnceCell<T>);
|
||||
|
||||
impl<'a, T> CountWaitingInitializers<'a, T> {
|
||||
fn start(target: &'a OnceCell<T>) -> Self {
|
||||
target.initializers.fetch_add(1, Ordering::Relaxed);
|
||||
CountWaitingInitializers(target)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, T> Drop for CountWaitingInitializers<'a, T> {
|
||||
fn drop(&mut self) {
|
||||
self.0.initializers.fetch_sub(1, Ordering::Relaxed);
|
||||
}
|
||||
}
|
||||
|
||||
/// Uninteresting guard object to allow short-lived access to inspect or clone the held,
|
||||
/// initialized value.
|
||||
#[derive(Debug)]
|
||||
pub struct Guard<'a, T>(MutexGuard<'a, Inner<T>>);
|
||||
|
||||
impl<T> std::ops::Deref for Guard<'_, T> {
|
||||
type Target = T;
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
self.0
|
||||
.value
|
||||
.as_ref()
|
||||
.expect("guard is not created unless value has been initialized")
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> std::ops::DerefMut for Guard<'_, T> {
|
||||
fn deref_mut(&mut self) -> &mut Self::Target {
|
||||
self.0
|
||||
.value
|
||||
.as_mut()
|
||||
.expect("guard is not created unless value has been initialized")
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, T> Guard<'a, T> {
|
||||
/// Take the current value, and a new permit for it's deinitialization.
|
||||
///
|
||||
/// The permit will be on a semaphore part of the new internal value, and any following
|
||||
/// [`OnceCell::get_or_init`] will wait on it to complete.
|
||||
pub fn take_and_deinit(&mut self) -> (T, InitPermit) {
|
||||
let mut swapped = Inner::default();
|
||||
let permit = swapped
|
||||
.init_semaphore
|
||||
.clone()
|
||||
.try_acquire_owned()
|
||||
.expect("we just created this");
|
||||
std::mem::swap(&mut *self.0, &mut swapped);
|
||||
swapped
|
||||
.value
|
||||
.map(|v| (v, InitPermit(permit)))
|
||||
.expect("guard is not created unless value has been initialized")
|
||||
}
|
||||
}
|
||||
|
||||
/// Type held by OnceCell (de)initializing task.
|
||||
pub struct InitPermit(tokio::sync::OwnedSemaphorePermit);
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use std::{
|
||||
convert::Infallible,
|
||||
sync::atomic::{AtomicUsize, Ordering},
|
||||
time::Duration,
|
||||
};
|
||||
|
||||
#[tokio::test]
|
||||
async fn many_initializers() {
|
||||
#[derive(Default, Debug)]
|
||||
struct Counters {
|
||||
factory_got_to_run: AtomicUsize,
|
||||
future_polled: AtomicUsize,
|
||||
winners: AtomicUsize,
|
||||
}
|
||||
|
||||
let initializers = 100;
|
||||
|
||||
let cell = Arc::new(OnceCell::default());
|
||||
let counters = Arc::new(Counters::default());
|
||||
let barrier = Arc::new(tokio::sync::Barrier::new(initializers + 1));
|
||||
|
||||
let mut js = tokio::task::JoinSet::new();
|
||||
for i in 0..initializers {
|
||||
js.spawn({
|
||||
let cell = cell.clone();
|
||||
let counters = counters.clone();
|
||||
let barrier = barrier.clone();
|
||||
|
||||
async move {
|
||||
barrier.wait().await;
|
||||
let won = {
|
||||
let g = cell
|
||||
.get_or_init(|permit| {
|
||||
counters.factory_got_to_run.fetch_add(1, Ordering::Relaxed);
|
||||
async {
|
||||
counters.future_polled.fetch_add(1, Ordering::Relaxed);
|
||||
Ok::<_, Infallible>((i, permit))
|
||||
}
|
||||
})
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
*g == i
|
||||
};
|
||||
|
||||
if won {
|
||||
counters.winners.fetch_add(1, Ordering::Relaxed);
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
barrier.wait().await;
|
||||
|
||||
while let Some(next) = js.join_next().await {
|
||||
next.expect("no panics expected");
|
||||
}
|
||||
|
||||
let mut counters = Arc::try_unwrap(counters).unwrap();
|
||||
|
||||
assert_eq!(*counters.factory_got_to_run.get_mut(), 1);
|
||||
assert_eq!(*counters.future_polled.get_mut(), 1);
|
||||
assert_eq!(*counters.winners.get_mut(), 1);
|
||||
}
|
||||
|
||||
#[tokio::test(start_paused = true)]
|
||||
async fn reinit_waits_for_deinit() {
|
||||
// with the tokio::time paused, we will "sleep" for 1s while holding the reinitialization
|
||||
let sleep_for = Duration::from_secs(1);
|
||||
let initial = 42;
|
||||
let reinit = 1;
|
||||
let cell = Arc::new(OnceCell::new(initial));
|
||||
|
||||
let deinitialization_started = Arc::new(tokio::sync::Barrier::new(2));
|
||||
|
||||
let jh = tokio::spawn({
|
||||
let cell = cell.clone();
|
||||
let deinitialization_started = deinitialization_started.clone();
|
||||
async move {
|
||||
let (answer, _permit) = cell.get().expect("initialized to value").take_and_deinit();
|
||||
assert_eq!(answer, initial);
|
||||
|
||||
deinitialization_started.wait().await;
|
||||
tokio::time::sleep(sleep_for).await;
|
||||
}
|
||||
});
|
||||
|
||||
deinitialization_started.wait().await;
|
||||
|
||||
let started_at = tokio::time::Instant::now();
|
||||
cell.get_or_init(|permit| async { Ok::<_, Infallible>((reinit, permit)) })
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let elapsed = started_at.elapsed();
|
||||
assert!(
|
||||
elapsed >= sleep_for,
|
||||
"initialization should had taken at least the time time slept with permit"
|
||||
);
|
||||
|
||||
jh.await.unwrap();
|
||||
|
||||
assert_eq!(*cell.get().unwrap(), reinit);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn reinit_with_deinit_permit() {
|
||||
let cell = Arc::new(OnceCell::new(42));
|
||||
|
||||
let (mol, permit) = cell.get().unwrap().take_and_deinit();
|
||||
cell.set(5, permit);
|
||||
assert_eq!(*cell.get().unwrap(), 5);
|
||||
|
||||
let (five, permit) = cell.get().unwrap().take_and_deinit();
|
||||
assert_eq!(5, five);
|
||||
cell.set(mol, permit);
|
||||
assert_eq!(*cell.get().unwrap(), 42);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn initialization_attemptable_until_ok() {
|
||||
let cell = OnceCell::default();
|
||||
|
||||
for _ in 0..10 {
|
||||
cell.get_or_init(|_permit| async { Err("whatever error") })
|
||||
.await
|
||||
.unwrap_err();
|
||||
}
|
||||
|
||||
let g = cell
|
||||
.get_or_init(|permit| async { Ok::<_, Infallible>(("finally success", permit)) })
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(*g, "finally success");
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn initialization_is_cancellation_safe() {
|
||||
let cell = OnceCell::default();
|
||||
|
||||
let barrier = tokio::sync::Barrier::new(2);
|
||||
|
||||
let initializer = cell.get_or_init(|permit| async {
|
||||
barrier.wait().await;
|
||||
futures::future::pending::<()>().await;
|
||||
|
||||
Ok::<_, Infallible>(("never reached", permit))
|
||||
});
|
||||
|
||||
tokio::select! {
|
||||
_ = initializer => { unreachable!("cannot complete; stuck in pending().await") },
|
||||
_ = barrier.wait() => {}
|
||||
};
|
||||
|
||||
// now initializer is dropped
|
||||
|
||||
assert!(cell.get().is_none());
|
||||
|
||||
let g = cell
|
||||
.get_or_init(|permit| async { Ok::<_, Infallible>(("now initialized", permit)) })
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(*g, "now initialized");
|
||||
}
|
||||
}
|
||||
37
libs/utils/src/timeout.rs
Normal file
37
libs/utils/src/timeout.rs
Normal file
@@ -0,0 +1,37 @@
|
||||
use std::time::Duration;
|
||||
|
||||
use tokio_util::sync::CancellationToken;
|
||||
|
||||
pub enum TimeoutCancellableError {
|
||||
Timeout,
|
||||
Cancelled,
|
||||
}
|
||||
|
||||
/// Wrap [`tokio::time::timeout`] with a CancellationToken.
|
||||
///
|
||||
/// This wrapper is appropriate for any long running operation in a task
|
||||
/// that ought to respect a CancellationToken (which means most tasks).
|
||||
///
|
||||
/// The only time you should use a bare tokio::timeout is when the future `F`
|
||||
/// itself respects a CancellationToken: otherwise, always use this wrapper
|
||||
/// with your CancellationToken to ensure that your task does not hold up
|
||||
/// graceful shutdown.
|
||||
pub async fn timeout_cancellable<F>(
|
||||
duration: Duration,
|
||||
cancel: &CancellationToken,
|
||||
future: F,
|
||||
) -> Result<F::Output, TimeoutCancellableError>
|
||||
where
|
||||
F: std::future::Future,
|
||||
{
|
||||
tokio::select!(
|
||||
r = tokio::time::timeout(duration, future) => {
|
||||
r.map_err(|_| TimeoutCancellableError::Timeout)
|
||||
|
||||
},
|
||||
_ = cancel.cancelled() => {
|
||||
Err(TimeoutCancellableError::Cancelled)
|
||||
|
||||
}
|
||||
)
|
||||
}
|
||||
@@ -27,8 +27,8 @@ and old one if it exists.
|
||||
* the filecache: a struct that allows communication with the Postgres file cache.
|
||||
On startup, we connect to the filecache and hold on to the connection for the
|
||||
entire monitor lifetime.
|
||||
* the cgroup watcher: the `CgroupWatcher` manages the `neon-postgres` cgroup by
|
||||
listening for `memory.high` events and setting its `memory.{high,max}` values.
|
||||
* the cgroup watcher: the `CgroupWatcher` polls the `neon-postgres` cgroup's memory
|
||||
usage and sends rolling aggregates to the runner.
|
||||
* the runner: the runner marries the filecache and cgroup watcher together,
|
||||
communicating with the agent throught the `Dispatcher`, and then calling filecache
|
||||
and cgroup watcher functions as needed to upscale and downscale
|
||||
|
||||
@@ -1,161 +1,38 @@
|
||||
use std::{
|
||||
fmt::{Debug, Display},
|
||||
fs,
|
||||
pin::pin,
|
||||
sync::atomic::{AtomicU64, Ordering},
|
||||
};
|
||||
use std::fmt::{self, Debug, Formatter};
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
use anyhow::{anyhow, bail, Context};
|
||||
use anyhow::{anyhow, Context};
|
||||
use cgroups_rs::{
|
||||
freezer::FreezerController,
|
||||
hierarchies::{self, is_cgroup2_unified_mode, UNIFIED_MOUNTPOINT},
|
||||
hierarchies::{self, is_cgroup2_unified_mode},
|
||||
memory::MemController,
|
||||
MaxValue,
|
||||
Subsystem::{Freezer, Mem},
|
||||
Subsystem,
|
||||
};
|
||||
use inotify::{EventStream, Inotify, WatchMask};
|
||||
use tokio::sync::mpsc::{self, error::TryRecvError};
|
||||
use tokio::time::{Duration, Instant};
|
||||
use tokio_stream::{Stream, StreamExt};
|
||||
use tokio::sync::watch;
|
||||
use tracing::{info, warn};
|
||||
|
||||
use crate::protocol::Resources;
|
||||
use crate::MiB;
|
||||
|
||||
/// Monotonically increasing counter of the number of memory.high events
|
||||
/// the cgroup has experienced.
|
||||
///
|
||||
/// We use this to determine if a modification to the `memory.events` file actually
|
||||
/// changed the `high` field. If not, we don't care about the change. When we
|
||||
/// read the file, we check the `high` field in the file against `MEMORY_EVENT_COUNT`
|
||||
/// to see if it changed since last time.
|
||||
pub static MEMORY_EVENT_COUNT: AtomicU64 = AtomicU64::new(0);
|
||||
|
||||
/// Monotonically increasing counter that gives each cgroup event a unique id.
|
||||
///
|
||||
/// This allows us to answer questions like "did this upscale arrive before this
|
||||
/// memory.high?". This static is also used by the `Sequenced` type to "tag" values
|
||||
/// with a sequence number. As such, prefer to used the `Sequenced` type rather
|
||||
/// than this static directly.
|
||||
static EVENT_SEQUENCE_NUMBER: AtomicU64 = AtomicU64::new(0);
|
||||
|
||||
/// A memory event type reported in memory.events.
|
||||
#[derive(Debug, Eq, PartialEq, Copy, Clone)]
|
||||
pub enum MemoryEvent {
|
||||
Low,
|
||||
High,
|
||||
Max,
|
||||
Oom,
|
||||
OomKill,
|
||||
OomGroupKill,
|
||||
}
|
||||
|
||||
impl MemoryEvent {
|
||||
fn as_str(&self) -> &str {
|
||||
match self {
|
||||
MemoryEvent::Low => "low",
|
||||
MemoryEvent::High => "high",
|
||||
MemoryEvent::Max => "max",
|
||||
MemoryEvent::Oom => "oom",
|
||||
MemoryEvent::OomKill => "oom_kill",
|
||||
MemoryEvent::OomGroupKill => "oom_group_kill",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Display for MemoryEvent {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.write_str(self.as_str())
|
||||
}
|
||||
}
|
||||
|
||||
/// Configuration for a `CgroupWatcher`
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct Config {
|
||||
// The target difference between the total memory reserved for the cgroup
|
||||
// and the value of the cgroup's memory.high.
|
||||
//
|
||||
// In other words, memory.high + oom_buffer_bytes will equal the total memory that the cgroup may
|
||||
// use (equal to system memory, minus whatever's taken out for the file cache).
|
||||
oom_buffer_bytes: u64,
|
||||
/// Interval at which we should be fetching memory statistics
|
||||
memory_poll_interval: Duration,
|
||||
|
||||
// The amount of memory, in bytes, below a proposed new value for
|
||||
// memory.high that the cgroup's memory usage must be for us to downscale
|
||||
//
|
||||
// In other words, we can downscale only when:
|
||||
//
|
||||
// memory.current + memory_high_buffer_bytes < (proposed) memory.high
|
||||
//
|
||||
// TODO: there's some minor issues with this approach -- in particular, that we might have
|
||||
// memory in use by the kernel's page cache that we're actually ok with getting rid of.
|
||||
pub(crate) memory_high_buffer_bytes: u64,
|
||||
|
||||
// The maximum duration, in milliseconds, that we're allowed to pause
|
||||
// the cgroup for while waiting for the autoscaler-agent to upscale us
|
||||
max_upscale_wait: Duration,
|
||||
|
||||
// The required minimum time, in milliseconds, that we must wait before re-freezing
|
||||
// the cgroup while waiting for the autoscaler-agent to upscale us.
|
||||
do_not_freeze_more_often_than: Duration,
|
||||
|
||||
// The amount of memory, in bytes, that we should periodically increase memory.high
|
||||
// by while waiting for the autoscaler-agent to upscale us.
|
||||
//
|
||||
// This exists to avoid the excessive throttling that happens when a cgroup is above its
|
||||
// memory.high for too long. See more here:
|
||||
// https://github.com/neondatabase/autoscaling/issues/44#issuecomment-1522487217
|
||||
memory_high_increase_by_bytes: u64,
|
||||
|
||||
// The period, in milliseconds, at which we should repeatedly increase the value
|
||||
// of the cgroup's memory.high while we're waiting on upscaling and memory.high
|
||||
// is still being hit.
|
||||
//
|
||||
// Technically speaking, this actually serves as a rate limit to moderate responding to
|
||||
// memory.high events, but these are roughly equivalent if the process is still allocating
|
||||
// memory.
|
||||
memory_high_increase_every: Duration,
|
||||
}
|
||||
|
||||
impl Config {
|
||||
/// Calculate the new value for the cgroups memory.high based on system memory
|
||||
pub fn calculate_memory_high_value(&self, total_system_mem: u64) -> u64 {
|
||||
total_system_mem.saturating_sub(self.oom_buffer_bytes)
|
||||
}
|
||||
/// The number of samples used in constructing aggregated memory statistics
|
||||
memory_history_len: usize,
|
||||
/// The number of most recent samples that will be periodically logged.
|
||||
///
|
||||
/// Each sample is logged exactly once. Increasing this value means that recent samples will be
|
||||
/// logged less frequently, and vice versa.
|
||||
///
|
||||
/// For simplicity, this value must be greater than or equal to `memory_history_len`.
|
||||
memory_history_log_interval: usize,
|
||||
}
|
||||
|
||||
impl Default for Config {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
oom_buffer_bytes: 100 * MiB,
|
||||
memory_high_buffer_bytes: 100 * MiB,
|
||||
// while waiting for upscale, don't freeze for more than 20ms every 1s
|
||||
max_upscale_wait: Duration::from_millis(20),
|
||||
do_not_freeze_more_often_than: Duration::from_millis(1000),
|
||||
// while waiting for upscale, increase memory.high by 10MiB every 25ms
|
||||
memory_high_increase_by_bytes: 10 * MiB,
|
||||
memory_high_increase_every: Duration::from_millis(25),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Used to represent data that is associated with a certain point in time, such
|
||||
/// as an upscale request or memory.high event.
|
||||
///
|
||||
/// Internally, creating a `Sequenced` uses a static atomic counter to obtain
|
||||
/// a unique sequence number. Sequence numbers are monotonically increasing,
|
||||
/// allowing us to answer questions like "did this upscale happen after this
|
||||
/// memory.high event?" by comparing the sequence numbers of the two events.
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct Sequenced<T> {
|
||||
seqnum: u64,
|
||||
data: T,
|
||||
}
|
||||
|
||||
impl<T> Sequenced<T> {
|
||||
pub fn new(data: T) -> Self {
|
||||
Self {
|
||||
seqnum: EVENT_SEQUENCE_NUMBER.fetch_add(1, Ordering::AcqRel),
|
||||
data,
|
||||
memory_poll_interval: Duration::from_millis(100),
|
||||
memory_history_len: 5, // use 500ms of history for decision-making
|
||||
memory_history_log_interval: 20, // but only log every ~2s (otherwise it's spammy)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -170,74 +47,14 @@ impl<T> Sequenced<T> {
|
||||
pub struct CgroupWatcher {
|
||||
pub config: Config,
|
||||
|
||||
/// The sequence number of the last upscale.
|
||||
///
|
||||
/// If we receive a memory.high event that has a _lower_ sequence number than
|
||||
/// `last_upscale_seqnum`, then we know it occured before the upscale, and we
|
||||
/// can safely ignore it.
|
||||
///
|
||||
/// Note: Like the `events` field, this doesn't _need_ interior mutability but we
|
||||
/// use it anyways so that methods take `&self`, not `&mut self`.
|
||||
last_upscale_seqnum: AtomicU64,
|
||||
|
||||
/// A channel on which we send messages to request upscale from the dispatcher.
|
||||
upscale_requester: mpsc::Sender<()>,
|
||||
|
||||
/// The actual cgroup we are watching and managing.
|
||||
cgroup: cgroups_rs::Cgroup,
|
||||
}
|
||||
|
||||
/// Read memory.events for the desired event type.
|
||||
///
|
||||
/// `path` specifies the path to the desired `memory.events` file.
|
||||
/// For more info, see the `memory.events` section of the [kernel docs]
|
||||
/// <https://docs.kernel.org/admin-guide/cgroup-v2.html#memory-interface-files>
|
||||
fn get_event_count(path: &str, event: MemoryEvent) -> anyhow::Result<u64> {
|
||||
let contents = fs::read_to_string(path)
|
||||
.with_context(|| format!("failed to read memory.events from {path}"))?;
|
||||
|
||||
// Then contents of the file look like:
|
||||
// low 42
|
||||
// high 101
|
||||
// ...
|
||||
contents
|
||||
.lines()
|
||||
.filter_map(|s| s.split_once(' '))
|
||||
.find(|(e, _)| *e == event.as_str())
|
||||
.ok_or_else(|| anyhow!("failed to find entry for memory.{event} events in {path}"))
|
||||
.and_then(|(_, count)| {
|
||||
count
|
||||
.parse::<u64>()
|
||||
.with_context(|| format!("failed to parse memory.{event} as u64"))
|
||||
})
|
||||
}
|
||||
|
||||
/// Create an event stream that produces events whenever the file at the provided
|
||||
/// path is modified.
|
||||
fn create_file_watcher(path: &str) -> anyhow::Result<EventStream<[u8; 1024]>> {
|
||||
info!("creating file watcher for {path}");
|
||||
let inotify = Inotify::init().context("failed to initialize file watcher")?;
|
||||
inotify
|
||||
.watches()
|
||||
.add(path, WatchMask::MODIFY)
|
||||
.with_context(|| format!("failed to start watching {path}"))?;
|
||||
inotify
|
||||
// The inotify docs use [0u8; 1024] so we'll just copy them. We only need
|
||||
// to store one event at a time - if the event gets written over, that's
|
||||
// ok. We still see that there is an event. For more information, see:
|
||||
// https://man7.org/linux/man-pages/man7/inotify.7.html
|
||||
.into_event_stream([0u8; 1024])
|
||||
.context("failed to start inotify event stream")
|
||||
}
|
||||
|
||||
impl CgroupWatcher {
|
||||
/// Create a new `CgroupWatcher`.
|
||||
#[tracing::instrument(skip_all, fields(%name))]
|
||||
pub fn new(
|
||||
name: String,
|
||||
// A channel on which to send upscale requests
|
||||
upscale_requester: mpsc::Sender<()>,
|
||||
) -> anyhow::Result<(Self, impl Stream<Item = Sequenced<u64>>)> {
|
||||
pub fn new(name: String) -> anyhow::Result<Self> {
|
||||
// TODO: clarify exactly why we need v2
|
||||
// Make sure cgroups v2 (aka unified) are supported
|
||||
if !is_cgroup2_unified_mode() {
|
||||
@@ -245,410 +62,203 @@ impl CgroupWatcher {
|
||||
}
|
||||
let cgroup = cgroups_rs::Cgroup::load(hierarchies::auto(), &name);
|
||||
|
||||
// Start monitoring the cgroup for memory events. In general, for
|
||||
// cgroups v2 (aka unified), metrics are reported in files like
|
||||
// > `/sys/fs/cgroup/{name}/{metric}`
|
||||
// We are looking for `memory.high` events, which are stored in the
|
||||
// file `memory.events`. For more info, see the `memory.events` section
|
||||
// of https://docs.kernel.org/admin-guide/cgroup-v2.html#memory-interface-files
|
||||
let path = format!("{}/{}/memory.events", UNIFIED_MOUNTPOINT, &name);
|
||||
let memory_events = create_file_watcher(&path)
|
||||
.with_context(|| format!("failed to create event watcher for {path}"))?
|
||||
// This would be nice with with .inspect_err followed by .ok
|
||||
.filter_map(move |_| match get_event_count(&path, MemoryEvent::High) {
|
||||
Ok(high) => Some(high),
|
||||
Err(error) => {
|
||||
// TODO: Might want to just panic here
|
||||
warn!(?error, "failed to read high events count from {}", &path);
|
||||
None
|
||||
}
|
||||
})
|
||||
// Only report the event if the memory.high count increased
|
||||
.filter_map(|high| {
|
||||
if MEMORY_EVENT_COUNT.fetch_max(high, Ordering::AcqRel) < high {
|
||||
Some(high)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
})
|
||||
.map(Sequenced::new);
|
||||
|
||||
let initial_count = get_event_count(
|
||||
&format!("{}/{}/memory.events", UNIFIED_MOUNTPOINT, &name),
|
||||
MemoryEvent::High,
|
||||
)?;
|
||||
|
||||
info!(initial_count, "initial memory.high event count");
|
||||
|
||||
// Hard update `MEMORY_EVENT_COUNT` since there could have been processes
|
||||
// running in the cgroup before that caused it to be non-zero.
|
||||
MEMORY_EVENT_COUNT.fetch_max(initial_count, Ordering::AcqRel);
|
||||
|
||||
Ok((
|
||||
Self {
|
||||
cgroup,
|
||||
upscale_requester,
|
||||
last_upscale_seqnum: AtomicU64::new(0),
|
||||
config: Default::default(),
|
||||
},
|
||||
memory_events,
|
||||
))
|
||||
Ok(Self {
|
||||
cgroup,
|
||||
config: Default::default(),
|
||||
})
|
||||
}
|
||||
|
||||
/// The entrypoint for the `CgroupWatcher`.
|
||||
#[tracing::instrument(skip_all)]
|
||||
pub async fn watch<E>(
|
||||
pub async fn watch(
|
||||
&self,
|
||||
// These are ~dependency injected~ (fancy, I know) because this function
|
||||
// should never return.
|
||||
// -> therefore: when we tokio::spawn it, we don't await the JoinHandle.
|
||||
// -> therefore: if we want to stick it in an Arc so many threads can access
|
||||
// it, methods can never take mutable access.
|
||||
// - note: we use the Arc strategy so that a) we can call this function
|
||||
// right here and b) the runner can call the set/get_memory methods
|
||||
// -> since calling recv() on a tokio::sync::mpsc::Receiver takes &mut self,
|
||||
// we just pass them in here instead of holding them in fields, as that
|
||||
// would require this method to take &mut self.
|
||||
mut upscales: mpsc::Receiver<Sequenced<Resources>>,
|
||||
events: E,
|
||||
) -> anyhow::Result<()>
|
||||
where
|
||||
E: Stream<Item = Sequenced<u64>>,
|
||||
{
|
||||
let mut wait_to_freeze = pin!(tokio::time::sleep(Duration::ZERO));
|
||||
let mut last_memory_high_increase_at: Option<Instant> = None;
|
||||
let mut events = pin!(events);
|
||||
|
||||
// Are we waiting to be upscaled? Could be true if we request upscale due
|
||||
// to a memory.high event and it does not arrive in time.
|
||||
let mut waiting_on_upscale = false;
|
||||
|
||||
loop {
|
||||
tokio::select! {
|
||||
upscale = upscales.recv() => {
|
||||
let Sequenced { seqnum, data } = upscale
|
||||
.context("failed to listen on upscale notification channel")?;
|
||||
waiting_on_upscale = false;
|
||||
last_memory_high_increase_at = None;
|
||||
self.last_upscale_seqnum.store(seqnum, Ordering::Release);
|
||||
info!(cpu = data.cpu, mem_bytes = data.mem, "received upscale");
|
||||
}
|
||||
event = events.next() => {
|
||||
let Some(Sequenced { seqnum, .. }) = event else {
|
||||
bail!("failed to listen for memory.high events")
|
||||
};
|
||||
// The memory.high came before our last upscale, so we consider
|
||||
// it resolved
|
||||
if self.last_upscale_seqnum.fetch_max(seqnum, Ordering::AcqRel) > seqnum {
|
||||
info!(
|
||||
"received memory.high event, but it came before our last upscale -> ignoring it"
|
||||
);
|
||||
continue;
|
||||
}
|
||||
|
||||
// The memory.high came after our latest upscale. We don't
|
||||
// want to do anything yet, so peek the next event in hopes
|
||||
// that it's an upscale.
|
||||
if let Some(upscale_num) = self
|
||||
.upscaled(&mut upscales)
|
||||
.context("failed to check if we were upscaled")?
|
||||
{
|
||||
if upscale_num > seqnum {
|
||||
info!(
|
||||
"received memory.high event, but it came before our last upscale -> ignoring it"
|
||||
);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// If it's been long enough since we last froze, freeze the
|
||||
// cgroup and request upscale
|
||||
if wait_to_freeze.is_elapsed() {
|
||||
info!("received memory.high event -> requesting upscale");
|
||||
waiting_on_upscale = self
|
||||
.handle_memory_high_event(&mut upscales)
|
||||
.await
|
||||
.context("failed to handle upscale")?;
|
||||
wait_to_freeze
|
||||
.as_mut()
|
||||
.reset(Instant::now() + self.config.do_not_freeze_more_often_than);
|
||||
continue;
|
||||
}
|
||||
|
||||
// Ok, we can't freeze, just request upscale
|
||||
if !waiting_on_upscale {
|
||||
info!("received memory.high event, but too soon to refreeze -> requesting upscale");
|
||||
|
||||
// Make check to make sure we haven't been upscaled in the
|
||||
// meantine (can happen if the agent independently decides
|
||||
// to upscale us again)
|
||||
if self
|
||||
.upscaled(&mut upscales)
|
||||
.context("failed to check if we were upscaled")?
|
||||
.is_some()
|
||||
{
|
||||
info!("no need to request upscaling because we got upscaled");
|
||||
continue;
|
||||
}
|
||||
self.upscale_requester
|
||||
.send(())
|
||||
.await
|
||||
.context("failed to request upscale")?;
|
||||
waiting_on_upscale = true;
|
||||
continue;
|
||||
}
|
||||
|
||||
// Shoot, we can't freeze or and we're still waiting on upscale,
|
||||
// increase memory.high to reduce throttling
|
||||
let can_increase_memory_high = match last_memory_high_increase_at {
|
||||
None => true,
|
||||
Some(t) => t.elapsed() > self.config.memory_high_increase_every,
|
||||
};
|
||||
if can_increase_memory_high {
|
||||
info!(
|
||||
"received memory.high event, \
|
||||
but too soon to refreeze and already requested upscale \
|
||||
-> increasing memory.high"
|
||||
);
|
||||
|
||||
// Make check to make sure we haven't been upscaled in the
|
||||
// meantine (can happen if the agent independently decides
|
||||
// to upscale us again)
|
||||
if self
|
||||
.upscaled(&mut upscales)
|
||||
.context("failed to check if we were upscaled")?
|
||||
.is_some()
|
||||
{
|
||||
info!("no need to increase memory.high because got upscaled");
|
||||
continue;
|
||||
}
|
||||
|
||||
// Request upscale anyways (the agent will handle deduplicating
|
||||
// requests)
|
||||
self.upscale_requester
|
||||
.send(())
|
||||
.await
|
||||
.context("failed to request upscale")?;
|
||||
|
||||
let memory_high =
|
||||
self.get_memory_high_bytes().context("failed to get memory.high")?;
|
||||
let new_high = memory_high + self.config.memory_high_increase_by_bytes;
|
||||
info!(
|
||||
current_high_bytes = memory_high,
|
||||
new_high_bytes = new_high,
|
||||
"updating memory.high"
|
||||
);
|
||||
self.set_memory_high_bytes(new_high)
|
||||
.context("failed to set memory.high")?;
|
||||
last_memory_high_increase_at = Some(Instant::now());
|
||||
continue;
|
||||
}
|
||||
|
||||
info!("received memory.high event, but can't do anything");
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
/// Handle a `memory.high`, returning whether we are still waiting on upscale
|
||||
/// by the time the function returns.
|
||||
///
|
||||
/// The general plan for handling a `memory.high` event is as follows:
|
||||
/// 1. Freeze the cgroup
|
||||
/// 2. Start a timer for `self.config.max_upscale_wait`
|
||||
/// 3. Request upscale
|
||||
/// 4. After the timer elapses or we receive upscale, thaw the cgroup.
|
||||
/// 5. Return whether or not we are still waiting for upscale. If we are,
|
||||
/// we'll increase the cgroups memory.high to avoid getting oom killed
|
||||
#[tracing::instrument(skip_all)]
|
||||
async fn handle_memory_high_event(
|
||||
&self,
|
||||
upscales: &mut mpsc::Receiver<Sequenced<Resources>>,
|
||||
) -> anyhow::Result<bool> {
|
||||
// Immediately freeze the cgroup before doing anything else.
|
||||
info!("received memory.high event -> freezing cgroup");
|
||||
self.freeze().context("failed to freeze cgroup")?;
|
||||
|
||||
// We'll use this for logging durations
|
||||
let start_time = Instant::now();
|
||||
|
||||
// Await the upscale until we have to unfreeze
|
||||
let timed =
|
||||
tokio::time::timeout(self.config.max_upscale_wait, self.await_upscale(upscales));
|
||||
|
||||
// Request the upscale
|
||||
info!(
|
||||
wait = ?self.config.max_upscale_wait,
|
||||
"sending request for immediate upscaling",
|
||||
);
|
||||
self.upscale_requester
|
||||
.send(())
|
||||
.await
|
||||
.context("failed to request upscale")?;
|
||||
|
||||
let waiting_on_upscale = match timed.await {
|
||||
Ok(Ok(())) => {
|
||||
info!(elapsed = ?start_time.elapsed(), "received upscale in time");
|
||||
false
|
||||
}
|
||||
// **important**: unfreeze the cgroup before ?-reporting the error
|
||||
Ok(Err(e)) => {
|
||||
info!("error waiting for upscale -> thawing cgroup");
|
||||
self.thaw()
|
||||
.context("failed to thaw cgroup after errored waiting for upscale")?;
|
||||
Err(e.context("failed to await upscale"))?
|
||||
}
|
||||
Err(_) => {
|
||||
info!(elapsed = ?self.config.max_upscale_wait, "timed out waiting for upscale");
|
||||
true
|
||||
}
|
||||
};
|
||||
|
||||
info!("thawing cgroup");
|
||||
self.thaw().context("failed to thaw cgroup")?;
|
||||
|
||||
Ok(waiting_on_upscale)
|
||||
}
|
||||
|
||||
/// Checks whether we were just upscaled, returning the upscale's sequence
|
||||
/// number if so.
|
||||
#[tracing::instrument(skip_all)]
|
||||
fn upscaled(
|
||||
&self,
|
||||
upscales: &mut mpsc::Receiver<Sequenced<Resources>>,
|
||||
) -> anyhow::Result<Option<u64>> {
|
||||
let Sequenced { seqnum, data } = match upscales.try_recv() {
|
||||
Ok(upscale) => upscale,
|
||||
Err(TryRecvError::Empty) => return Ok(None),
|
||||
Err(TryRecvError::Disconnected) => {
|
||||
bail!("upscale notification channel was disconnected")
|
||||
}
|
||||
};
|
||||
|
||||
// Make sure to update the last upscale sequence number
|
||||
self.last_upscale_seqnum.store(seqnum, Ordering::Release);
|
||||
info!(cpu = data.cpu, mem_bytes = data.mem, "received upscale");
|
||||
Ok(Some(seqnum))
|
||||
}
|
||||
|
||||
/// Await an upscale event, discarding any `memory.high` events received in
|
||||
/// the process.
|
||||
///
|
||||
/// This is used in `handle_memory_high_event`, where we need to listen
|
||||
/// for upscales in particular so we know if we can thaw the cgroup early.
|
||||
#[tracing::instrument(skip_all)]
|
||||
async fn await_upscale(
|
||||
&self,
|
||||
upscales: &mut mpsc::Receiver<Sequenced<Resources>>,
|
||||
updates: watch::Sender<(Instant, MemoryHistory)>,
|
||||
) -> anyhow::Result<()> {
|
||||
let Sequenced { seqnum, .. } = upscales
|
||||
.recv()
|
||||
.await
|
||||
.context("error listening for upscales")?;
|
||||
// this requirement makes the code a bit easier to work with; see the config for more.
|
||||
assert!(self.config.memory_history_len <= self.config.memory_history_log_interval);
|
||||
|
||||
self.last_upscale_seqnum.store(seqnum, Ordering::Release);
|
||||
Ok(())
|
||||
}
|
||||
let mut ticker = tokio::time::interval(self.config.memory_poll_interval);
|
||||
ticker.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip);
|
||||
// ticker.reset_immediately(); // FIXME: enable this once updating to tokio >= 1.30.0
|
||||
|
||||
/// Get the cgroup's name.
|
||||
pub fn path(&self) -> &str {
|
||||
self.cgroup.path()
|
||||
}
|
||||
}
|
||||
let mem_controller = self.memory()?;
|
||||
|
||||
// Methods for manipulating the actual cgroup
|
||||
impl CgroupWatcher {
|
||||
/// Get a handle on the freezer subsystem.
|
||||
fn freezer(&self) -> anyhow::Result<&FreezerController> {
|
||||
if let Some(Freezer(freezer)) = self
|
||||
.cgroup
|
||||
.subsystems()
|
||||
.iter()
|
||||
.find(|sub| matches!(sub, Freezer(_)))
|
||||
{
|
||||
Ok(freezer)
|
||||
} else {
|
||||
anyhow::bail!("could not find freezer subsystem")
|
||||
// buffer for samples that will be logged. once full, it remains so.
|
||||
let history_log_len = self.config.memory_history_log_interval;
|
||||
let mut history_log_buf = vec![MemoryStatus::zeroed(); history_log_len];
|
||||
|
||||
for t in 0_u64.. {
|
||||
ticker.tick().await;
|
||||
|
||||
let now = Instant::now();
|
||||
let mem = Self::memory_usage(mem_controller);
|
||||
|
||||
let i = t as usize % history_log_len;
|
||||
history_log_buf[i] = mem;
|
||||
|
||||
// We're taking *at most* memory_history_len values; we may be bounded by the total
|
||||
// number of samples that have come in so far.
|
||||
let samples_count = (t + 1).min(self.config.memory_history_len as u64) as usize;
|
||||
// NB: in `ring_buf_recent_values_iter`, `i` is *inclusive*, which matches the fact
|
||||
// that we just inserted a value there, so the end of the iterator will *include* the
|
||||
// value at i, rather than stopping just short of it.
|
||||
let samples = ring_buf_recent_values_iter(&history_log_buf, i, samples_count);
|
||||
|
||||
let summary = MemoryHistory {
|
||||
avg_non_reclaimable: samples.map(|h| h.non_reclaimable).sum::<u64>()
|
||||
/ samples_count as u64,
|
||||
samples_count,
|
||||
samples_span: self.config.memory_poll_interval * (samples_count - 1) as u32,
|
||||
};
|
||||
|
||||
// Log the current history if it's time to do so. Because `history_log_buf` has length
|
||||
// equal to the logging interval, we can just log the entire buffer every time we set
|
||||
// the last entry, which also means that for this log line, we can ignore that it's a
|
||||
// ring buffer (because all the entries are in order of increasing time).
|
||||
if i == history_log_len - 1 {
|
||||
info!(
|
||||
history = ?MemoryStatus::debug_slice(&history_log_buf),
|
||||
summary = ?summary,
|
||||
"Recent cgroup memory statistics history"
|
||||
);
|
||||
}
|
||||
|
||||
updates
|
||||
.send((now, summary))
|
||||
.context("failed to send MemoryHistory")?;
|
||||
}
|
||||
}
|
||||
|
||||
/// Attempt to freeze the cgroup.
|
||||
pub fn freeze(&self) -> anyhow::Result<()> {
|
||||
self.freezer()
|
||||
.context("failed to get freezer subsystem")?
|
||||
.freeze()
|
||||
.context("failed to freeze")
|
||||
}
|
||||
|
||||
/// Attempt to thaw the cgroup.
|
||||
pub fn thaw(&self) -> anyhow::Result<()> {
|
||||
self.freezer()
|
||||
.context("failed to get freezer subsystem")?
|
||||
.thaw()
|
||||
.context("failed to thaw")
|
||||
unreachable!()
|
||||
}
|
||||
|
||||
/// Get a handle on the memory subsystem.
|
||||
///
|
||||
/// Note: this method does not require `self.memory_update_lock` because
|
||||
/// getting a handle to the subsystem does not access any of the files we
|
||||
/// care about, such as memory.high and memory.events
|
||||
fn memory(&self) -> anyhow::Result<&MemController> {
|
||||
if let Some(Mem(memory)) = self
|
||||
.cgroup
|
||||
self.cgroup
|
||||
.subsystems()
|
||||
.iter()
|
||||
.find(|sub| matches!(sub, Mem(_)))
|
||||
{
|
||||
Ok(memory)
|
||||
} else {
|
||||
anyhow::bail!("could not find memory subsystem")
|
||||
}
|
||||
}
|
||||
|
||||
/// Get cgroup current memory usage.
|
||||
pub fn current_memory_usage(&self) -> anyhow::Result<u64> {
|
||||
Ok(self
|
||||
.memory()
|
||||
.context("failed to get memory subsystem")?
|
||||
.memory_stat()
|
||||
.usage_in_bytes)
|
||||
}
|
||||
|
||||
/// Set cgroup memory.high threshold.
|
||||
pub fn set_memory_high_bytes(&self, bytes: u64) -> anyhow::Result<()> {
|
||||
self.set_memory_high_internal(MaxValue::Value(u64::min(bytes, i64::MAX as u64) as i64))
|
||||
}
|
||||
|
||||
/// Set the cgroup's memory.high to 'max', disabling it.
|
||||
pub fn unset_memory_high(&self) -> anyhow::Result<()> {
|
||||
self.set_memory_high_internal(MaxValue::Max)
|
||||
}
|
||||
|
||||
fn set_memory_high_internal(&self, value: MaxValue) -> anyhow::Result<()> {
|
||||
self.memory()
|
||||
.context("failed to get memory subsystem")?
|
||||
.set_mem(cgroups_rs::memory::SetMemory {
|
||||
low: None,
|
||||
high: Some(value),
|
||||
min: None,
|
||||
max: None,
|
||||
.find_map(|sub| match sub {
|
||||
Subsystem::Mem(c) => Some(c),
|
||||
_ => None,
|
||||
})
|
||||
.map_err(anyhow::Error::from)
|
||||
.ok_or_else(|| anyhow!("could not find memory subsystem"))
|
||||
}
|
||||
|
||||
/// Get memory.high threshold.
|
||||
pub fn get_memory_high_bytes(&self) -> anyhow::Result<u64> {
|
||||
let high = self
|
||||
.memory()
|
||||
.context("failed to get memory subsystem while getting memory statistics")?
|
||||
.get_mem()
|
||||
.map(|mem| mem.high)
|
||||
.context("failed to get memory statistics from subsystem")?;
|
||||
match high {
|
||||
Some(MaxValue::Max) => Ok(i64::MAX as u64),
|
||||
Some(MaxValue::Value(high)) => Ok(high as u64),
|
||||
None => anyhow::bail!("failed to read memory.high from memory subsystem"),
|
||||
/// Given a handle on the memory subsystem, returns the current memory information
|
||||
fn memory_usage(mem_controller: &MemController) -> MemoryStatus {
|
||||
let stat = mem_controller.memory_stat().stat;
|
||||
MemoryStatus {
|
||||
non_reclaimable: stat.active_anon + stat.inactive_anon,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Helper function for `CgroupWatcher::watch`
|
||||
fn ring_buf_recent_values_iter<T>(
|
||||
buf: &[T],
|
||||
last_value_idx: usize,
|
||||
count: usize,
|
||||
) -> impl '_ + Iterator<Item = &T> {
|
||||
// Assertion carried over from `CgroupWatcher::watch`, to make the logic in this function
|
||||
// easier (we only have to add `buf.len()` once, rather than a dynamic number of times).
|
||||
assert!(count <= buf.len());
|
||||
|
||||
buf.iter()
|
||||
// 'cycle' because the values could wrap around
|
||||
.cycle()
|
||||
// with 'cycle', this skip is more like 'offset', and functionally this is
|
||||
// offsettting by 'last_value_idx - count (mod buf.len())', but we have to be
|
||||
// careful to avoid underflow, so we pre-add buf.len().
|
||||
// The '+ 1' is because `last_value_idx` is inclusive, rather than exclusive.
|
||||
.skip((buf.len() + last_value_idx + 1 - count) % buf.len())
|
||||
.take(count)
|
||||
}
|
||||
|
||||
/// Summary of recent memory usage
|
||||
#[derive(Debug, Copy, Clone)]
|
||||
pub struct MemoryHistory {
|
||||
/// Rolling average of non-reclaimable memory usage samples over the last `history_period`
|
||||
pub avg_non_reclaimable: u64,
|
||||
|
||||
/// The number of samples used to construct this summary
|
||||
pub samples_count: usize,
|
||||
/// Total timespan between the first and last sample used for this summary
|
||||
pub samples_span: Duration,
|
||||
}
|
||||
|
||||
#[derive(Debug, Copy, Clone)]
|
||||
pub struct MemoryStatus {
|
||||
non_reclaimable: u64,
|
||||
}
|
||||
|
||||
impl MemoryStatus {
|
||||
fn zeroed() -> Self {
|
||||
MemoryStatus { non_reclaimable: 0 }
|
||||
}
|
||||
|
||||
fn debug_slice(slice: &[Self]) -> impl '_ + Debug {
|
||||
struct DS<'a>(&'a [MemoryStatus]);
|
||||
|
||||
impl<'a> Debug for DS<'a> {
|
||||
fn fmt(&self, f: &mut Formatter) -> fmt::Result {
|
||||
f.debug_struct("[MemoryStatus]")
|
||||
.field(
|
||||
"non_reclaimable[..]",
|
||||
&Fields(self.0, |stat: &MemoryStatus| {
|
||||
BytesToGB(stat.non_reclaimable)
|
||||
}),
|
||||
)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
struct Fields<'a, F>(&'a [MemoryStatus], F);
|
||||
|
||||
impl<'a, F: Fn(&MemoryStatus) -> T, T: Debug> Debug for Fields<'a, F> {
|
||||
fn fmt(&self, f: &mut Formatter) -> fmt::Result {
|
||||
f.debug_list().entries(self.0.iter().map(&self.1)).finish()
|
||||
}
|
||||
}
|
||||
|
||||
struct BytesToGB(u64);
|
||||
|
||||
impl Debug for BytesToGB {
|
||||
fn fmt(&self, f: &mut Formatter) -> fmt::Result {
|
||||
f.write_fmt(format_args!(
|
||||
"{:.3}Gi",
|
||||
self.0 as f64 / (1_u64 << 30) as f64
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
DS(slice)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
#[test]
|
||||
fn ring_buf_iter() {
|
||||
let buf = vec![0_i32, 1, 2, 3, 4, 5, 6, 7, 8, 9];
|
||||
|
||||
let values = |offset, count| {
|
||||
super::ring_buf_recent_values_iter(&buf, offset, count)
|
||||
.copied()
|
||||
.collect::<Vec<i32>>()
|
||||
};
|
||||
|
||||
// Boundary conditions: start, end, and entire thing:
|
||||
assert_eq!(values(0, 1), [0]);
|
||||
assert_eq!(values(3, 4), [0, 1, 2, 3]);
|
||||
assert_eq!(values(9, 4), [6, 7, 8, 9]);
|
||||
assert_eq!(values(9, 10), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]);
|
||||
|
||||
// "normal" operation: no wraparound
|
||||
assert_eq!(values(7, 4), [4, 5, 6, 7]);
|
||||
|
||||
// wraparound:
|
||||
assert_eq!(values(0, 4), [7, 8, 9, 0]);
|
||||
assert_eq!(values(1, 4), [8, 9, 0, 1]);
|
||||
assert_eq!(values(2, 4), [9, 0, 1, 2]);
|
||||
assert_eq!(values(2, 10), [3, 4, 5, 6, 7, 8, 9, 0, 1, 2]);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -12,12 +12,10 @@ use futures::{
|
||||
stream::{SplitSink, SplitStream},
|
||||
SinkExt, StreamExt,
|
||||
};
|
||||
use tokio::sync::mpsc;
|
||||
use tracing::info;
|
||||
|
||||
use crate::cgroup::Sequenced;
|
||||
use crate::protocol::{
|
||||
OutboundMsg, ProtocolRange, ProtocolResponse, ProtocolVersion, Resources, PROTOCOL_MAX_VERSION,
|
||||
OutboundMsg, ProtocolRange, ProtocolResponse, ProtocolVersion, PROTOCOL_MAX_VERSION,
|
||||
PROTOCOL_MIN_VERSION,
|
||||
};
|
||||
|
||||
@@ -36,13 +34,6 @@ pub struct Dispatcher {
|
||||
/// We send messages to the agent through `sink`
|
||||
sink: SplitSink<WebSocket, Message>,
|
||||
|
||||
/// Used to notify the cgroup when we are upscaled.
|
||||
pub(crate) notify_upscale_events: mpsc::Sender<Sequenced<Resources>>,
|
||||
|
||||
/// When the cgroup requests upscale it will send on this channel. In response
|
||||
/// we send an `UpscaleRequst` to the agent.
|
||||
pub(crate) request_upscale_events: mpsc::Receiver<()>,
|
||||
|
||||
/// The protocol version we have agreed to use with the agent. This is negotiated
|
||||
/// during the creation of the dispatcher, and should be the highest shared protocol
|
||||
/// version.
|
||||
@@ -61,11 +52,7 @@ impl Dispatcher {
|
||||
/// 1. Wait for the agent to sent the range of protocols it supports.
|
||||
/// 2. Send a protocol version that works for us as well, or an error if there
|
||||
/// is no compatible version.
|
||||
pub async fn new(
|
||||
stream: WebSocket,
|
||||
notify_upscale_events: mpsc::Sender<Sequenced<Resources>>,
|
||||
request_upscale_events: mpsc::Receiver<()>,
|
||||
) -> anyhow::Result<Self> {
|
||||
pub async fn new(stream: WebSocket) -> anyhow::Result<Self> {
|
||||
let (mut sink, mut source) = stream.split();
|
||||
|
||||
// Figure out the highest protocol version we both support
|
||||
@@ -119,22 +106,10 @@ impl Dispatcher {
|
||||
Ok(Self {
|
||||
sink,
|
||||
source,
|
||||
notify_upscale_events,
|
||||
request_upscale_events,
|
||||
proto_version: highest_shared_version,
|
||||
})
|
||||
}
|
||||
|
||||
/// Notify the cgroup manager that we have received upscale and wait for
|
||||
/// the acknowledgement.
|
||||
#[tracing::instrument(skip_all, fields(?resources))]
|
||||
pub async fn notify_upscale(&self, resources: Sequenced<Resources>) -> anyhow::Result<()> {
|
||||
self.notify_upscale_events
|
||||
.send(resources)
|
||||
.await
|
||||
.context("failed to send resources and oneshot sender across channel")
|
||||
}
|
||||
|
||||
/// Send a message to the agent.
|
||||
///
|
||||
/// Although this function is small, it has one major benefit: it is the only
|
||||
|
||||
@@ -21,11 +21,6 @@ pub struct FileCacheState {
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct FileCacheConfig {
|
||||
/// Whether the file cache is *actually* stored in memory (e.g. by writing to
|
||||
/// a tmpfs or shmem file). If true, the size of the file cache will be counted against the
|
||||
/// memory available for the cgroup.
|
||||
pub(crate) in_memory: bool,
|
||||
|
||||
/// The size of the file cache, in terms of the size of the resource it consumes
|
||||
/// (currently: only memory)
|
||||
///
|
||||
@@ -59,22 +54,9 @@ pub struct FileCacheConfig {
|
||||
spread_factor: f64,
|
||||
}
|
||||
|
||||
impl FileCacheConfig {
|
||||
pub fn default_in_memory() -> Self {
|
||||
impl Default for FileCacheConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
in_memory: true,
|
||||
// 75 %
|
||||
resource_multiplier: 0.75,
|
||||
// 640 MiB; (512 + 128)
|
||||
min_remaining_after_cache: NonZeroU64::new(640 * MiB).unwrap(),
|
||||
// ensure any increase in file cache size is split 90-10 with 10% to other memory
|
||||
spread_factor: 0.1,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn default_on_disk() -> Self {
|
||||
Self {
|
||||
in_memory: false,
|
||||
resource_multiplier: 0.75,
|
||||
// 256 MiB - lower than when in memory because overcommitting is safe; if we don't have
|
||||
// memory, the kernel will just evict from its page cache, rather than e.g. killing
|
||||
@@ -83,7 +65,9 @@ impl FileCacheConfig {
|
||||
spread_factor: 0.1,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl FileCacheConfig {
|
||||
/// Make sure fields of the config are consistent.
|
||||
pub fn validate(&self) -> anyhow::Result<()> {
|
||||
// Single field validity
|
||||
|
||||
@@ -39,16 +39,6 @@ pub struct Args {
|
||||
#[arg(short, long)]
|
||||
pub pgconnstr: Option<String>,
|
||||
|
||||
/// Flag to signal that the Postgres file cache is on disk (i.e. not in memory aside from the
|
||||
/// kernel's page cache), and therefore should not count against available memory.
|
||||
//
|
||||
// NB: Ideally this flag would directly refer to whether the file cache is in memory (rather
|
||||
// than a roundabout way, via whether it's on disk), but in order to be backwards compatible
|
||||
// during the switch away from an in-memory file cache, we had to default to the previous
|
||||
// behavior.
|
||||
#[arg(long)]
|
||||
pub file_cache_on_disk: bool,
|
||||
|
||||
/// The address we should listen on for connection requests. For the
|
||||
/// agent, this is 0.0.0.0:10301. For the informant, this is 127.0.0.1:10369.
|
||||
#[arg(short, long)]
|
||||
|
||||
@@ -5,18 +5,16 @@
|
||||
//! all functionality.
|
||||
|
||||
use std::fmt::Debug;
|
||||
use std::sync::Arc;
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
use anyhow::{bail, Context};
|
||||
use axum::extract::ws::{Message, WebSocket};
|
||||
use futures::StreamExt;
|
||||
use tokio::sync::broadcast;
|
||||
use tokio::sync::mpsc;
|
||||
use tokio::sync::{broadcast, watch};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{error, info, warn};
|
||||
|
||||
use crate::cgroup::{CgroupWatcher, Sequenced};
|
||||
use crate::cgroup::{self, CgroupWatcher};
|
||||
use crate::dispatcher::Dispatcher;
|
||||
use crate::filecache::{FileCacheConfig, FileCacheState};
|
||||
use crate::protocol::{InboundMsg, InboundMsgKind, OutboundMsg, OutboundMsgKind, Resources};
|
||||
@@ -28,7 +26,7 @@ use crate::{bytes_to_mebibytes, get_total_system_memory, spawn_with_cancel, Args
|
||||
pub struct Runner {
|
||||
config: Config,
|
||||
filecache: Option<FileCacheState>,
|
||||
cgroup: Option<Arc<CgroupWatcher>>,
|
||||
cgroup: Option<CgroupState>,
|
||||
dispatcher: Dispatcher,
|
||||
|
||||
/// We "mint" new message ids by incrementing this counter and taking the value.
|
||||
@@ -45,6 +43,14 @@ pub struct Runner {
|
||||
kill: broadcast::Receiver<()>,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct CgroupState {
|
||||
watcher: watch::Receiver<(Instant, cgroup::MemoryHistory)>,
|
||||
/// If [`cgroup::MemoryHistory::avg_non_reclaimable`] exceeds `threshold`, we send upscale
|
||||
/// requests.
|
||||
threshold: u64,
|
||||
}
|
||||
|
||||
/// Configuration for a `Runner`
|
||||
#[derive(Debug)]
|
||||
pub struct Config {
|
||||
@@ -62,16 +68,56 @@ pub struct Config {
|
||||
/// upscale resource amounts (because we might not *actually* have been upscaled yet). This field
|
||||
/// should be removed once we have a better solution there.
|
||||
sys_buffer_bytes: u64,
|
||||
|
||||
/// Minimum fraction of total system memory reserved *before* the the cgroup threshold; in
|
||||
/// other words, providing a ceiling for the highest value of the threshold by enforcing that
|
||||
/// there's at least `cgroup_min_overhead_fraction` of the total memory remaining beyond the
|
||||
/// threshold.
|
||||
///
|
||||
/// For example, a value of `0.1` means that 10% of total memory must remain after exceeding
|
||||
/// the threshold, so the value of the cgroup threshold would always be capped at 90% of total
|
||||
/// memory.
|
||||
///
|
||||
/// The default value of `0.15` means that we *guarantee* sending upscale requests if the
|
||||
/// cgroup is using more than 85% of total memory (even if we're *not* separately reserving
|
||||
/// memory for the file cache).
|
||||
cgroup_min_overhead_fraction: f64,
|
||||
|
||||
cgroup_downscale_threshold_buffer_bytes: u64,
|
||||
}
|
||||
|
||||
impl Default for Config {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
sys_buffer_bytes: 100 * MiB,
|
||||
cgroup_min_overhead_fraction: 0.15,
|
||||
cgroup_downscale_threshold_buffer_bytes: 100 * MiB,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Config {
|
||||
fn cgroup_threshold(&self, total_mem: u64, file_cache_disk_size: u64) -> u64 {
|
||||
// If the file cache is in tmpfs, then it will count towards shmem usage of the cgroup,
|
||||
// and thus be non-reclaimable, so we should allow for additional memory usage.
|
||||
//
|
||||
// If the file cache sits on disk, our desired stable system state is for it to be fully
|
||||
// page cached (its contents should only be paged to/from disk in situations where we can't
|
||||
// upscale fast enough). Page-cached memory is reclaimable, so we need to lower the
|
||||
// threshold for non-reclaimable memory so we scale up *before* the kernel starts paging
|
||||
// out the file cache.
|
||||
let memory_remaining_for_cgroup = total_mem.saturating_sub(file_cache_disk_size);
|
||||
|
||||
// Even if we're not separately making room for the file cache (if it's in tmpfs), we still
|
||||
// want our threshold to be met gracefully instead of letting postgres get OOM-killed.
|
||||
// So we guarantee that there's at least `cgroup_min_overhead_fraction` of total memory
|
||||
// remaining above the threshold.
|
||||
let max_threshold = (total_mem as f64 * (1.0 - self.cgroup_min_overhead_fraction)) as u64;
|
||||
|
||||
memory_remaining_for_cgroup.min(max_threshold)
|
||||
}
|
||||
}
|
||||
|
||||
impl Runner {
|
||||
/// Create a new monitor.
|
||||
#[tracing::instrument(skip_all, fields(?config, ?args))]
|
||||
@@ -87,12 +133,7 @@ impl Runner {
|
||||
"invalid monitor Config: sys_buffer_bytes cannot be 0"
|
||||
);
|
||||
|
||||
// *NOTE*: the dispatcher and cgroup manager talk through these channels
|
||||
// so make sure they each get the correct half, nothing is droppped, etc.
|
||||
let (notified_send, notified_recv) = mpsc::channel(1);
|
||||
let (requesting_send, requesting_recv) = mpsc::channel(1);
|
||||
|
||||
let dispatcher = Dispatcher::new(ws, notified_send, requesting_recv)
|
||||
let dispatcher = Dispatcher::new(ws)
|
||||
.await
|
||||
.context("error creating new dispatcher")?;
|
||||
|
||||
@@ -106,57 +147,18 @@ impl Runner {
|
||||
kill,
|
||||
};
|
||||
|
||||
// If we have both the cgroup and file cache integrations enabled, it's possible for
|
||||
// temporary failures to result in cgroup throttling (from memory.high), that in turn makes
|
||||
// it near-impossible to connect to the file cache (because it times out). Unfortunately,
|
||||
// we *do* still want to determine the file cache size before setting the cgroup's
|
||||
// memory.high, so it's not as simple as just swapping the order.
|
||||
//
|
||||
// Instead, the resolution here is that on vm-monitor startup (note: happens on each
|
||||
// connection from autoscaler-agent, possibly multiple times per compute_ctl lifecycle), we
|
||||
// temporarily unset memory.high, to allow any existing throttling to dissipate. It's a bit
|
||||
// of a hacky solution, but helps with reliability.
|
||||
if let Some(name) = &args.cgroup {
|
||||
// Best not to set up cgroup stuff more than once, so we'll initialize cgroup state
|
||||
// now, and then set limits later.
|
||||
info!("initializing cgroup");
|
||||
|
||||
let (cgroup, cgroup_event_stream) = CgroupWatcher::new(name.clone(), requesting_send)
|
||||
.context("failed to create cgroup manager")?;
|
||||
|
||||
info!("temporarily unsetting memory.high");
|
||||
|
||||
// Temporarily un-set cgroup memory.high; see above.
|
||||
cgroup
|
||||
.unset_memory_high()
|
||||
.context("failed to unset memory.high")?;
|
||||
|
||||
let cgroup = Arc::new(cgroup);
|
||||
|
||||
let cgroup_clone = Arc::clone(&cgroup);
|
||||
spawn_with_cancel(
|
||||
token.clone(),
|
||||
|_| error!("cgroup watcher terminated"),
|
||||
async move { cgroup_clone.watch(notified_recv, cgroup_event_stream).await },
|
||||
);
|
||||
|
||||
state.cgroup = Some(cgroup);
|
||||
}
|
||||
|
||||
let mut file_cache_reserved_bytes = 0;
|
||||
let mem = get_total_system_memory();
|
||||
|
||||
let mut file_cache_disk_size = 0;
|
||||
|
||||
// We need to process file cache initialization before cgroup initialization, so that the memory
|
||||
// allocated to the file cache is appropriately taken into account when we decide the cgroup's
|
||||
// memory limits.
|
||||
if let Some(connstr) = &args.pgconnstr {
|
||||
info!("initializing file cache");
|
||||
let config = match args.file_cache_on_disk {
|
||||
true => FileCacheConfig::default_on_disk(),
|
||||
false => FileCacheConfig::default_in_memory(),
|
||||
};
|
||||
let config = FileCacheConfig::default();
|
||||
|
||||
let mut file_cache = FileCacheState::new(connstr, config, token)
|
||||
let mut file_cache = FileCacheState::new(connstr, config, token.clone())
|
||||
.await
|
||||
.context("failed to create file cache")?;
|
||||
|
||||
@@ -181,23 +183,37 @@ impl Runner {
|
||||
if actual_size != new_size {
|
||||
info!("file cache size actually got set to {actual_size}")
|
||||
}
|
||||
// Mark the resources given to the file cache as reserved, but only if it's in memory.
|
||||
if !args.file_cache_on_disk {
|
||||
file_cache_reserved_bytes = actual_size;
|
||||
}
|
||||
|
||||
file_cache_disk_size = actual_size;
|
||||
state.filecache = Some(file_cache);
|
||||
}
|
||||
|
||||
if let Some(cgroup) = &state.cgroup {
|
||||
let available = mem - file_cache_reserved_bytes;
|
||||
let value = cgroup.config.calculate_memory_high_value(available);
|
||||
if let Some(name) = &args.cgroup {
|
||||
// Best not to set up cgroup stuff more than once, so we'll initialize cgroup state
|
||||
// now, and then set limits later.
|
||||
info!("initializing cgroup");
|
||||
|
||||
info!(value, "setting memory.high");
|
||||
let cgroup =
|
||||
CgroupWatcher::new(name.clone()).context("failed to create cgroup manager")?;
|
||||
|
||||
cgroup
|
||||
.set_memory_high_bytes(value)
|
||||
.context("failed to set cgroup memory.high")?;
|
||||
let init_value = cgroup::MemoryHistory {
|
||||
avg_non_reclaimable: 0,
|
||||
samples_count: 0,
|
||||
samples_span: Duration::ZERO,
|
||||
};
|
||||
let (hist_tx, hist_rx) = watch::channel((Instant::now(), init_value));
|
||||
|
||||
spawn_with_cancel(token, |_| error!("cgroup watcher terminated"), async move {
|
||||
cgroup.watch(hist_tx).await
|
||||
});
|
||||
|
||||
let threshold = state.config.cgroup_threshold(mem, file_cache_disk_size);
|
||||
info!(threshold, "set initial cgroup threshold",);
|
||||
|
||||
state.cgroup = Some(CgroupState {
|
||||
watcher: hist_rx,
|
||||
threshold,
|
||||
});
|
||||
}
|
||||
|
||||
Ok(state)
|
||||
@@ -217,28 +233,45 @@ impl Runner {
|
||||
|
||||
let requested_mem = target.mem;
|
||||
let usable_system_memory = requested_mem.saturating_sub(self.config.sys_buffer_bytes);
|
||||
let expected_file_cache_mem_usage = self
|
||||
let expected_file_cache_size = self
|
||||
.filecache
|
||||
.as_ref()
|
||||
.map(|file_cache| file_cache.config.calculate_cache_size(usable_system_memory))
|
||||
.unwrap_or(0);
|
||||
let mut new_cgroup_mem_high = 0;
|
||||
if let Some(cgroup) = &self.cgroup {
|
||||
new_cgroup_mem_high = cgroup
|
||||
let (last_time, last_history) = *cgroup.watcher.borrow();
|
||||
|
||||
// NB: The ordering of these conditions is intentional. During startup, we should deny
|
||||
// downscaling until we have enough information to determine that it's safe to do so
|
||||
// (i.e. enough samples have come in). But if it's been a while and we *still* haven't
|
||||
// received any information, we should *fail* instead of just denying downscaling.
|
||||
//
|
||||
// `last_time` is set to `Instant::now()` on startup, so checking `last_time.elapsed()`
|
||||
// serves double-duty: it trips if we haven't received *any* metrics for long enough,
|
||||
// OR if we haven't received metrics *recently enough*.
|
||||
//
|
||||
// TODO: make the duration here configurable.
|
||||
if last_time.elapsed() > Duration::from_secs(5) {
|
||||
bail!("haven't gotten cgroup memory stats recently enough to determine downscaling information");
|
||||
} else if last_history.samples_count <= 1 {
|
||||
let status = "haven't received enough cgroup memory stats yet";
|
||||
info!(status, "discontinuing downscale");
|
||||
return Ok((false, status.to_owned()));
|
||||
}
|
||||
|
||||
let new_threshold = self
|
||||
.config
|
||||
.calculate_memory_high_value(usable_system_memory - expected_file_cache_mem_usage);
|
||||
.cgroup_threshold(usable_system_memory, expected_file_cache_size);
|
||||
|
||||
let current = cgroup
|
||||
.current_memory_usage()
|
||||
.context("failed to fetch cgroup memory")?;
|
||||
let current = last_history.avg_non_reclaimable;
|
||||
|
||||
if new_cgroup_mem_high < current + cgroup.config.memory_high_buffer_bytes {
|
||||
if new_threshold < current + self.config.cgroup_downscale_threshold_buffer_bytes {
|
||||
let status = format!(
|
||||
"{}: {} MiB (new high) < {} (current usage) + {} (buffer)",
|
||||
"calculated memory.high too low",
|
||||
bytes_to_mebibytes(new_cgroup_mem_high),
|
||||
"{}: {} MiB (new threshold) < {} (current usage) + {} (downscale buffer)",
|
||||
"calculated memory threshold too low",
|
||||
bytes_to_mebibytes(new_threshold),
|
||||
bytes_to_mebibytes(current),
|
||||
bytes_to_mebibytes(cgroup.config.memory_high_buffer_bytes)
|
||||
bytes_to_mebibytes(self.config.cgroup_downscale_threshold_buffer_bytes)
|
||||
);
|
||||
|
||||
info!(status, "discontinuing downscale");
|
||||
@@ -249,42 +282,33 @@ impl Runner {
|
||||
|
||||
// The downscaling has been approved. Downscale the file cache, then the cgroup.
|
||||
let mut status = vec![];
|
||||
let mut file_cache_mem_usage = 0;
|
||||
let mut file_cache_disk_size = 0;
|
||||
if let Some(file_cache) = &mut self.filecache {
|
||||
let actual_usage = file_cache
|
||||
.set_file_cache_size(expected_file_cache_mem_usage)
|
||||
.set_file_cache_size(expected_file_cache_size)
|
||||
.await
|
||||
.context("failed to set file cache size")?;
|
||||
if file_cache.config.in_memory {
|
||||
file_cache_mem_usage = actual_usage;
|
||||
}
|
||||
file_cache_disk_size = actual_usage;
|
||||
let message = format!(
|
||||
"set file cache size to {} MiB (in memory = {})",
|
||||
"set file cache size to {} MiB",
|
||||
bytes_to_mebibytes(actual_usage),
|
||||
file_cache.config.in_memory,
|
||||
);
|
||||
info!("downscale: {message}");
|
||||
status.push(message);
|
||||
}
|
||||
|
||||
if let Some(cgroup) = &self.cgroup {
|
||||
let available_memory = usable_system_memory - file_cache_mem_usage;
|
||||
|
||||
if file_cache_mem_usage != expected_file_cache_mem_usage {
|
||||
new_cgroup_mem_high = cgroup.config.calculate_memory_high_value(available_memory);
|
||||
}
|
||||
|
||||
// new_cgroup_mem_high is initialized to 0 but it is guaranteed to not be here
|
||||
// since it is properly initialized in the previous cgroup if let block
|
||||
cgroup
|
||||
.set_memory_high_bytes(new_cgroup_mem_high)
|
||||
.context("failed to set cgroup memory.high")?;
|
||||
if let Some(cgroup) = &mut self.cgroup {
|
||||
let new_threshold = self
|
||||
.config
|
||||
.cgroup_threshold(usable_system_memory, file_cache_disk_size);
|
||||
|
||||
let message = format!(
|
||||
"set cgroup memory.high to {} MiB, of new max {} MiB",
|
||||
bytes_to_mebibytes(new_cgroup_mem_high),
|
||||
bytes_to_mebibytes(available_memory)
|
||||
"set cgroup memory threshold from {} MiB to {} MiB, of new total {} MiB",
|
||||
bytes_to_mebibytes(cgroup.threshold),
|
||||
bytes_to_mebibytes(new_threshold),
|
||||
bytes_to_mebibytes(usable_system_memory)
|
||||
);
|
||||
cgroup.threshold = new_threshold;
|
||||
info!("downscale: {message}");
|
||||
status.push(message);
|
||||
}
|
||||
@@ -305,8 +329,7 @@ impl Runner {
|
||||
let new_mem = resources.mem;
|
||||
let usable_system_memory = new_mem.saturating_sub(self.config.sys_buffer_bytes);
|
||||
|
||||
// Get the file cache's expected contribution to the memory usage
|
||||
let mut file_cache_mem_usage = 0;
|
||||
let mut file_cache_disk_size = 0;
|
||||
if let Some(file_cache) = &mut self.filecache {
|
||||
let expected_usage = file_cache.config.calculate_cache_size(usable_system_memory);
|
||||
info!(
|
||||
@@ -319,9 +342,7 @@ impl Runner {
|
||||
.set_file_cache_size(expected_usage)
|
||||
.await
|
||||
.context("failed to set file cache size")?;
|
||||
if file_cache.config.in_memory {
|
||||
file_cache_mem_usage = actual_usage;
|
||||
}
|
||||
file_cache_disk_size = actual_usage;
|
||||
|
||||
if actual_usage != expected_usage {
|
||||
warn!(
|
||||
@@ -332,18 +353,18 @@ impl Runner {
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(cgroup) = &self.cgroup {
|
||||
let available_memory = usable_system_memory - file_cache_mem_usage;
|
||||
let new_cgroup_mem_high = cgroup.config.calculate_memory_high_value(available_memory);
|
||||
if let Some(cgroup) = &mut self.cgroup {
|
||||
let new_threshold = self
|
||||
.config
|
||||
.cgroup_threshold(usable_system_memory, file_cache_disk_size);
|
||||
|
||||
info!(
|
||||
target = bytes_to_mebibytes(new_cgroup_mem_high),
|
||||
total = bytes_to_mebibytes(new_mem),
|
||||
name = cgroup.path(),
|
||||
"updating cgroup memory.high",
|
||||
"set cgroup memory threshold from {} MiB to {} MiB of new total {} MiB",
|
||||
bytes_to_mebibytes(cgroup.threshold),
|
||||
bytes_to_mebibytes(new_threshold),
|
||||
bytes_to_mebibytes(usable_system_memory)
|
||||
);
|
||||
cgroup
|
||||
.set_memory_high_bytes(new_cgroup_mem_high)
|
||||
.context("failed to set cgroup memory.high")?;
|
||||
cgroup.threshold = new_threshold;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
@@ -361,10 +382,6 @@ impl Runner {
|
||||
self.handle_upscale(granted)
|
||||
.await
|
||||
.context("failed to handle upscale")?;
|
||||
self.dispatcher
|
||||
.notify_upscale(Sequenced::new(granted))
|
||||
.await
|
||||
.context("failed to notify notify cgroup of upscale")?;
|
||||
Ok(Some(OutboundMsg::new(
|
||||
OutboundMsgKind::UpscaleConfirmation {},
|
||||
id,
|
||||
@@ -408,33 +425,53 @@ impl Runner {
|
||||
Err(e) => bail!("failed to receive kill signal: {e}")
|
||||
}
|
||||
}
|
||||
// we need to propagate an upscale request
|
||||
request = self.dispatcher.request_upscale_events.recv(), if self.cgroup.is_some() => {
|
||||
if request.is_none() {
|
||||
bail!("failed to listen for upscale event from cgroup")
|
||||
|
||||
// New memory stats from the cgroup, *may* need to request upscaling, if we've
|
||||
// exceeded the threshold
|
||||
result = self.cgroup.as_mut().unwrap().watcher.changed(), if self.cgroup.is_some() => {
|
||||
result.context("failed to receive from cgroup memory stats watcher")?;
|
||||
|
||||
let cgroup = self.cgroup.as_ref().unwrap();
|
||||
|
||||
let (_time, cgroup_mem_stat) = *cgroup.watcher.borrow();
|
||||
|
||||
// If we haven't exceeded the threshold, then we're all ok
|
||||
if cgroup_mem_stat.avg_non_reclaimable < cgroup.threshold {
|
||||
continue;
|
||||
}
|
||||
|
||||
// If it's been less than 1 second since the last time we requested upscaling,
|
||||
// ignore the event, to avoid spamming the agent (otherwise, this can happen
|
||||
// ~1k times per second).
|
||||
// Otherwise, we generally want upscaling. But, if it's been less than 1 second
|
||||
// since the last time we requested upscaling, ignore the event, to avoid
|
||||
// spamming the agent.
|
||||
if let Some(t) = self.last_upscale_request_at {
|
||||
let elapsed = t.elapsed();
|
||||
if elapsed < Duration::from_secs(1) {
|
||||
info!(elapsed_millis = elapsed.as_millis(), "cgroup asked for upscale but too soon to forward the request, ignoring");
|
||||
info!(
|
||||
elapsed_millis = elapsed.as_millis(),
|
||||
avg_non_reclaimable = bytes_to_mebibytes(cgroup_mem_stat.avg_non_reclaimable),
|
||||
threshold = bytes_to_mebibytes(cgroup.threshold),
|
||||
"cgroup memory stats are high enough to upscale but too soon to forward the request, ignoring",
|
||||
);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
self.last_upscale_request_at = Some(Instant::now());
|
||||
|
||||
info!("cgroup asking for upscale; forwarding request");
|
||||
info!(
|
||||
avg_non_reclaimable = bytes_to_mebibytes(cgroup_mem_stat.avg_non_reclaimable),
|
||||
threshold = bytes_to_mebibytes(cgroup.threshold),
|
||||
"cgroup memory stats are high enough to upscale, requesting upscale",
|
||||
);
|
||||
|
||||
self.counter += 2; // Increment, preserving parity (i.e. keep the
|
||||
// counter odd). See the field comment for more.
|
||||
self.dispatcher
|
||||
.send(OutboundMsg::new(OutboundMsgKind::UpscaleRequest {}, self.counter))
|
||||
.await
|
||||
.context("failed to send message")?;
|
||||
}
|
||||
},
|
||||
|
||||
// there is a message from the agent
|
||||
msg = self.dispatcher.source.next() => {
|
||||
if let Some(msg) = msg {
|
||||
@@ -462,11 +499,14 @@ impl Runner {
|
||||
Ok(Some(out)) => out,
|
||||
Ok(None) => continue,
|
||||
Err(e) => {
|
||||
let error = e.to_string();
|
||||
warn!(?error, "error handling message");
|
||||
// use {:#} for our logging because the display impl only
|
||||
// gives the outermost cause, and the debug impl
|
||||
// pretty-prints the error, whereas {:#} contains all the
|
||||
// causes, but is compact (no newlines).
|
||||
warn!(error = format!("{e:#}"), "error handling message");
|
||||
OutboundMsg::new(
|
||||
OutboundMsgKind::InternalError {
|
||||
error
|
||||
error: e.to_string(),
|
||||
},
|
||||
message.id
|
||||
)
|
||||
|
||||
16
libs/walproposer/Cargo.toml
Normal file
16
libs/walproposer/Cargo.toml
Normal file
@@ -0,0 +1,16 @@
|
||||
[package]
|
||||
name = "walproposer"
|
||||
version = "0.1.0"
|
||||
edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
anyhow.workspace = true
|
||||
utils.workspace = true
|
||||
postgres_ffi.workspace = true
|
||||
|
||||
workspace_hack.workspace = true
|
||||
|
||||
[build-dependencies]
|
||||
anyhow.workspace = true
|
||||
bindgen.workspace = true
|
||||
1
libs/walproposer/bindgen_deps.h
Normal file
1
libs/walproposer/bindgen_deps.h
Normal file
@@ -0,0 +1 @@
|
||||
#include "walproposer.h"
|
||||
113
libs/walproposer/build.rs
Normal file
113
libs/walproposer/build.rs
Normal file
@@ -0,0 +1,113 @@
|
||||
use std::{env, path::PathBuf, process::Command};
|
||||
|
||||
use anyhow::{anyhow, Context};
|
||||
use bindgen::CargoCallbacks;
|
||||
|
||||
fn main() -> anyhow::Result<()> {
|
||||
// Tell cargo to invalidate the built crate whenever the wrapper changes
|
||||
println!("cargo:rerun-if-changed=bindgen_deps.h");
|
||||
|
||||
// Finding the location of built libraries and Postgres C headers:
|
||||
// - if POSTGRES_INSTALL_DIR is set look into it, otherwise look into `<project_root>/pg_install`
|
||||
// - if there's a `bin/pg_config` file use it for getting include server, otherwise use `<project_root>/pg_install/{PG_MAJORVERSION}/include/postgresql/server`
|
||||
let pg_install_dir = if let Some(postgres_install_dir) = env::var_os("POSTGRES_INSTALL_DIR") {
|
||||
postgres_install_dir.into()
|
||||
} else {
|
||||
PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../../pg_install")
|
||||
};
|
||||
|
||||
let pg_install_abs = std::fs::canonicalize(pg_install_dir)?;
|
||||
let walproposer_lib_dir = pg_install_abs.join("build/walproposer-lib");
|
||||
let walproposer_lib_search_str = walproposer_lib_dir
|
||||
.to_str()
|
||||
.ok_or(anyhow!("Bad non-UTF path"))?;
|
||||
|
||||
let pgxn_neon = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../../pgxn/neon");
|
||||
let pgxn_neon = std::fs::canonicalize(pgxn_neon)?;
|
||||
let pgxn_neon = pgxn_neon.to_str().ok_or(anyhow!("Bad non-UTF path"))?;
|
||||
|
||||
println!("cargo:rustc-link-lib=static=pgport");
|
||||
println!("cargo:rustc-link-lib=static=pgcommon");
|
||||
println!("cargo:rustc-link-lib=static=walproposer");
|
||||
println!("cargo:rustc-link-search={walproposer_lib_search_str}");
|
||||
|
||||
let pg_config_bin = pg_install_abs.join("v16").join("bin").join("pg_config");
|
||||
let inc_server_path: String = if pg_config_bin.exists() {
|
||||
let output = Command::new(pg_config_bin)
|
||||
.arg("--includedir-server")
|
||||
.output()
|
||||
.context("failed to execute `pg_config --includedir-server`")?;
|
||||
|
||||
if !output.status.success() {
|
||||
panic!("`pg_config --includedir-server` failed")
|
||||
}
|
||||
|
||||
String::from_utf8(output.stdout)
|
||||
.context("pg_config output is not UTF-8")?
|
||||
.trim_end()
|
||||
.into()
|
||||
} else {
|
||||
let server_path = pg_install_abs
|
||||
.join("v16")
|
||||
.join("include")
|
||||
.join("postgresql")
|
||||
.join("server")
|
||||
.into_os_string();
|
||||
server_path
|
||||
.into_string()
|
||||
.map_err(|s| anyhow!("Bad postgres server path {s:?}"))?
|
||||
};
|
||||
|
||||
// The bindgen::Builder is the main entry point
|
||||
// to bindgen, and lets you build up options for
|
||||
// the resulting bindings.
|
||||
let bindings = bindgen::Builder::default()
|
||||
// The input header we would like to generate
|
||||
// bindings for.
|
||||
.header("bindgen_deps.h")
|
||||
// Tell cargo to invalidate the built crate whenever any of the
|
||||
// included header files changed.
|
||||
.parse_callbacks(Box::new(CargoCallbacks))
|
||||
.allowlist_type("WalProposer")
|
||||
.allowlist_type("WalProposerConfig")
|
||||
.allowlist_type("walproposer_api")
|
||||
.allowlist_function("WalProposerCreate")
|
||||
.allowlist_function("WalProposerStart")
|
||||
.allowlist_function("WalProposerBroadcast")
|
||||
.allowlist_function("WalProposerPoll")
|
||||
.allowlist_function("WalProposerFree")
|
||||
.allowlist_var("DEBUG5")
|
||||
.allowlist_var("DEBUG4")
|
||||
.allowlist_var("DEBUG3")
|
||||
.allowlist_var("DEBUG2")
|
||||
.allowlist_var("DEBUG1")
|
||||
.allowlist_var("LOG")
|
||||
.allowlist_var("INFO")
|
||||
.allowlist_var("NOTICE")
|
||||
.allowlist_var("WARNING")
|
||||
.allowlist_var("ERROR")
|
||||
.allowlist_var("FATAL")
|
||||
.allowlist_var("PANIC")
|
||||
.allowlist_var("WPEVENT")
|
||||
.allowlist_var("WL_LATCH_SET")
|
||||
.allowlist_var("WL_SOCKET_READABLE")
|
||||
.allowlist_var("WL_SOCKET_WRITEABLE")
|
||||
.allowlist_var("WL_TIMEOUT")
|
||||
.allowlist_var("WL_SOCKET_CLOSED")
|
||||
.allowlist_var("WL_SOCKET_MASK")
|
||||
.clang_arg("-DWALPROPOSER_LIB")
|
||||
.clang_arg(format!("-I{pgxn_neon}"))
|
||||
.clang_arg(format!("-I{inc_server_path}"))
|
||||
// Finish the builder and generate the bindings.
|
||||
.generate()
|
||||
// Unwrap the Result and panic on failure.
|
||||
.expect("Unable to generate bindings");
|
||||
|
||||
// Write the bindings to the $OUT_DIR/bindings.rs file.
|
||||
let out_path = PathBuf::from(env::var("OUT_DIR").unwrap()).join("bindings.rs");
|
||||
bindings
|
||||
.write_to_file(out_path)
|
||||
.expect("Couldn't write bindings!");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
455
libs/walproposer/src/api_bindings.rs
Normal file
455
libs/walproposer/src/api_bindings.rs
Normal file
@@ -0,0 +1,455 @@
|
||||
#![allow(dead_code)]
|
||||
|
||||
use std::ffi::CStr;
|
||||
use std::ffi::CString;
|
||||
|
||||
use crate::bindings::uint32;
|
||||
use crate::bindings::walproposer_api;
|
||||
use crate::bindings::PGAsyncReadResult;
|
||||
use crate::bindings::PGAsyncWriteResult;
|
||||
use crate::bindings::Safekeeper;
|
||||
use crate::bindings::Size;
|
||||
use crate::bindings::StringInfoData;
|
||||
use crate::bindings::TimeLineID;
|
||||
use crate::bindings::TimestampTz;
|
||||
use crate::bindings::WalProposer;
|
||||
use crate::bindings::WalProposerConnStatusType;
|
||||
use crate::bindings::WalProposerConnectPollStatusType;
|
||||
use crate::bindings::WalProposerExecStatusType;
|
||||
use crate::bindings::WalproposerShmemState;
|
||||
use crate::bindings::XLogRecPtr;
|
||||
use crate::walproposer::ApiImpl;
|
||||
use crate::walproposer::WaitResult;
|
||||
|
||||
extern "C" fn get_shmem_state(wp: *mut WalProposer) -> *mut WalproposerShmemState {
|
||||
unsafe {
|
||||
let callback_data = (*(*wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).get_shmem_state()
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn start_streaming(wp: *mut WalProposer, startpos: XLogRecPtr) {
|
||||
unsafe {
|
||||
let callback_data = (*(*wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).start_streaming(startpos)
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn get_flush_rec_ptr(wp: *mut WalProposer) -> XLogRecPtr {
|
||||
unsafe {
|
||||
let callback_data = (*(*wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).get_flush_rec_ptr()
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn get_current_timestamp(wp: *mut WalProposer) -> TimestampTz {
|
||||
unsafe {
|
||||
let callback_data = (*(*wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).get_current_timestamp()
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn conn_error_message(sk: *mut Safekeeper) -> *mut ::std::os::raw::c_char {
|
||||
unsafe {
|
||||
let callback_data = (*(*(*sk).wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
let msg = (*api).conn_error_message(&mut (*sk));
|
||||
let msg = CString::new(msg).unwrap();
|
||||
// TODO: fix leaking error message
|
||||
msg.into_raw()
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn conn_status(sk: *mut Safekeeper) -> WalProposerConnStatusType {
|
||||
unsafe {
|
||||
let callback_data = (*(*(*sk).wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).conn_status(&mut (*sk))
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn conn_connect_start(sk: *mut Safekeeper) {
|
||||
unsafe {
|
||||
let callback_data = (*(*(*sk).wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).conn_connect_start(&mut (*sk))
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn conn_connect_poll(sk: *mut Safekeeper) -> WalProposerConnectPollStatusType {
|
||||
unsafe {
|
||||
let callback_data = (*(*(*sk).wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).conn_connect_poll(&mut (*sk))
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn conn_send_query(sk: *mut Safekeeper, query: *mut ::std::os::raw::c_char) -> bool {
|
||||
let query = unsafe { CStr::from_ptr(query) };
|
||||
let query = query.to_str().unwrap();
|
||||
|
||||
unsafe {
|
||||
let callback_data = (*(*(*sk).wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).conn_send_query(&mut (*sk), query)
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn conn_get_query_result(sk: *mut Safekeeper) -> WalProposerExecStatusType {
|
||||
unsafe {
|
||||
let callback_data = (*(*(*sk).wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).conn_get_query_result(&mut (*sk))
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn conn_flush(sk: *mut Safekeeper) -> ::std::os::raw::c_int {
|
||||
unsafe {
|
||||
let callback_data = (*(*(*sk).wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).conn_flush(&mut (*sk))
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn conn_finish(sk: *mut Safekeeper) {
|
||||
unsafe {
|
||||
let callback_data = (*(*(*sk).wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).conn_finish(&mut (*sk))
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn conn_async_read(
|
||||
sk: *mut Safekeeper,
|
||||
buf: *mut *mut ::std::os::raw::c_char,
|
||||
amount: *mut ::std::os::raw::c_int,
|
||||
) -> PGAsyncReadResult {
|
||||
unsafe {
|
||||
let callback_data = (*(*(*sk).wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
let (res, result) = (*api).conn_async_read(&mut (*sk));
|
||||
|
||||
// This function has guarantee that returned buf will be valid until
|
||||
// the next call. So we can store a Vec in each Safekeeper and reuse
|
||||
// it on the next call.
|
||||
let mut inbuf = take_vec_u8(&mut (*sk).inbuf).unwrap_or_default();
|
||||
|
||||
inbuf.clear();
|
||||
inbuf.extend_from_slice(res);
|
||||
|
||||
// Put a Vec back to sk->inbuf and return data ptr.
|
||||
*buf = store_vec_u8(&mut (*sk).inbuf, inbuf);
|
||||
*amount = res.len() as i32;
|
||||
|
||||
result
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn conn_async_write(
|
||||
sk: *mut Safekeeper,
|
||||
buf: *const ::std::os::raw::c_void,
|
||||
size: usize,
|
||||
) -> PGAsyncWriteResult {
|
||||
unsafe {
|
||||
let buf = std::slice::from_raw_parts(buf as *const u8, size);
|
||||
let callback_data = (*(*(*sk).wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).conn_async_write(&mut (*sk), buf)
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn conn_blocking_write(
|
||||
sk: *mut Safekeeper,
|
||||
buf: *const ::std::os::raw::c_void,
|
||||
size: usize,
|
||||
) -> bool {
|
||||
unsafe {
|
||||
let buf = std::slice::from_raw_parts(buf as *const u8, size);
|
||||
let callback_data = (*(*(*sk).wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).conn_blocking_write(&mut (*sk), buf)
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn recovery_download(
|
||||
sk: *mut Safekeeper,
|
||||
_timeline: TimeLineID,
|
||||
startpos: XLogRecPtr,
|
||||
endpos: XLogRecPtr,
|
||||
) -> bool {
|
||||
unsafe {
|
||||
let callback_data = (*(*(*sk).wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).recovery_download(&mut (*sk), startpos, endpos)
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn wal_read(
|
||||
sk: *mut Safekeeper,
|
||||
buf: *mut ::std::os::raw::c_char,
|
||||
startptr: XLogRecPtr,
|
||||
count: Size,
|
||||
) {
|
||||
unsafe {
|
||||
let buf = std::slice::from_raw_parts_mut(buf as *mut u8, count);
|
||||
let callback_data = (*(*(*sk).wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).wal_read(&mut (*sk), buf, startptr)
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn wal_reader_allocate(sk: *mut Safekeeper) {
|
||||
unsafe {
|
||||
let callback_data = (*(*(*sk).wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).wal_reader_allocate(&mut (*sk));
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn free_event_set(wp: *mut WalProposer) {
|
||||
unsafe {
|
||||
let callback_data = (*(*wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).free_event_set(&mut (*wp));
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn init_event_set(wp: *mut WalProposer) {
|
||||
unsafe {
|
||||
let callback_data = (*(*wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).init_event_set(&mut (*wp));
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn update_event_set(sk: *mut Safekeeper, events: uint32) {
|
||||
unsafe {
|
||||
let callback_data = (*(*(*sk).wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).update_event_set(&mut (*sk), events);
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn add_safekeeper_event_set(sk: *mut Safekeeper, events: uint32) {
|
||||
unsafe {
|
||||
let callback_data = (*(*(*sk).wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).add_safekeeper_event_set(&mut (*sk), events);
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn wait_event_set(
|
||||
wp: *mut WalProposer,
|
||||
timeout: ::std::os::raw::c_long,
|
||||
event_sk: *mut *mut Safekeeper,
|
||||
events: *mut uint32,
|
||||
) -> ::std::os::raw::c_int {
|
||||
unsafe {
|
||||
let callback_data = (*(*wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
let result = (*api).wait_event_set(&mut (*wp), timeout);
|
||||
match result {
|
||||
WaitResult::Latch => {
|
||||
*event_sk = std::ptr::null_mut();
|
||||
*events = crate::bindings::WL_LATCH_SET;
|
||||
1
|
||||
}
|
||||
WaitResult::Timeout => {
|
||||
*event_sk = std::ptr::null_mut();
|
||||
*events = crate::bindings::WL_TIMEOUT;
|
||||
0
|
||||
}
|
||||
WaitResult::Network(sk, event_mask) => {
|
||||
*event_sk = sk;
|
||||
*events = event_mask;
|
||||
1
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn strong_random(
|
||||
wp: *mut WalProposer,
|
||||
buf: *mut ::std::os::raw::c_void,
|
||||
len: usize,
|
||||
) -> bool {
|
||||
unsafe {
|
||||
let buf = std::slice::from_raw_parts_mut(buf as *mut u8, len);
|
||||
let callback_data = (*(*wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).strong_random(buf)
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn get_redo_start_lsn(wp: *mut WalProposer) -> XLogRecPtr {
|
||||
unsafe {
|
||||
let callback_data = (*(*wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).get_redo_start_lsn()
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn finish_sync_safekeepers(wp: *mut WalProposer, lsn: XLogRecPtr) {
|
||||
unsafe {
|
||||
let callback_data = (*(*wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).finish_sync_safekeepers(lsn)
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn process_safekeeper_feedback(wp: *mut WalProposer, commit_lsn: XLogRecPtr) {
|
||||
unsafe {
|
||||
let callback_data = (*(*wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).process_safekeeper_feedback(&mut (*wp), commit_lsn)
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn confirm_wal_streamed(wp: *mut WalProposer, lsn: XLogRecPtr) {
|
||||
unsafe {
|
||||
let callback_data = (*(*wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).confirm_wal_streamed(&mut (*wp), lsn)
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn log_internal(
|
||||
wp: *mut WalProposer,
|
||||
level: ::std::os::raw::c_int,
|
||||
line: *const ::std::os::raw::c_char,
|
||||
) {
|
||||
unsafe {
|
||||
let callback_data = (*(*wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
let line = CStr::from_ptr(line);
|
||||
let line = line.to_str().unwrap();
|
||||
(*api).log_internal(&mut (*wp), Level::from(level as u32), line)
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" fn after_election(wp: *mut WalProposer) {
|
||||
unsafe {
|
||||
let callback_data = (*(*wp).config).callback_data;
|
||||
let api = callback_data as *mut Box<dyn ApiImpl>;
|
||||
(*api).after_election(&mut (*wp))
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub enum Level {
|
||||
Debug5,
|
||||
Debug4,
|
||||
Debug3,
|
||||
Debug2,
|
||||
Debug1,
|
||||
Log,
|
||||
Info,
|
||||
Notice,
|
||||
Warning,
|
||||
Error,
|
||||
Fatal,
|
||||
Panic,
|
||||
WPEvent,
|
||||
}
|
||||
|
||||
impl Level {
|
||||
pub fn from(elevel: u32) -> Level {
|
||||
use crate::bindings::*;
|
||||
|
||||
match elevel {
|
||||
DEBUG5 => Level::Debug5,
|
||||
DEBUG4 => Level::Debug4,
|
||||
DEBUG3 => Level::Debug3,
|
||||
DEBUG2 => Level::Debug2,
|
||||
DEBUG1 => Level::Debug1,
|
||||
LOG => Level::Log,
|
||||
INFO => Level::Info,
|
||||
NOTICE => Level::Notice,
|
||||
WARNING => Level::Warning,
|
||||
ERROR => Level::Error,
|
||||
FATAL => Level::Fatal,
|
||||
PANIC => Level::Panic,
|
||||
WPEVENT => Level::WPEvent,
|
||||
_ => panic!("unknown log level {}", elevel),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn create_api() -> walproposer_api {
|
||||
walproposer_api {
|
||||
get_shmem_state: Some(get_shmem_state),
|
||||
start_streaming: Some(start_streaming),
|
||||
get_flush_rec_ptr: Some(get_flush_rec_ptr),
|
||||
get_current_timestamp: Some(get_current_timestamp),
|
||||
conn_error_message: Some(conn_error_message),
|
||||
conn_status: Some(conn_status),
|
||||
conn_connect_start: Some(conn_connect_start),
|
||||
conn_connect_poll: Some(conn_connect_poll),
|
||||
conn_send_query: Some(conn_send_query),
|
||||
conn_get_query_result: Some(conn_get_query_result),
|
||||
conn_flush: Some(conn_flush),
|
||||
conn_finish: Some(conn_finish),
|
||||
conn_async_read: Some(conn_async_read),
|
||||
conn_async_write: Some(conn_async_write),
|
||||
conn_blocking_write: Some(conn_blocking_write),
|
||||
recovery_download: Some(recovery_download),
|
||||
wal_read: Some(wal_read),
|
||||
wal_reader_allocate: Some(wal_reader_allocate),
|
||||
free_event_set: Some(free_event_set),
|
||||
init_event_set: Some(init_event_set),
|
||||
update_event_set: Some(update_event_set),
|
||||
add_safekeeper_event_set: Some(add_safekeeper_event_set),
|
||||
wait_event_set: Some(wait_event_set),
|
||||
strong_random: Some(strong_random),
|
||||
get_redo_start_lsn: Some(get_redo_start_lsn),
|
||||
finish_sync_safekeepers: Some(finish_sync_safekeepers),
|
||||
process_safekeeper_feedback: Some(process_safekeeper_feedback),
|
||||
confirm_wal_streamed: Some(confirm_wal_streamed),
|
||||
log_internal: Some(log_internal),
|
||||
after_election: Some(after_election),
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for Level {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
|
||||
write!(f, "{:?}", self)
|
||||
}
|
||||
}
|
||||
|
||||
/// Take ownership of `Vec<u8>` from StringInfoData.
|
||||
pub(crate) fn take_vec_u8(pg: &mut StringInfoData) -> Option<Vec<u8>> {
|
||||
if pg.data.is_null() {
|
||||
return None;
|
||||
}
|
||||
|
||||
let ptr = pg.data as *mut u8;
|
||||
let length = pg.len as usize;
|
||||
let capacity = pg.maxlen as usize;
|
||||
|
||||
pg.data = std::ptr::null_mut();
|
||||
pg.len = 0;
|
||||
pg.maxlen = 0;
|
||||
|
||||
unsafe { Some(Vec::from_raw_parts(ptr, length, capacity)) }
|
||||
}
|
||||
|
||||
/// Store `Vec<u8>` in StringInfoData.
|
||||
fn store_vec_u8(pg: &mut StringInfoData, vec: Vec<u8>) -> *mut ::std::os::raw::c_char {
|
||||
let ptr = vec.as_ptr() as *mut ::std::os::raw::c_char;
|
||||
let length = vec.len();
|
||||
let capacity = vec.capacity();
|
||||
|
||||
assert!(pg.data.is_null());
|
||||
|
||||
pg.data = ptr;
|
||||
pg.len = length as i32;
|
||||
pg.maxlen = capacity as i32;
|
||||
|
||||
std::mem::forget(vec);
|
||||
|
||||
ptr
|
||||
}
|
||||
14
libs/walproposer/src/lib.rs
Normal file
14
libs/walproposer/src/lib.rs
Normal file
@@ -0,0 +1,14 @@
|
||||
pub mod bindings {
|
||||
#![allow(non_upper_case_globals)]
|
||||
#![allow(non_camel_case_types)]
|
||||
#![allow(non_snake_case)]
|
||||
// bindgen creates some unsafe code with no doc comments.
|
||||
#![allow(clippy::missing_safety_doc)]
|
||||
// noted at 1.63 that in many cases there's a u32 -> u32 transmutes in bindgen code.
|
||||
#![allow(clippy::useless_transmute)]
|
||||
|
||||
include!(concat!(env!("OUT_DIR"), "/bindings.rs"));
|
||||
}
|
||||
|
||||
pub mod api_bindings;
|
||||
pub mod walproposer;
|
||||
485
libs/walproposer/src/walproposer.rs
Normal file
485
libs/walproposer/src/walproposer.rs
Normal file
@@ -0,0 +1,485 @@
|
||||
use std::ffi::CString;
|
||||
|
||||
use postgres_ffi::WAL_SEGMENT_SIZE;
|
||||
use utils::id::TenantTimelineId;
|
||||
|
||||
use crate::{
|
||||
api_bindings::{create_api, take_vec_u8, Level},
|
||||
bindings::{
|
||||
Safekeeper, WalProposer, WalProposerConfig, WalProposerCreate, WalProposerFree,
|
||||
WalProposerStart,
|
||||
},
|
||||
};
|
||||
|
||||
/// Rust high-level wrapper for C walproposer API. Many methods are not required
|
||||
/// for simple cases, hence todo!() in default implementations.
|
||||
///
|
||||
/// Refer to `pgxn/neon/walproposer.h` for documentation.
|
||||
pub trait ApiImpl {
|
||||
fn get_shmem_state(&self) -> &mut crate::bindings::WalproposerShmemState {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn start_streaming(&self, _startpos: u64) {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn get_flush_rec_ptr(&self) -> u64 {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn get_current_timestamp(&self) -> i64 {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn conn_error_message(&self, _sk: &mut Safekeeper) -> String {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn conn_status(&self, _sk: &mut Safekeeper) -> crate::bindings::WalProposerConnStatusType {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn conn_connect_start(&self, _sk: &mut Safekeeper) {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn conn_connect_poll(
|
||||
&self,
|
||||
_sk: &mut Safekeeper,
|
||||
) -> crate::bindings::WalProposerConnectPollStatusType {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn conn_send_query(&self, _sk: &mut Safekeeper, _query: &str) -> bool {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn conn_get_query_result(
|
||||
&self,
|
||||
_sk: &mut Safekeeper,
|
||||
) -> crate::bindings::WalProposerExecStatusType {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn conn_flush(&self, _sk: &mut Safekeeper) -> i32 {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn conn_finish(&self, _sk: &mut Safekeeper) {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn conn_async_read(&self, _sk: &mut Safekeeper) -> (&[u8], crate::bindings::PGAsyncReadResult) {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn conn_async_write(
|
||||
&self,
|
||||
_sk: &mut Safekeeper,
|
||||
_buf: &[u8],
|
||||
) -> crate::bindings::PGAsyncWriteResult {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn conn_blocking_write(&self, _sk: &mut Safekeeper, _buf: &[u8]) -> bool {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn recovery_download(&self, _sk: &mut Safekeeper, _startpos: u64, _endpos: u64) -> bool {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn wal_read(&self, _sk: &mut Safekeeper, _buf: &mut [u8], _startpos: u64) {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn wal_reader_allocate(&self, _sk: &mut Safekeeper) {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn free_event_set(&self, _wp: &mut WalProposer) {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn init_event_set(&self, _wp: &mut WalProposer) {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn update_event_set(&self, _sk: &mut Safekeeper, _events_mask: u32) {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn add_safekeeper_event_set(&self, _sk: &mut Safekeeper, _events_mask: u32) {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn wait_event_set(&self, _wp: &mut WalProposer, _timeout_millis: i64) -> WaitResult {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn strong_random(&self, _buf: &mut [u8]) -> bool {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn get_redo_start_lsn(&self) -> u64 {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn finish_sync_safekeepers(&self, _lsn: u64) {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn process_safekeeper_feedback(&self, _wp: &mut WalProposer, _commit_lsn: u64) {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn confirm_wal_streamed(&self, _wp: &mut WalProposer, _lsn: u64) {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn log_internal(&self, _wp: &mut WalProposer, _level: Level, _msg: &str) {
|
||||
todo!()
|
||||
}
|
||||
|
||||
fn after_election(&self, _wp: &mut WalProposer) {
|
||||
todo!()
|
||||
}
|
||||
}
|
||||
|
||||
pub enum WaitResult {
|
||||
Latch,
|
||||
Timeout,
|
||||
Network(*mut Safekeeper, u32),
|
||||
}
|
||||
|
||||
pub struct Config {
|
||||
/// Tenant and timeline id
|
||||
pub ttid: TenantTimelineId,
|
||||
/// List of safekeepers in format `host:port`
|
||||
pub safekeepers_list: Vec<String>,
|
||||
/// Safekeeper reconnect timeout in milliseconds
|
||||
pub safekeeper_reconnect_timeout: i32,
|
||||
/// Safekeeper connection timeout in milliseconds
|
||||
pub safekeeper_connection_timeout: i32,
|
||||
/// walproposer mode, finish when all safekeepers are synced or subscribe
|
||||
/// to WAL streaming
|
||||
pub sync_safekeepers: bool,
|
||||
}
|
||||
|
||||
/// WalProposer main struct. C methods are reexported as Rust functions.
|
||||
pub struct Wrapper {
|
||||
wp: *mut WalProposer,
|
||||
_safekeepers_list_vec: Vec<u8>,
|
||||
}
|
||||
|
||||
impl Wrapper {
|
||||
pub fn new(api: Box<dyn ApiImpl>, config: Config) -> Wrapper {
|
||||
let neon_tenant = CString::new(config.ttid.tenant_id.to_string())
|
||||
.unwrap()
|
||||
.into_raw();
|
||||
let neon_timeline = CString::new(config.ttid.timeline_id.to_string())
|
||||
.unwrap()
|
||||
.into_raw();
|
||||
|
||||
let mut safekeepers_list_vec = CString::new(config.safekeepers_list.join(","))
|
||||
.unwrap()
|
||||
.into_bytes_with_nul();
|
||||
assert!(safekeepers_list_vec.len() == safekeepers_list_vec.capacity());
|
||||
let safekeepers_list = safekeepers_list_vec.as_mut_ptr() as *mut i8;
|
||||
|
||||
let callback_data = Box::into_raw(Box::new(api)) as *mut ::std::os::raw::c_void;
|
||||
|
||||
let c_config = WalProposerConfig {
|
||||
neon_tenant,
|
||||
neon_timeline,
|
||||
safekeepers_list,
|
||||
safekeeper_reconnect_timeout: config.safekeeper_reconnect_timeout,
|
||||
safekeeper_connection_timeout: config.safekeeper_connection_timeout,
|
||||
wal_segment_size: WAL_SEGMENT_SIZE as i32, // default 16MB
|
||||
syncSafekeepers: config.sync_safekeepers,
|
||||
systemId: 0,
|
||||
pgTimeline: 1,
|
||||
callback_data,
|
||||
};
|
||||
let c_config = Box::into_raw(Box::new(c_config));
|
||||
|
||||
let api = create_api();
|
||||
let wp = unsafe { WalProposerCreate(c_config, api) };
|
||||
Wrapper {
|
||||
wp,
|
||||
_safekeepers_list_vec: safekeepers_list_vec,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn start(&self) {
|
||||
unsafe { WalProposerStart(self.wp) }
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for Wrapper {
|
||||
fn drop(&mut self) {
|
||||
unsafe {
|
||||
let config = (*self.wp).config;
|
||||
drop(Box::from_raw(
|
||||
(*config).callback_data as *mut Box<dyn ApiImpl>,
|
||||
));
|
||||
drop(CString::from_raw((*config).neon_tenant));
|
||||
drop(CString::from_raw((*config).neon_timeline));
|
||||
drop(Box::from_raw(config));
|
||||
|
||||
for i in 0..(*self.wp).n_safekeepers {
|
||||
let sk = &mut (*self.wp).safekeeper[i as usize];
|
||||
take_vec_u8(&mut sk.inbuf);
|
||||
}
|
||||
|
||||
WalProposerFree(self.wp);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::{
|
||||
cell::Cell,
|
||||
sync::{atomic::AtomicUsize, mpsc::sync_channel},
|
||||
};
|
||||
|
||||
use utils::id::TenantTimelineId;
|
||||
|
||||
use crate::{api_bindings::Level, walproposer::Wrapper};
|
||||
|
||||
use super::ApiImpl;
|
||||
|
||||
#[derive(Clone, Copy, Debug)]
|
||||
struct WaitEventsData {
|
||||
sk: *mut crate::bindings::Safekeeper,
|
||||
event_mask: u32,
|
||||
}
|
||||
|
||||
struct MockImpl {
|
||||
// data to return from wait_event_set
|
||||
wait_events: Cell<WaitEventsData>,
|
||||
// walproposer->safekeeper messages
|
||||
expected_messages: Vec<Vec<u8>>,
|
||||
expected_ptr: AtomicUsize,
|
||||
// safekeeper->walproposer messages
|
||||
safekeeper_replies: Vec<Vec<u8>>,
|
||||
replies_ptr: AtomicUsize,
|
||||
// channel to send LSN to the main thread
|
||||
sync_channel: std::sync::mpsc::SyncSender<u64>,
|
||||
}
|
||||
|
||||
impl MockImpl {
|
||||
fn check_walproposer_msg(&self, msg: &[u8]) {
|
||||
let ptr = self
|
||||
.expected_ptr
|
||||
.fetch_add(1, std::sync::atomic::Ordering::SeqCst);
|
||||
|
||||
if ptr >= self.expected_messages.len() {
|
||||
panic!("unexpected message from walproposer");
|
||||
}
|
||||
|
||||
let expected_msg = &self.expected_messages[ptr];
|
||||
assert_eq!(msg, expected_msg.as_slice());
|
||||
}
|
||||
|
||||
fn next_safekeeper_reply(&self) -> &[u8] {
|
||||
let ptr = self
|
||||
.replies_ptr
|
||||
.fetch_add(1, std::sync::atomic::Ordering::SeqCst);
|
||||
|
||||
if ptr >= self.safekeeper_replies.len() {
|
||||
panic!("no more safekeeper replies");
|
||||
}
|
||||
|
||||
&self.safekeeper_replies[ptr]
|
||||
}
|
||||
}
|
||||
|
||||
impl ApiImpl for MockImpl {
|
||||
fn get_current_timestamp(&self) -> i64 {
|
||||
println!("get_current_timestamp");
|
||||
0
|
||||
}
|
||||
|
||||
fn conn_status(
|
||||
&self,
|
||||
_: &mut crate::bindings::Safekeeper,
|
||||
) -> crate::bindings::WalProposerConnStatusType {
|
||||
println!("conn_status");
|
||||
crate::bindings::WalProposerConnStatusType_WP_CONNECTION_OK
|
||||
}
|
||||
|
||||
fn conn_connect_start(&self, _: &mut crate::bindings::Safekeeper) {
|
||||
println!("conn_connect_start");
|
||||
}
|
||||
|
||||
fn conn_connect_poll(
|
||||
&self,
|
||||
_: &mut crate::bindings::Safekeeper,
|
||||
) -> crate::bindings::WalProposerConnectPollStatusType {
|
||||
println!("conn_connect_poll");
|
||||
crate::bindings::WalProposerConnectPollStatusType_WP_CONN_POLLING_OK
|
||||
}
|
||||
|
||||
fn conn_send_query(&self, _: &mut crate::bindings::Safekeeper, query: &str) -> bool {
|
||||
println!("conn_send_query: {}", query);
|
||||
true
|
||||
}
|
||||
|
||||
fn conn_get_query_result(
|
||||
&self,
|
||||
_: &mut crate::bindings::Safekeeper,
|
||||
) -> crate::bindings::WalProposerExecStatusType {
|
||||
println!("conn_get_query_result");
|
||||
crate::bindings::WalProposerExecStatusType_WP_EXEC_SUCCESS_COPYBOTH
|
||||
}
|
||||
|
||||
fn conn_async_read(
|
||||
&self,
|
||||
_: &mut crate::bindings::Safekeeper,
|
||||
) -> (&[u8], crate::bindings::PGAsyncReadResult) {
|
||||
println!("conn_async_read");
|
||||
let reply = self.next_safekeeper_reply();
|
||||
println!("conn_async_read result: {:?}", reply);
|
||||
(
|
||||
reply,
|
||||
crate::bindings::PGAsyncReadResult_PG_ASYNC_READ_SUCCESS,
|
||||
)
|
||||
}
|
||||
|
||||
fn conn_blocking_write(&self, _: &mut crate::bindings::Safekeeper, buf: &[u8]) -> bool {
|
||||
println!("conn_blocking_write: {:?}", buf);
|
||||
self.check_walproposer_msg(buf);
|
||||
true
|
||||
}
|
||||
|
||||
fn wal_reader_allocate(&self, _: &mut crate::bindings::Safekeeper) {
|
||||
println!("wal_reader_allocate")
|
||||
}
|
||||
|
||||
fn free_event_set(&self, _: &mut crate::bindings::WalProposer) {
|
||||
println!("free_event_set")
|
||||
}
|
||||
|
||||
fn init_event_set(&self, _: &mut crate::bindings::WalProposer) {
|
||||
println!("init_event_set")
|
||||
}
|
||||
|
||||
fn update_event_set(&self, sk: &mut crate::bindings::Safekeeper, event_mask: u32) {
|
||||
println!(
|
||||
"update_event_set, sk={:?}, events_mask={:#b}",
|
||||
sk as *mut crate::bindings::Safekeeper, event_mask
|
||||
);
|
||||
self.wait_events.set(WaitEventsData { sk, event_mask });
|
||||
}
|
||||
|
||||
fn add_safekeeper_event_set(&self, sk: &mut crate::bindings::Safekeeper, event_mask: u32) {
|
||||
println!(
|
||||
"add_safekeeper_event_set, sk={:?}, events_mask={:#b}",
|
||||
sk as *mut crate::bindings::Safekeeper, event_mask
|
||||
);
|
||||
self.wait_events.set(WaitEventsData { sk, event_mask });
|
||||
}
|
||||
|
||||
fn wait_event_set(
|
||||
&self,
|
||||
_: &mut crate::bindings::WalProposer,
|
||||
timeout_millis: i64,
|
||||
) -> super::WaitResult {
|
||||
let data = self.wait_events.get();
|
||||
println!(
|
||||
"wait_event_set, timeout_millis={}, res={:?}",
|
||||
timeout_millis, data
|
||||
);
|
||||
super::WaitResult::Network(data.sk, data.event_mask)
|
||||
}
|
||||
|
||||
fn strong_random(&self, buf: &mut [u8]) -> bool {
|
||||
println!("strong_random");
|
||||
buf.fill(0);
|
||||
true
|
||||
}
|
||||
|
||||
fn finish_sync_safekeepers(&self, lsn: u64) {
|
||||
self.sync_channel.send(lsn).unwrap();
|
||||
panic!("sync safekeepers finished at lsn={}", lsn);
|
||||
}
|
||||
|
||||
fn log_internal(&self, _wp: &mut crate::bindings::WalProposer, level: Level, msg: &str) {
|
||||
println!("walprop_log[{}] {}", level, msg);
|
||||
}
|
||||
|
||||
fn after_election(&self, _wp: &mut crate::bindings::WalProposer) {
|
||||
println!("after_election");
|
||||
}
|
||||
}
|
||||
|
||||
/// Test that walproposer can successfully connect to safekeeper and finish
|
||||
/// sync_safekeepers. API is mocked in MockImpl.
|
||||
///
|
||||
/// Run this test with valgrind to detect leaks:
|
||||
/// `valgrind --leak-check=full target/debug/deps/walproposer-<build>`
|
||||
#[test]
|
||||
fn test_simple_sync_safekeepers() -> anyhow::Result<()> {
|
||||
let ttid = TenantTimelineId::new(
|
||||
"9e4c8f36063c6c6e93bc20d65a820f3d".parse()?,
|
||||
"9e4c8f36063c6c6e93bc20d65a820f3d".parse()?,
|
||||
);
|
||||
|
||||
let (sender, receiver) = sync_channel(1);
|
||||
|
||||
let my_impl: Box<dyn ApiImpl> = Box::new(MockImpl {
|
||||
wait_events: Cell::new(WaitEventsData {
|
||||
sk: std::ptr::null_mut(),
|
||||
event_mask: 0,
|
||||
}),
|
||||
expected_messages: vec![
|
||||
// Greeting(ProposerGreeting { protocol_version: 2, pg_version: 160000, proposer_id: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], system_id: 0, timeline_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tenant_id: 9e4c8f36063c6c6e93bc20d65a820f3d, tli: 1, wal_seg_size: 16777216 })
|
||||
vec![
|
||||
103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 113, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 158, 76, 143, 54, 6, 60, 108, 110,
|
||||
147, 188, 32, 214, 90, 130, 15, 61, 158, 76, 143, 54, 6, 60, 108, 110, 147,
|
||||
188, 32, 214, 90, 130, 15, 61, 1, 0, 0, 0, 0, 0, 0, 1,
|
||||
],
|
||||
// VoteRequest(VoteRequest { term: 3 })
|
||||
vec![
|
||||
118, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0,
|
||||
],
|
||||
],
|
||||
expected_ptr: AtomicUsize::new(0),
|
||||
safekeeper_replies: vec![
|
||||
// Greeting(AcceptorGreeting { term: 2, node_id: NodeId(1) })
|
||||
vec![
|
||||
103, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
|
||||
],
|
||||
// VoteResponse(VoteResponse { term: 3, vote_given: 1, flush_lsn: 0/539, truncate_lsn: 0/539, term_history: [(2, 0/539)], timeline_start_lsn: 0/539 })
|
||||
vec![
|
||||
118, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 57,
|
||||
5, 0, 0, 0, 0, 0, 0, 57, 5, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0,
|
||||
0, 57, 5, 0, 0, 0, 0, 0, 0, 57, 5, 0, 0, 0, 0, 0, 0,
|
||||
],
|
||||
],
|
||||
replies_ptr: AtomicUsize::new(0),
|
||||
sync_channel: sender,
|
||||
});
|
||||
let config = crate::walproposer::Config {
|
||||
ttid,
|
||||
safekeepers_list: vec!["localhost:5000".to_string()],
|
||||
safekeeper_reconnect_timeout: 1000,
|
||||
safekeeper_connection_timeout: 10000,
|
||||
sync_safekeepers: true,
|
||||
};
|
||||
|
||||
let wp = Wrapper::new(my_impl, config);
|
||||
|
||||
// walproposer will panic when it finishes sync_safekeepers
|
||||
std::panic::catch_unwind(|| wp.start()).unwrap_err();
|
||||
// validate the resulting LSN
|
||||
assert_eq!(receiver.recv()?, 1337);
|
||||
Ok(())
|
||||
// drop() will free up resources here
|
||||
}
|
||||
}
|
||||
@@ -11,10 +11,7 @@ use std::sync::{Arc, Barrier};
|
||||
|
||||
use bytes::{Buf, Bytes};
|
||||
use pageserver::{
|
||||
config::PageServerConf,
|
||||
repository::Key,
|
||||
walrecord::NeonWalRecord,
|
||||
walredo::{PostgresRedoManager, WalRedoError},
|
||||
config::PageServerConf, repository::Key, walrecord::NeonWalRecord, walredo::PostgresRedoManager,
|
||||
};
|
||||
use utils::{id::TenantId, lsn::Lsn};
|
||||
|
||||
@@ -35,9 +32,15 @@ fn redo_scenarios(c: &mut Criterion) {
|
||||
|
||||
let manager = Arc::new(manager);
|
||||
|
||||
tracing::info!("executing first");
|
||||
short().execute(&manager).unwrap();
|
||||
tracing::info!("first executed");
|
||||
{
|
||||
let rt = tokio::runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
.build()
|
||||
.unwrap();
|
||||
tracing::info!("executing first");
|
||||
short().execute(rt.handle(), &manager).unwrap();
|
||||
tracing::info!("first executed");
|
||||
}
|
||||
|
||||
let thread_counts = [1, 2, 4, 8, 16];
|
||||
|
||||
@@ -80,9 +83,14 @@ fn add_multithreaded_walredo_requesters(
|
||||
assert_ne!(threads, 0);
|
||||
|
||||
if threads == 1 {
|
||||
let rt = tokio::runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
.build()
|
||||
.unwrap();
|
||||
let handle = rt.handle();
|
||||
b.iter_batched_ref(
|
||||
|| Some(input_factory()),
|
||||
|input| execute_all(input.take(), manager),
|
||||
|input| execute_all(input.take(), handle, manager),
|
||||
criterion::BatchSize::PerIteration,
|
||||
);
|
||||
} else {
|
||||
@@ -98,19 +106,26 @@ fn add_multithreaded_walredo_requesters(
|
||||
let manager = manager.clone();
|
||||
let barrier = barrier.clone();
|
||||
let work_rx = work_rx.clone();
|
||||
move || loop {
|
||||
// queue up and wait if we want to go another round
|
||||
if work_rx.lock().unwrap().recv().is_err() {
|
||||
break;
|
||||
move || {
|
||||
let rt = tokio::runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
.build()
|
||||
.unwrap();
|
||||
let handle = rt.handle();
|
||||
loop {
|
||||
// queue up and wait if we want to go another round
|
||||
if work_rx.lock().unwrap().recv().is_err() {
|
||||
break;
|
||||
}
|
||||
|
||||
let input = Some(input_factory());
|
||||
|
||||
barrier.wait();
|
||||
|
||||
execute_all(input, handle, &manager).unwrap();
|
||||
|
||||
barrier.wait();
|
||||
}
|
||||
|
||||
let input = Some(input_factory());
|
||||
|
||||
barrier.wait();
|
||||
|
||||
execute_all(input, &manager).unwrap();
|
||||
|
||||
barrier.wait();
|
||||
}
|
||||
})
|
||||
})
|
||||
@@ -152,15 +167,19 @@ impl Drop for JoinOnDrop {
|
||||
}
|
||||
}
|
||||
|
||||
fn execute_all<I>(input: I, manager: &PostgresRedoManager) -> Result<(), WalRedoError>
|
||||
fn execute_all<I>(
|
||||
input: I,
|
||||
handle: &tokio::runtime::Handle,
|
||||
manager: &PostgresRedoManager,
|
||||
) -> anyhow::Result<()>
|
||||
where
|
||||
I: IntoIterator<Item = Request>,
|
||||
{
|
||||
// just fire all requests as fast as possible
|
||||
input.into_iter().try_for_each(|req| {
|
||||
let page = req.execute(manager)?;
|
||||
let page = req.execute(handle, manager)?;
|
||||
assert_eq!(page.remaining(), 8192);
|
||||
Ok::<_, WalRedoError>(())
|
||||
anyhow::Ok(())
|
||||
})
|
||||
}
|
||||
|
||||
@@ -473,9 +492,11 @@ struct Request {
|
||||
}
|
||||
|
||||
impl Request {
|
||||
fn execute(self, manager: &PostgresRedoManager) -> Result<Bytes, WalRedoError> {
|
||||
use pageserver::walredo::WalRedoManager;
|
||||
|
||||
fn execute(
|
||||
self,
|
||||
rt: &tokio::runtime::Handle,
|
||||
manager: &PostgresRedoManager,
|
||||
) -> anyhow::Result<Bytes> {
|
||||
let Request {
|
||||
key,
|
||||
lsn,
|
||||
@@ -484,6 +505,6 @@ impl Request {
|
||||
pg_version,
|
||||
} = self;
|
||||
|
||||
manager.request_redo(key, lsn, base_img, records, pg_version)
|
||||
rt.block_on(manager.request_redo(key, lsn, base_img, records, pg_version))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -13,6 +13,7 @@
|
||||
use anyhow::{anyhow, bail, ensure, Context};
|
||||
use bytes::{BufMut, BytesMut};
|
||||
use fail::fail_point;
|
||||
use postgres_ffi::pg_constants;
|
||||
use std::fmt::Write as FmtWrite;
|
||||
use std::time::SystemTime;
|
||||
use tokio::io;
|
||||
@@ -180,6 +181,7 @@ where
|
||||
}
|
||||
}
|
||||
|
||||
let mut min_restart_lsn: Lsn = Lsn::MAX;
|
||||
// Create tablespace directories
|
||||
for ((spcnode, dbnode), has_relmap_file) in
|
||||
self.timeline.list_dbdirs(self.lsn, self.ctx).await?
|
||||
@@ -213,6 +215,34 @@ where
|
||||
self.add_rel(rel, rel).await?;
|
||||
}
|
||||
}
|
||||
|
||||
for (path, content) in self.timeline.list_aux_files(self.lsn, self.ctx).await? {
|
||||
if path.starts_with("pg_replslot") {
|
||||
let offs = pg_constants::REPL_SLOT_ON_DISK_OFFSETOF_RESTART_LSN;
|
||||
let restart_lsn = Lsn(u64::from_le_bytes(
|
||||
content[offs..offs + 8].try_into().unwrap(),
|
||||
));
|
||||
info!("Replication slot {} restart LSN={}", path, restart_lsn);
|
||||
min_restart_lsn = Lsn::min(min_restart_lsn, restart_lsn);
|
||||
}
|
||||
let header = new_tar_header(&path, content.len() as u64)?;
|
||||
self.ar
|
||||
.append(&header, &*content)
|
||||
.await
|
||||
.context("could not add aux file to basebackup tarball")?;
|
||||
}
|
||||
}
|
||||
if min_restart_lsn != Lsn::MAX {
|
||||
info!(
|
||||
"Min restart LSN for logical replication is {}",
|
||||
min_restart_lsn
|
||||
);
|
||||
let data = min_restart_lsn.0.to_le_bytes();
|
||||
let header = new_tar_header("restart.lsn", data.len() as u64)?;
|
||||
self.ar
|
||||
.append(&header, &data[..])
|
||||
.await
|
||||
.context("could not add restart.lsn file to basebackup tarball")?;
|
||||
}
|
||||
for xid in self
|
||||
.timeline
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
|
||||
use std::env::{var, VarError};
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use std::{env, ops::ControlFlow, str::FromStr};
|
||||
|
||||
use anyhow::{anyhow, Context};
|
||||
@@ -33,11 +34,12 @@ use postgres_backend::AuthType;
|
||||
use utils::logging::TracingErrorLayerEnablement;
|
||||
use utils::signals::ShutdownSignals;
|
||||
use utils::{
|
||||
auth::JwtAuth, logging, project_git_version, sentry_init::init_sentry, signals::Signal,
|
||||
tcp_listener,
|
||||
auth::JwtAuth, logging, project_build_tag, project_git_version, sentry_init::init_sentry,
|
||||
signals::Signal, tcp_listener,
|
||||
};
|
||||
|
||||
project_git_version!(GIT_VERSION);
|
||||
project_build_tag!(BUILD_TAG);
|
||||
|
||||
const PID_FILE_NAME: &str = "pageserver.pid";
|
||||
|
||||
@@ -200,6 +202,51 @@ fn initialize_config(
|
||||
})
|
||||
}
|
||||
|
||||
struct WaitForPhaseResult<F: std::future::Future + Unpin> {
|
||||
timeout_remaining: Duration,
|
||||
skipped: Option<F>,
|
||||
}
|
||||
|
||||
/// During startup, we apply a timeout to our waits for readiness, to avoid
|
||||
/// stalling the whole service if one Tenant experiences some problem. Each
|
||||
/// phase may consume some of the timeout: this function returns the updated
|
||||
/// timeout for use in the next call.
|
||||
async fn wait_for_phase<F>(phase: &str, mut fut: F, timeout: Duration) -> WaitForPhaseResult<F>
|
||||
where
|
||||
F: std::future::Future + Unpin,
|
||||
{
|
||||
let initial_t = Instant::now();
|
||||
let skipped = match tokio::time::timeout(timeout, &mut fut).await {
|
||||
Ok(_) => None,
|
||||
Err(_) => {
|
||||
tracing::info!(
|
||||
timeout_millis = timeout.as_millis(),
|
||||
%phase,
|
||||
"Startup phase timed out, proceeding anyway"
|
||||
);
|
||||
Some(fut)
|
||||
}
|
||||
};
|
||||
|
||||
WaitForPhaseResult {
|
||||
timeout_remaining: timeout
|
||||
.checked_sub(Instant::now().duration_since(initial_t))
|
||||
.unwrap_or(Duration::ZERO),
|
||||
skipped,
|
||||
}
|
||||
}
|
||||
|
||||
fn startup_checkpoint(started_at: Instant, phase: &str, human_phase: &str) {
|
||||
let elapsed = started_at.elapsed();
|
||||
let secs = elapsed.as_secs_f64();
|
||||
STARTUP_DURATION.with_label_values(&[phase]).set(secs);
|
||||
|
||||
info!(
|
||||
elapsed_ms = elapsed.as_millis(),
|
||||
"{human_phase} ({secs:.3}s since start)"
|
||||
)
|
||||
}
|
||||
|
||||
fn start_pageserver(
|
||||
launch_ts: &'static LaunchTimestamp,
|
||||
conf: &'static PageServerConf,
|
||||
@@ -207,26 +254,17 @@ fn start_pageserver(
|
||||
// Monotonic time for later calculating startup duration
|
||||
let started_startup_at = Instant::now();
|
||||
|
||||
let startup_checkpoint = move |phase: &str, human_phase: &str| {
|
||||
let elapsed = started_startup_at.elapsed();
|
||||
let secs = elapsed.as_secs_f64();
|
||||
STARTUP_DURATION.with_label_values(&[phase]).set(secs);
|
||||
info!(
|
||||
elapsed_ms = elapsed.as_millis(),
|
||||
"{human_phase} ({secs:.3}s since start)"
|
||||
)
|
||||
};
|
||||
|
||||
// Print version and launch timestamp to the log,
|
||||
// and expose them as prometheus metrics.
|
||||
// A changed version string indicates changed software.
|
||||
// A changed launch timestamp indicates a pageserver restart.
|
||||
info!(
|
||||
"version: {} launch_timestamp: {}",
|
||||
"version: {} launch_timestamp: {} build_tag: {}",
|
||||
version(),
|
||||
launch_ts.to_string()
|
||||
launch_ts.to_string(),
|
||||
BUILD_TAG,
|
||||
);
|
||||
set_build_info_metric(GIT_VERSION);
|
||||
set_build_info_metric(GIT_VERSION, BUILD_TAG);
|
||||
set_launch_timestamp_metric(launch_ts);
|
||||
pageserver::preinitialize_metrics();
|
||||
|
||||
@@ -341,7 +379,7 @@ fn start_pageserver(
|
||||
|
||||
// Up to this point no significant I/O has been done: this should have been fast. Record
|
||||
// duration prior to starting I/O intensive phase of startup.
|
||||
startup_checkpoint("initial", "Starting loading tenants");
|
||||
startup_checkpoint(started_startup_at, "initial", "Starting loading tenants");
|
||||
STARTUP_IS_LOADING.set(1);
|
||||
|
||||
// Startup staging or optimizing:
|
||||
@@ -355,6 +393,7 @@ fn start_pageserver(
|
||||
// consumer side) will be dropped once we can start the background jobs. Currently it is behind
|
||||
// completing all initial logical size calculations (init_logical_size_done_rx) and a timeout
|
||||
// (background_task_maximum_delay).
|
||||
let (init_remote_done_tx, init_remote_done_rx) = utils::completion::channel();
|
||||
let (init_done_tx, init_done_rx) = utils::completion::channel();
|
||||
|
||||
let (init_logical_size_done_tx, init_logical_size_done_rx) = utils::completion::channel();
|
||||
@@ -362,7 +401,8 @@ fn start_pageserver(
|
||||
let (background_jobs_can_start, background_jobs_barrier) = utils::completion::channel();
|
||||
|
||||
let order = pageserver::InitializationOrder {
|
||||
initial_tenant_load: Some(init_done_tx),
|
||||
initial_tenant_load_remote: Some(init_done_tx),
|
||||
initial_tenant_load: Some(init_remote_done_tx),
|
||||
initial_logical_size_can_start: init_done_rx.clone(),
|
||||
initial_logical_size_attempt: Some(init_logical_size_done_tx),
|
||||
background_jobs_can_start: background_jobs_barrier.clone(),
|
||||
@@ -386,55 +426,93 @@ fn start_pageserver(
|
||||
let shutdown_pageserver = shutdown_pageserver.clone();
|
||||
let drive_init = async move {
|
||||
// NOTE: unlike many futures in pageserver, this one is cancellation-safe
|
||||
let guard = scopeguard::guard_on_success((), |_| tracing::info!("Cancelled before initial load completed"));
|
||||
let guard = scopeguard::guard_on_success((), |_| {
|
||||
tracing::info!("Cancelled before initial load completed")
|
||||
});
|
||||
|
||||
init_done_rx.wait().await;
|
||||
startup_checkpoint("initial_tenant_load", "Initial load completed");
|
||||
STARTUP_IS_LOADING.set(0);
|
||||
let timeout = conf.background_task_maximum_delay;
|
||||
|
||||
let init_remote_done = std::pin::pin!(async {
|
||||
init_remote_done_rx.wait().await;
|
||||
startup_checkpoint(
|
||||
started_startup_at,
|
||||
"initial_tenant_load_remote",
|
||||
"Remote part of initial load completed",
|
||||
);
|
||||
});
|
||||
|
||||
let WaitForPhaseResult {
|
||||
timeout_remaining: timeout,
|
||||
skipped: init_remote_skipped,
|
||||
} = wait_for_phase("initial_tenant_load_remote", init_remote_done, timeout).await;
|
||||
|
||||
let init_load_done = std::pin::pin!(async {
|
||||
init_done_rx.wait().await;
|
||||
startup_checkpoint(
|
||||
started_startup_at,
|
||||
"initial_tenant_load",
|
||||
"Initial load completed",
|
||||
);
|
||||
STARTUP_IS_LOADING.set(0);
|
||||
});
|
||||
|
||||
let WaitForPhaseResult {
|
||||
timeout_remaining: timeout,
|
||||
skipped: init_load_skipped,
|
||||
} = wait_for_phase("initial_tenant_load", init_load_done, timeout).await;
|
||||
|
||||
// initial logical sizes can now start, as they were waiting on init_done_rx.
|
||||
|
||||
scopeguard::ScopeGuard::into_inner(guard);
|
||||
|
||||
let mut init_sizes_done = std::pin::pin!(init_logical_size_done_rx.wait());
|
||||
let guard = scopeguard::guard_on_success((), |_| {
|
||||
tracing::info!("Cancelled before initial logical sizes completed")
|
||||
});
|
||||
|
||||
let timeout = conf.background_task_maximum_delay;
|
||||
let logical_sizes_done = std::pin::pin!(async {
|
||||
init_logical_size_done_rx.wait().await;
|
||||
startup_checkpoint(
|
||||
started_startup_at,
|
||||
"initial_logical_sizes",
|
||||
"Initial logical sizes completed",
|
||||
);
|
||||
});
|
||||
|
||||
let guard = scopeguard::guard_on_success((), |_| tracing::info!("Cancelled before initial logical sizes completed"));
|
||||
|
||||
let init_sizes_done = match tokio::time::timeout(timeout, &mut init_sizes_done).await {
|
||||
Ok(_) => {
|
||||
startup_checkpoint("initial_logical_sizes", "Initial logical sizes completed");
|
||||
None
|
||||
}
|
||||
Err(_) => {
|
||||
tracing::info!(
|
||||
timeout_millis = timeout.as_millis(),
|
||||
"Initial logical size timeout elapsed; starting background jobs"
|
||||
);
|
||||
Some(init_sizes_done)
|
||||
}
|
||||
};
|
||||
let WaitForPhaseResult {
|
||||
timeout_remaining: _,
|
||||
skipped: logical_sizes_skipped,
|
||||
} = wait_for_phase("initial_logical_sizes", logical_sizes_done, timeout).await;
|
||||
|
||||
scopeguard::ScopeGuard::into_inner(guard);
|
||||
|
||||
// allow background jobs to start
|
||||
// allow background jobs to start: we either completed prior stages, or they reached timeout
|
||||
// and were skipped. It is important that we do not let them block background jobs indefinitely,
|
||||
// because things like consumption metrics for billing are blocked by this barrier.
|
||||
drop(background_jobs_can_start);
|
||||
startup_checkpoint("background_jobs_can_start", "Starting background jobs");
|
||||
|
||||
if let Some(init_sizes_done) = init_sizes_done {
|
||||
// ending up here is not a bug; at the latest logical sizes will be queried by
|
||||
// consumption metrics.
|
||||
let guard = scopeguard::guard_on_success((), |_| tracing::info!("Cancelled before initial logical sizes completed"));
|
||||
init_sizes_done.await;
|
||||
|
||||
scopeguard::ScopeGuard::into_inner(guard);
|
||||
|
||||
startup_checkpoint("initial_logical_sizes", "Initial logical sizes completed after timeout (background jobs already started)");
|
||||
startup_checkpoint(
|
||||
started_startup_at,
|
||||
"background_jobs_can_start",
|
||||
"Starting background jobs",
|
||||
);
|
||||
|
||||
// We are done. If we skipped any phases due to timeout, run them to completion here so that
|
||||
// they will eventually update their startup_checkpoint, and so that we do not declare the
|
||||
// 'complete' stage until all the other stages are really done.
|
||||
let guard = scopeguard::guard_on_success((), |_| {
|
||||
tracing::info!("Cancelled before waiting for skipped phases done")
|
||||
});
|
||||
if let Some(f) = init_remote_skipped {
|
||||
f.await;
|
||||
}
|
||||
if let Some(f) = init_load_skipped {
|
||||
f.await;
|
||||
}
|
||||
if let Some(f) = logical_sizes_skipped {
|
||||
f.await;
|
||||
}
|
||||
scopeguard::ScopeGuard::into_inner(guard);
|
||||
|
||||
startup_checkpoint("complete", "Startup complete");
|
||||
startup_checkpoint(started_startup_at, "complete", "Startup complete");
|
||||
};
|
||||
|
||||
async move {
|
||||
@@ -574,6 +652,7 @@ fn start_pageserver(
|
||||
pageserver_listener,
|
||||
conf.pg_auth_type,
|
||||
libpq_ctx,
|
||||
task_mgr::shutdown_token(),
|
||||
)
|
||||
.await
|
||||
},
|
||||
|
||||
@@ -33,8 +33,7 @@ use crate::disk_usage_eviction_task::DiskUsageEvictionTaskConfig;
|
||||
use crate::tenant::config::TenantConf;
|
||||
use crate::tenant::config::TenantConfOpt;
|
||||
use crate::tenant::{
|
||||
TENANTS_SEGMENT_NAME, TENANT_ATTACHING_MARKER_FILENAME, TENANT_DELETED_MARKER_FILE_NAME,
|
||||
TIMELINES_SEGMENT_NAME,
|
||||
TENANTS_SEGMENT_NAME, TENANT_DELETED_MARKER_FILE_NAME, TIMELINES_SEGMENT_NAME,
|
||||
};
|
||||
use crate::{
|
||||
IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TENANT_LOCATION_CONFIG_NAME,
|
||||
@@ -211,6 +210,10 @@ pub struct PageServerConf {
|
||||
|
||||
/// JWT token for use with the control plane API.
|
||||
pub control_plane_api_token: Option<SecretString>,
|
||||
|
||||
/// If true, pageserver will make best-effort to operate without a control plane: only
|
||||
/// for use in major incidents.
|
||||
pub control_plane_emergency_mode: bool,
|
||||
}
|
||||
|
||||
/// We do not want to store this in a PageServerConf because the latter may be logged
|
||||
@@ -288,6 +291,7 @@ struct PageServerConfigBuilder {
|
||||
|
||||
control_plane_api: BuilderValue<Option<Url>>,
|
||||
control_plane_api_token: BuilderValue<Option<SecretString>>,
|
||||
control_plane_emergency_mode: BuilderValue<bool>,
|
||||
}
|
||||
|
||||
impl Default for PageServerConfigBuilder {
|
||||
@@ -355,6 +359,7 @@ impl Default for PageServerConfigBuilder {
|
||||
|
||||
control_plane_api: Set(None),
|
||||
control_plane_api_token: Set(None),
|
||||
control_plane_emergency_mode: Set(false),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -491,6 +496,10 @@ impl PageServerConfigBuilder {
|
||||
self.control_plane_api_token = BuilderValue::Set(token)
|
||||
}
|
||||
|
||||
pub fn control_plane_emergency_mode(&mut self, enabled: bool) {
|
||||
self.control_plane_emergency_mode = BuilderValue::Set(enabled)
|
||||
}
|
||||
|
||||
pub fn build(self) -> anyhow::Result<PageServerConf> {
|
||||
let concurrent_tenant_size_logical_size_queries = self
|
||||
.concurrent_tenant_size_logical_size_queries
|
||||
@@ -582,6 +591,9 @@ impl PageServerConfigBuilder {
|
||||
control_plane_api_token: self
|
||||
.control_plane_api_token
|
||||
.ok_or(anyhow!("missing control_plane_api_token"))?,
|
||||
control_plane_emergency_mode: self
|
||||
.control_plane_emergency_mode
|
||||
.ok_or(anyhow!("missing control_plane_emergency_mode"))?,
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -620,11 +632,6 @@ impl PageServerConf {
|
||||
self.tenants_path().join(tenant_id.to_string())
|
||||
}
|
||||
|
||||
pub fn tenant_attaching_mark_file_path(&self, tenant_id: &TenantId) -> Utf8PathBuf {
|
||||
self.tenant_path(tenant_id)
|
||||
.join(TENANT_ATTACHING_MARKER_FILENAME)
|
||||
}
|
||||
|
||||
pub fn tenant_ignore_mark_file_path(&self, tenant_id: &TenantId) -> Utf8PathBuf {
|
||||
self.tenant_path(tenant_id).join(IGNORED_TENANT_FILE_NAME)
|
||||
}
|
||||
@@ -807,6 +814,10 @@ impl PageServerConf {
|
||||
builder.control_plane_api_token(Some(parsed.into()))
|
||||
}
|
||||
},
|
||||
"control_plane_emergency_mode" => {
|
||||
builder.control_plane_emergency_mode(parse_toml_bool(key, item)?)
|
||||
|
||||
},
|
||||
_ => bail!("unrecognized pageserver option '{key}'"),
|
||||
}
|
||||
}
|
||||
@@ -976,6 +987,7 @@ impl PageServerConf {
|
||||
background_task_maximum_delay: Duration::ZERO,
|
||||
control_plane_api: None,
|
||||
control_plane_api_token: None,
|
||||
control_plane_emergency_mode: false,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1199,7 +1211,8 @@ background_task_maximum_delay = '334 s'
|
||||
defaults::DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY
|
||||
)?,
|
||||
control_plane_api: None,
|
||||
control_plane_api_token: None
|
||||
control_plane_api_token: None,
|
||||
control_plane_emergency_mode: false
|
||||
},
|
||||
"Correct defaults should be used when no config values are provided"
|
||||
);
|
||||
@@ -1255,7 +1268,8 @@ background_task_maximum_delay = '334 s'
|
||||
ondemand_download_behavior_treat_error_as_warn: false,
|
||||
background_task_maximum_delay: Duration::from_secs(334),
|
||||
control_plane_api: None,
|
||||
control_plane_api_token: None
|
||||
control_plane_api_token: None,
|
||||
control_plane_emergency_mode: false
|
||||
},
|
||||
"Should be able to parse all basic config values correctly"
|
||||
);
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
//! and push them to a HTTP endpoint.
|
||||
use crate::context::{DownloadBehavior, RequestContext};
|
||||
use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
|
||||
use crate::tenant::tasks::BackgroundLoopKind;
|
||||
use crate::tenant::{mgr, LogicalSizeCalculationCause};
|
||||
use camino::Utf8PathBuf;
|
||||
use consumption_metrics::EventType;
|
||||
@@ -10,6 +11,7 @@ use reqwest::Url;
|
||||
use std::collections::HashMap;
|
||||
use std::sync::Arc;
|
||||
use std::time::{Duration, SystemTime};
|
||||
use tokio::time::Instant;
|
||||
use tracing::*;
|
||||
use utils::id::NodeId;
|
||||
|
||||
@@ -87,22 +89,12 @@ pub async fn collect_metrics(
|
||||
|
||||
let node_id = node_id.to_string();
|
||||
|
||||
// reminder: ticker is ready immediatedly
|
||||
let mut ticker = tokio::time::interval(metric_collection_interval);
|
||||
|
||||
loop {
|
||||
let tick_at = tokio::select! {
|
||||
_ = cancel.cancelled() => return Ok(()),
|
||||
tick_at = ticker.tick() => tick_at,
|
||||
};
|
||||
let started_at = Instant::now();
|
||||
|
||||
// these are point in time, with variable "now"
|
||||
let metrics = metrics::collect_all_metrics(&cached_metrics, &ctx).await;
|
||||
|
||||
if metrics.is_empty() {
|
||||
continue;
|
||||
}
|
||||
|
||||
let metrics = Arc::new(metrics);
|
||||
|
||||
// why not race cancellation here? because we are one of the last tasks, and if we are
|
||||
@@ -141,10 +133,19 @@ pub async fn collect_metrics(
|
||||
let (_, _) = tokio::join!(flush, upload);
|
||||
|
||||
crate::tenant::tasks::warn_when_period_overrun(
|
||||
tick_at.elapsed(),
|
||||
started_at.elapsed(),
|
||||
metric_collection_interval,
|
||||
"consumption_metrics_collect_metrics",
|
||||
BackgroundLoopKind::ConsumptionMetricsCollectMetrics,
|
||||
);
|
||||
|
||||
let res = tokio::time::timeout_at(
|
||||
started_at + metric_collection_interval,
|
||||
task_mgr::shutdown_token().cancelled(),
|
||||
)
|
||||
.await;
|
||||
if res.is_ok() {
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -243,16 +244,14 @@ async fn calculate_synthetic_size_worker(
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
info!("starting calculate_synthetic_size_worker");
|
||||
scopeguard::defer! {
|
||||
info!("calculate_synthetic_size_worker stopped");
|
||||
};
|
||||
|
||||
// reminder: ticker is ready immediatedly
|
||||
let mut ticker = tokio::time::interval(synthetic_size_calculation_interval);
|
||||
let cause = LogicalSizeCalculationCause::ConsumptionMetricsSyntheticSize;
|
||||
|
||||
loop {
|
||||
let tick_at = tokio::select! {
|
||||
_ = task_mgr::shutdown_watcher() => return Ok(()),
|
||||
tick_at = ticker.tick() => tick_at,
|
||||
};
|
||||
let started_at = Instant::now();
|
||||
|
||||
let tenants = match mgr::list_tenants().await {
|
||||
Ok(tenants) => tenants,
|
||||
@@ -267,7 +266,12 @@ async fn calculate_synthetic_size_worker(
|
||||
continue;
|
||||
}
|
||||
|
||||
if let Ok(tenant) = mgr::get_tenant(tenant_id, true).await {
|
||||
if let Ok(tenant) = mgr::get_tenant(tenant_id, true) {
|
||||
// TODO should we use concurrent_background_tasks_rate_limit() here, like the other background tasks?
|
||||
// We can put in some prioritization for consumption metrics.
|
||||
// Same for the loop that fetches computed metrics.
|
||||
// By using the same limiter, we centralize metrics collection for "start" and "finished" counters,
|
||||
// which turns out is really handy to understand the system.
|
||||
if let Err(e) = tenant.calculate_synthetic_size(cause, ctx).await {
|
||||
error!("failed to calculate synthetic size for tenant {tenant_id}: {e:#}");
|
||||
}
|
||||
@@ -275,9 +279,18 @@ async fn calculate_synthetic_size_worker(
|
||||
}
|
||||
|
||||
crate::tenant::tasks::warn_when_period_overrun(
|
||||
tick_at.elapsed(),
|
||||
started_at.elapsed(),
|
||||
synthetic_size_calculation_interval,
|
||||
"consumption_metrics_synthetic_size_worker",
|
||||
BackgroundLoopKind::ConsumptionMetricsSyntheticSizeWorker,
|
||||
);
|
||||
|
||||
let res = tokio::time::timeout_at(
|
||||
started_at + synthetic_size_calculation_interval,
|
||||
task_mgr::shutdown_token().cancelled(),
|
||||
)
|
||||
.await;
|
||||
if res.is_ok() {
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3,7 +3,6 @@ use anyhow::Context;
|
||||
use chrono::{DateTime, Utc};
|
||||
use consumption_metrics::EventType;
|
||||
use futures::stream::StreamExt;
|
||||
use serde_with::serde_as;
|
||||
use std::{sync::Arc, time::SystemTime};
|
||||
use utils::{
|
||||
id::{TenantId, TimelineId},
|
||||
@@ -42,13 +41,10 @@ pub(super) enum Name {
|
||||
///
|
||||
/// This is a denormalization done at the MetricsKey const methods; these should not be constructed
|
||||
/// elsewhere.
|
||||
#[serde_with::serde_as]
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize)]
|
||||
pub(crate) struct MetricsKey {
|
||||
#[serde_as(as = "serde_with::DisplayFromStr")]
|
||||
pub(super) tenant_id: TenantId,
|
||||
|
||||
#[serde_as(as = "Option<serde_with::DisplayFromStr>")]
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub(super) timeline_id: Option<TimelineId>,
|
||||
|
||||
@@ -206,7 +202,6 @@ pub(super) async fn collect_all_metrics(
|
||||
None
|
||||
} else {
|
||||
crate::tenant::mgr::get_tenant(id, true)
|
||||
.await
|
||||
.ok()
|
||||
.map(|tenant| (id, tenant))
|
||||
}
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
use consumption_metrics::{Event, EventChunk, IdempotencyKey, CHUNK_SIZE};
|
||||
use serde_with::serde_as;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::Instrument;
|
||||
|
||||
@@ -7,12 +6,9 @@ use super::{metrics::Name, Cache, MetricsKey, RawMetric};
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
/// How the metrics from pageserver are identified.
|
||||
#[serde_with::serde_as]
|
||||
#[derive(serde::Serialize, serde::Deserialize, Debug, Clone, Copy, PartialEq)]
|
||||
struct Ids {
|
||||
#[serde_as(as = "serde_with::DisplayFromStr")]
|
||||
pub(super) tenant_id: TenantId,
|
||||
#[serde_as(as = "Option<serde_with::DisplayFromStr>")]
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub(super) timeline_id: Option<TimelineId>,
|
||||
}
|
||||
|
||||
@@ -57,7 +57,10 @@ impl ControlPlaneClient {
|
||||
|
||||
if let Some(jwt) = &conf.control_plane_api_token {
|
||||
let mut headers = hyper::HeaderMap::new();
|
||||
headers.insert("Authorization", jwt.get_contents().parse().unwrap());
|
||||
headers.insert(
|
||||
"Authorization",
|
||||
format!("Bearer {}", jwt.get_contents()).parse().unwrap(),
|
||||
);
|
||||
client = client.default_headers(headers);
|
||||
}
|
||||
|
||||
@@ -133,6 +136,8 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
|
||||
node_id: self.node_id,
|
||||
};
|
||||
|
||||
fail::fail_point!("control-plane-client-re-attach");
|
||||
|
||||
let response: ReAttachResponse = self.retry_http_forever(&re_attach_path, request).await?;
|
||||
tracing::info!(
|
||||
"Received re-attach response with {} tenants",
|
||||
@@ -142,7 +147,7 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
|
||||
Ok(response
|
||||
.tenants
|
||||
.into_iter()
|
||||
.map(|t| (t.id, Generation::new(t.generation)))
|
||||
.map(|t| (t.id, Generation::new(t.gen)))
|
||||
.collect::<HashMap<_, _>>())
|
||||
}
|
||||
|
||||
@@ -168,6 +173,8 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
|
||||
.collect(),
|
||||
};
|
||||
|
||||
fail::fail_point!("control-plane-client-validate");
|
||||
|
||||
let response: ValidateResponse = self.retry_http_forever(&re_attach_path, request).await?;
|
||||
|
||||
Ok(response
|
||||
|
||||
@@ -10,6 +10,7 @@ use crate::control_plane_client::ControlPlaneGenerationsApi;
|
||||
use crate::metrics;
|
||||
use crate::tenant::remote_timeline_client::remote_layer_path;
|
||||
use crate::tenant::remote_timeline_client::remote_timeline_path;
|
||||
use crate::virtual_file::MaybeFatalIo;
|
||||
use crate::virtual_file::VirtualFile;
|
||||
use anyhow::Context;
|
||||
use camino::Utf8PathBuf;
|
||||
@@ -17,7 +18,6 @@ use hex::FromHex;
|
||||
use remote_storage::{GenericRemoteStorage, RemotePath};
|
||||
use serde::Deserialize;
|
||||
use serde::Serialize;
|
||||
use serde_with::serde_as;
|
||||
use thiserror::Error;
|
||||
use tokio;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
@@ -40,7 +40,6 @@ use validator::ValidatorQueueMessage;
|
||||
|
||||
use crate::{config::PageServerConf, tenant::storage_layer::LayerFileName};
|
||||
|
||||
// TODO: adminstrative "panic button" config property to disable all deletions
|
||||
// TODO: configurable for how long to wait before executing deletions
|
||||
|
||||
/// We aggregate object deletions from many tenants in one place, for several reasons:
|
||||
@@ -154,7 +153,7 @@ impl FlushOp {
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct DeletionQueueClient {
|
||||
tx: tokio::sync::mpsc::Sender<ListWriterQueueMessage>,
|
||||
tx: tokio::sync::mpsc::UnboundedSender<ListWriterQueueMessage>,
|
||||
executor_tx: tokio::sync::mpsc::Sender<DeleterMessage>,
|
||||
|
||||
lsn_table: Arc<std::sync::RwLock<VisibleLsnUpdates>>,
|
||||
@@ -186,7 +185,7 @@ where
|
||||
V: Serialize,
|
||||
I: AsRef<[u8]>,
|
||||
{
|
||||
let transformed = input.iter().map(|(k, v)| (hex::encode(k), v.clone()));
|
||||
let transformed = input.iter().map(|(k, v)| (hex::encode(k), v));
|
||||
|
||||
transformed
|
||||
.collect::<HashMap<String, &V>>()
|
||||
@@ -213,9 +212,8 @@ where
|
||||
|
||||
/// Files ending with this suffix will be ignored and erased
|
||||
/// during recovery as startup.
|
||||
const TEMP_SUFFIX: &str = ".tmp";
|
||||
const TEMP_SUFFIX: &str = "tmp";
|
||||
|
||||
#[serde_as]
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
struct DeletionList {
|
||||
/// Serialization version, for future use
|
||||
@@ -244,7 +242,6 @@ struct DeletionList {
|
||||
validated: bool,
|
||||
}
|
||||
|
||||
#[serde_as]
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
struct DeletionHeader {
|
||||
/// Serialization version, for future use
|
||||
@@ -272,7 +269,9 @@ impl DeletionHeader {
|
||||
let temp_path = path_with_suffix_extension(&header_path, TEMP_SUFFIX);
|
||||
VirtualFile::crashsafe_overwrite(&header_path, &temp_path, &header_bytes)
|
||||
.await
|
||||
.map_err(Into::into)
|
||||
.maybe_fatal_err("save deletion header")?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -325,10 +324,7 @@ impl DeletionList {
|
||||
return false;
|
||||
}
|
||||
|
||||
let timeline_entry = tenant_entry
|
||||
.timelines
|
||||
.entry(*timeline)
|
||||
.or_insert_with(Vec::new);
|
||||
let timeline_entry = tenant_entry.timelines.entry(*timeline).or_default();
|
||||
|
||||
let timeline_remote_path = remote_timeline_path(tenant, timeline);
|
||||
|
||||
@@ -364,6 +360,7 @@ impl DeletionList {
|
||||
let bytes = serde_json::to_vec(self).expect("Failed to serialize deletion list");
|
||||
VirtualFile::crashsafe_overwrite(&path, &temp_path, &bytes)
|
||||
.await
|
||||
.maybe_fatal_err("save deletion list")
|
||||
.map_err(Into::into)
|
||||
}
|
||||
}
|
||||
@@ -420,7 +417,7 @@ pub enum DeletionQueueError {
|
||||
impl DeletionQueueClient {
|
||||
pub(crate) fn broken() -> Self {
|
||||
// Channels whose receivers are immediately dropped.
|
||||
let (tx, _rx) = tokio::sync::mpsc::channel(1);
|
||||
let (tx, _rx) = tokio::sync::mpsc::unbounded_channel();
|
||||
let (executor_tx, _executor_rx) = tokio::sync::mpsc::channel(1);
|
||||
Self {
|
||||
tx,
|
||||
@@ -432,12 +429,12 @@ impl DeletionQueueClient {
|
||||
/// This is cancel-safe. If you drop the future before it completes, the message
|
||||
/// is not pushed, although in the context of the deletion queue it doesn't matter: once
|
||||
/// we decide to do a deletion the decision is always final.
|
||||
async fn do_push<T>(
|
||||
fn do_push<T>(
|
||||
&self,
|
||||
queue: &tokio::sync::mpsc::Sender<T>,
|
||||
queue: &tokio::sync::mpsc::UnboundedSender<T>,
|
||||
msg: T,
|
||||
) -> Result<(), DeletionQueueError> {
|
||||
match queue.send(msg).await {
|
||||
match queue.send(msg) {
|
||||
Ok(_) => Ok(()),
|
||||
Err(e) => {
|
||||
// This shouldn't happen, we should shut down all tenants before
|
||||
@@ -449,7 +446,7 @@ impl DeletionQueueClient {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) async fn recover(
|
||||
pub(crate) fn recover(
|
||||
&self,
|
||||
attached_tenants: HashMap<TenantId, Generation>,
|
||||
) -> Result<(), DeletionQueueError> {
|
||||
@@ -457,7 +454,6 @@ impl DeletionQueueClient {
|
||||
&self.tx,
|
||||
ListWriterQueueMessage::Recover(RecoverOp { attached_tenants }),
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
/// When a Timeline wishes to update the remote_consistent_lsn that it exposes to the outside
|
||||
@@ -530,6 +526,21 @@ impl DeletionQueueClient {
|
||||
return self.flush_immediate().await;
|
||||
}
|
||||
|
||||
self.push_layers_sync(tenant_id, timeline_id, current_generation, layers)
|
||||
}
|
||||
|
||||
/// When a Tenant has a generation, push_layers is always synchronous because
|
||||
/// the ListValidator channel is an unbounded channel.
|
||||
///
|
||||
/// This can be merged into push_layers when we remove the Generation-less mode
|
||||
/// support (`<https://github.com/neondatabase/neon/issues/5395>`)
|
||||
pub(crate) fn push_layers_sync(
|
||||
&self,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
current_generation: Generation,
|
||||
layers: Vec<(LayerFileName, Generation)>,
|
||||
) -> Result<(), DeletionQueueError> {
|
||||
metrics::DELETION_QUEUE
|
||||
.keys_submitted
|
||||
.inc_by(layers.len() as u64);
|
||||
@@ -543,17 +554,16 @@ impl DeletionQueueClient {
|
||||
objects: Vec::new(),
|
||||
}),
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
/// This is cancel-safe. If you drop the future the flush may still happen in the background.
|
||||
async fn do_flush<T>(
|
||||
&self,
|
||||
queue: &tokio::sync::mpsc::Sender<T>,
|
||||
queue: &tokio::sync::mpsc::UnboundedSender<T>,
|
||||
msg: T,
|
||||
rx: tokio::sync::oneshot::Receiver<()>,
|
||||
) -> Result<(), DeletionQueueError> {
|
||||
self.do_push(queue, msg).await?;
|
||||
self.do_push(queue, msg)?;
|
||||
if rx.await.is_err() {
|
||||
// This shouldn't happen if tenants are shut down before deletion queue. If we
|
||||
// encounter a bug like this, then a flusher will incorrectly believe it has flushed
|
||||
@@ -574,6 +584,18 @@ impl DeletionQueueClient {
|
||||
.await
|
||||
}
|
||||
|
||||
/// Issue a flush without waiting for it to complete. This is useful on advisory flushes where
|
||||
/// the caller wants to avoid the risk of waiting for lots of enqueued work, such as on tenant
|
||||
/// detach where flushing is nice but not necessary.
|
||||
///
|
||||
/// This function provides no guarantees of work being done.
|
||||
pub fn flush_advisory(&self) {
|
||||
let (flush_op, _) = FlushOp::new();
|
||||
|
||||
// Transmit the flush message, ignoring any result (such as a closed channel during shutdown).
|
||||
drop(self.tx.send(ListWriterQueueMessage::FlushExecute(flush_op)));
|
||||
}
|
||||
|
||||
// Wait until all previous deletions are executed
|
||||
pub(crate) async fn flush_execute(&self) -> Result<(), DeletionQueueError> {
|
||||
debug!("flush_execute: flushing to deletion lists...");
|
||||
@@ -590,9 +612,7 @@ impl DeletionQueueClient {
|
||||
// Flush any immediate-mode deletions (the above backend flush will only flush
|
||||
// the executor if deletions had flowed through the backend)
|
||||
debug!("flush_execute: flushing execution...");
|
||||
let (flush_op, rx) = FlushOp::new();
|
||||
self.do_flush(&self.executor_tx, DeleterMessage::Flush(flush_op), rx)
|
||||
.await?;
|
||||
self.flush_immediate().await?;
|
||||
debug!("flush_execute: finished flushing execution...");
|
||||
Ok(())
|
||||
}
|
||||
@@ -647,8 +667,10 @@ impl DeletionQueue {
|
||||
where
|
||||
C: ControlPlaneGenerationsApi + Send + Sync,
|
||||
{
|
||||
// Deep channel: it consumes deletions from all timelines and we do not want to block them
|
||||
let (tx, rx) = tokio::sync::mpsc::channel(16384);
|
||||
// Unbounded channel: enables non-async functions to submit deletions. The actual length is
|
||||
// constrained by how promptly the ListWriter wakes up and drains it, which should be frequent
|
||||
// enough to avoid this taking pathologically large amount of memory.
|
||||
let (tx, rx) = tokio::sync::mpsc::unbounded_channel();
|
||||
|
||||
// Shallow channel: it carries DeletionLists which each contain up to thousands of deletions
|
||||
let (backend_tx, backend_rx) = tokio::sync::mpsc::channel(16);
|
||||
@@ -961,7 +983,7 @@ mod test {
|
||||
// Basic test that the deletion queue processes the deletions we pass into it
|
||||
let ctx = setup("deletion_queue_smoke").expect("Failed test setup");
|
||||
let client = ctx.deletion_queue.new_client();
|
||||
client.recover(HashMap::new()).await?;
|
||||
client.recover(HashMap::new())?;
|
||||
|
||||
let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
|
||||
let tenant_id = ctx.harness.tenant_id;
|
||||
@@ -1029,7 +1051,7 @@ mod test {
|
||||
async fn deletion_queue_validation() -> anyhow::Result<()> {
|
||||
let ctx = setup("deletion_queue_validation").expect("Failed test setup");
|
||||
let client = ctx.deletion_queue.new_client();
|
||||
client.recover(HashMap::new()).await?;
|
||||
client.recover(HashMap::new())?;
|
||||
|
||||
// Generation that the control plane thinks is current
|
||||
let latest_generation = Generation::new(0xdeadbeef);
|
||||
@@ -1086,7 +1108,7 @@ mod test {
|
||||
// Basic test that the deletion queue processes the deletions we pass into it
|
||||
let mut ctx = setup("deletion_queue_recovery").expect("Failed test setup");
|
||||
let client = ctx.deletion_queue.new_client();
|
||||
client.recover(HashMap::new()).await?;
|
||||
client.recover(HashMap::new())?;
|
||||
|
||||
let tenant_id = ctx.harness.tenant_id;
|
||||
|
||||
@@ -1149,9 +1171,7 @@ mod test {
|
||||
drop(client);
|
||||
ctx.restart().await;
|
||||
let client = ctx.deletion_queue.new_client();
|
||||
client
|
||||
.recover(HashMap::from([(tenant_id, now_generation)]))
|
||||
.await?;
|
||||
client.recover(HashMap::from([(tenant_id, now_generation)]))?;
|
||||
|
||||
info!("Flush-executing");
|
||||
client.flush_execute().await?;
|
||||
@@ -1177,7 +1197,7 @@ pub(crate) mod mock {
|
||||
};
|
||||
|
||||
pub struct ConsumerState {
|
||||
rx: tokio::sync::mpsc::Receiver<ListWriterQueueMessage>,
|
||||
rx: tokio::sync::mpsc::UnboundedReceiver<ListWriterQueueMessage>,
|
||||
executor_rx: tokio::sync::mpsc::Receiver<DeleterMessage>,
|
||||
}
|
||||
|
||||
@@ -1254,7 +1274,7 @@ pub(crate) mod mock {
|
||||
}
|
||||
|
||||
pub struct MockDeletionQueue {
|
||||
tx: tokio::sync::mpsc::Sender<ListWriterQueueMessage>,
|
||||
tx: tokio::sync::mpsc::UnboundedSender<ListWriterQueueMessage>,
|
||||
executor_tx: tokio::sync::mpsc::Sender<DeleterMessage>,
|
||||
executed: Arc<AtomicUsize>,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
@@ -1264,7 +1284,7 @@ pub(crate) mod mock {
|
||||
|
||||
impl MockDeletionQueue {
|
||||
pub fn new(remote_storage: Option<GenericRemoteStorage>) -> Self {
|
||||
let (tx, rx) = tokio::sync::mpsc::channel(16384);
|
||||
let (tx, rx) = tokio::sync::mpsc::unbounded_channel();
|
||||
let (executor_tx, executor_rx) = tokio::sync::mpsc::channel(16384);
|
||||
|
||||
let executed = Arc::new(AtomicUsize::new(0));
|
||||
@@ -1279,10 +1299,6 @@ pub(crate) mod mock {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get_executed(&self) -> usize {
|
||||
self.executed.load(Ordering::Relaxed)
|
||||
}
|
||||
|
||||
#[allow(clippy::await_holding_lock)]
|
||||
pub async fn pump(&self) {
|
||||
if let Some(remote_storage) = &self.remote_storage {
|
||||
|
||||
@@ -13,6 +13,7 @@ use std::time::Duration;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::info;
|
||||
use tracing::warn;
|
||||
use utils::backoff;
|
||||
|
||||
use crate::metrics;
|
||||
|
||||
@@ -63,7 +64,19 @@ impl Deleter {
|
||||
Err(anyhow::anyhow!("failpoint hit"))
|
||||
});
|
||||
|
||||
self.remote_storage.delete_objects(&self.accumulator).await
|
||||
// A backoff::retry is used here for two reasons:
|
||||
// - To provide a backoff rather than busy-polling the API on errors
|
||||
// - To absorb transient 429/503 conditions without hitting our error
|
||||
// logging path for issues deleting objects.
|
||||
backoff::retry(
|
||||
|| async { self.remote_storage.delete_objects(&self.accumulator).await },
|
||||
|_| false,
|
||||
3,
|
||||
10,
|
||||
"executing deletion batch",
|
||||
backoff::Cancel::new(self.cancel.clone(), || anyhow::anyhow!("Shutting down")),
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
/// Block until everything in accumulator has been executed
|
||||
@@ -88,7 +101,10 @@ impl Deleter {
|
||||
self.accumulator.clear();
|
||||
}
|
||||
Err(e) => {
|
||||
warn!("DeleteObjects request failed: {e:#}, will retry");
|
||||
if self.cancel.is_cancelled() {
|
||||
return Err(DeletionQueueError::ShuttingDown);
|
||||
}
|
||||
warn!("DeleteObjects request failed: {e:#}, will continue trying");
|
||||
metrics::DELETION_QUEUE
|
||||
.remote_errors
|
||||
.with_label_values(&["execute"])
|
||||
|
||||
@@ -34,6 +34,8 @@ use crate::deletion_queue::TEMP_SUFFIX;
|
||||
use crate::metrics;
|
||||
use crate::tenant::remote_timeline_client::remote_layer_path;
|
||||
use crate::tenant::storage_layer::LayerFileName;
|
||||
use crate::virtual_file::on_fatal_io_error;
|
||||
use crate::virtual_file::MaybeFatalIo;
|
||||
|
||||
// The number of keys in a DeletionList before we will proactively persist it
|
||||
// (without reaching a flush deadline). This aims to deliver objects of the order
|
||||
@@ -85,7 +87,7 @@ pub(super) struct ListWriter {
|
||||
conf: &'static PageServerConf,
|
||||
|
||||
// Incoming frontend requests to delete some keys
|
||||
rx: tokio::sync::mpsc::Receiver<ListWriterQueueMessage>,
|
||||
rx: tokio::sync::mpsc::UnboundedReceiver<ListWriterQueueMessage>,
|
||||
|
||||
// Outbound requests to the backend to execute deletion lists we have composed.
|
||||
tx: tokio::sync::mpsc::Sender<ValidatorQueueMessage>,
|
||||
@@ -111,7 +113,7 @@ impl ListWriter {
|
||||
|
||||
pub(super) fn new(
|
||||
conf: &'static PageServerConf,
|
||||
rx: tokio::sync::mpsc::Receiver<ListWriterQueueMessage>,
|
||||
rx: tokio::sync::mpsc::UnboundedReceiver<ListWriterQueueMessage>,
|
||||
tx: tokio::sync::mpsc::Sender<ValidatorQueueMessage>,
|
||||
cancel: CancellationToken,
|
||||
) -> Self {
|
||||
@@ -195,7 +197,7 @@ impl ListWriter {
|
||||
debug!("Deletion header {header_path} not found, first start?");
|
||||
Ok(None)
|
||||
} else {
|
||||
Err(anyhow::anyhow!(e))
|
||||
on_fatal_io_error(&e, "reading deletion header");
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -216,23 +218,17 @@ impl ListWriter {
|
||||
self.pending.sequence = validated_sequence + 1;
|
||||
|
||||
let deletion_directory = self.conf.deletion_prefix();
|
||||
let mut dir = match tokio::fs::read_dir(&deletion_directory).await {
|
||||
Ok(d) => d,
|
||||
Err(e) => {
|
||||
warn!("Failed to open deletion list directory {deletion_directory}: {e:#}");
|
||||
|
||||
// Give up: if we can't read the deletion list directory, we probably can't
|
||||
// write lists into it later, so the queue won't work.
|
||||
return Err(e.into());
|
||||
}
|
||||
};
|
||||
let mut dir = tokio::fs::read_dir(&deletion_directory)
|
||||
.await
|
||||
.fatal_err("read deletion directory");
|
||||
|
||||
let list_name_pattern =
|
||||
Regex::new("(?<sequence>[a-zA-Z0-9]{16})-(?<version>[a-zA-Z0-9]{2}).list").unwrap();
|
||||
|
||||
let temp_extension = format!(".{TEMP_SUFFIX}");
|
||||
let header_path = self.conf.deletion_header_path();
|
||||
let mut seqs: Vec<u64> = Vec::new();
|
||||
while let Some(dentry) = dir.next_entry().await? {
|
||||
while let Some(dentry) = dir.next_entry().await.fatal_err("read deletion dentry") {
|
||||
let file_name = dentry.file_name();
|
||||
let dentry_str = file_name.to_string_lossy();
|
||||
|
||||
@@ -241,15 +237,13 @@ impl ListWriter {
|
||||
continue;
|
||||
}
|
||||
|
||||
if dentry_str.ends_with(TEMP_SUFFIX) {
|
||||
if dentry_str.ends_with(&temp_extension) {
|
||||
info!("Cleaning up temporary file {dentry_str}");
|
||||
let absolute_path =
|
||||
deletion_directory.join(dentry.file_name().to_str().expect("non-Unicode path"));
|
||||
if let Err(e) = tokio::fs::remove_file(&absolute_path).await {
|
||||
// Non-fatal error: we will just leave the file behind but not
|
||||
// try and load it.
|
||||
warn!("Failed to clean up temporary file {absolute_path}: {e:#}");
|
||||
}
|
||||
tokio::fs::remove_file(&absolute_path)
|
||||
.await
|
||||
.fatal_err("delete temp file");
|
||||
|
||||
continue;
|
||||
}
|
||||
@@ -289,7 +283,9 @@ impl ListWriter {
|
||||
for s in seqs {
|
||||
let list_path = self.conf.deletion_list_path(s);
|
||||
|
||||
let list_bytes = tokio::fs::read(&list_path).await?;
|
||||
let list_bytes = tokio::fs::read(&list_path)
|
||||
.await
|
||||
.fatal_err("read deletion list");
|
||||
|
||||
let mut deletion_list = match serde_json::from_slice::<DeletionList>(&list_bytes) {
|
||||
Ok(l) => l,
|
||||
@@ -348,7 +344,7 @@ impl ListWriter {
|
||||
info!("Started deletion frontend worker");
|
||||
|
||||
// Synchronous, but we only do it once per process lifetime so it's tolerable
|
||||
if let Err(e) = create_dir_all(&self.conf.deletion_prefix()) {
|
||||
if let Err(e) = create_dir_all(self.conf.deletion_prefix()) {
|
||||
tracing::error!(
|
||||
"Failed to create deletion list directory {}, deletions will not be executed ({e})",
|
||||
self.conf.deletion_prefix(),
|
||||
|
||||
@@ -28,6 +28,7 @@ use crate::config::PageServerConf;
|
||||
use crate::control_plane_client::ControlPlaneGenerationsApi;
|
||||
use crate::control_plane_client::RetryForeverError;
|
||||
use crate::metrics;
|
||||
use crate::virtual_file::MaybeFatalIo;
|
||||
|
||||
use super::deleter::DeleterMessage;
|
||||
use super::DeletionHeader;
|
||||
@@ -220,6 +221,8 @@ where
|
||||
warn!("Dropping stale deletions for tenant {tenant_id} in generation {:?}, objects may be leaked", tenant.generation);
|
||||
metrics::DELETION_QUEUE.keys_dropped.inc_by(tenant.len() as u64);
|
||||
mutated = true;
|
||||
} else {
|
||||
metrics::DELETION_QUEUE.keys_validated.inc_by(tenant.len() as u64);
|
||||
}
|
||||
this_list_valid
|
||||
});
|
||||
@@ -285,16 +288,9 @@ where
|
||||
async fn cleanup_lists(&mut self, list_paths: Vec<Utf8PathBuf>) {
|
||||
for list_path in list_paths {
|
||||
debug!("Removing deletion list {list_path}");
|
||||
|
||||
if let Err(e) = tokio::fs::remove_file(&list_path).await {
|
||||
// Unexpected: we should have permissions and nothing else should
|
||||
// be touching these files. We will leave the file behind. Subsequent
|
||||
// pageservers will try and load it again: hopefully whatever storage
|
||||
// issue (probably permissions) has been fixed by then.
|
||||
tracing::error!("Failed to delete {list_path}: {e:#}");
|
||||
metrics::DELETION_QUEUE.unexpected_errors.inc();
|
||||
break;
|
||||
}
|
||||
tokio::fs::remove_file(&list_path)
|
||||
.await
|
||||
.fatal_err("remove deletion list");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -60,7 +60,11 @@ use utils::serde_percent::Percent;
|
||||
use crate::{
|
||||
config::PageServerConf,
|
||||
task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
|
||||
tenant::{self, storage_layer::PersistentLayer, timeline::EvictionError, Timeline},
|
||||
tenant::{
|
||||
self,
|
||||
storage_layer::{AsLayerDesc, EvictionError, Layer},
|
||||
Timeline,
|
||||
},
|
||||
};
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
|
||||
@@ -108,7 +112,7 @@ pub fn launch_disk_usage_global_eviction_task(
|
||||
_ = background_jobs_barrier.wait() => { }
|
||||
};
|
||||
|
||||
disk_usage_eviction_task(&state, task_config, storage, &conf.tenants_path(), cancel)
|
||||
disk_usage_eviction_task(&state, task_config, &storage, &conf.tenants_path(), cancel)
|
||||
.await;
|
||||
Ok(())
|
||||
},
|
||||
@@ -121,7 +125,7 @@ pub fn launch_disk_usage_global_eviction_task(
|
||||
async fn disk_usage_eviction_task(
|
||||
state: &State,
|
||||
task_config: &DiskUsageEvictionTaskConfig,
|
||||
storage: GenericRemoteStorage,
|
||||
_storage: &GenericRemoteStorage,
|
||||
tenants_dir: &Utf8Path,
|
||||
cancel: CancellationToken,
|
||||
) {
|
||||
@@ -145,14 +149,8 @@ async fn disk_usage_eviction_task(
|
||||
let start = Instant::now();
|
||||
|
||||
async {
|
||||
let res = disk_usage_eviction_task_iteration(
|
||||
state,
|
||||
task_config,
|
||||
&storage,
|
||||
tenants_dir,
|
||||
&cancel,
|
||||
)
|
||||
.await;
|
||||
let res =
|
||||
disk_usage_eviction_task_iteration(state, task_config, tenants_dir, &cancel).await;
|
||||
|
||||
match res {
|
||||
Ok(()) => {}
|
||||
@@ -183,13 +181,12 @@ pub trait Usage: Clone + Copy + std::fmt::Debug {
|
||||
async fn disk_usage_eviction_task_iteration(
|
||||
state: &State,
|
||||
task_config: &DiskUsageEvictionTaskConfig,
|
||||
storage: &GenericRemoteStorage,
|
||||
tenants_dir: &Utf8Path,
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<()> {
|
||||
let usage_pre = filesystem_level_usage::get(tenants_dir, task_config)
|
||||
.context("get filesystem-level disk usage before evictions")?;
|
||||
let res = disk_usage_eviction_task_iteration_impl(state, storage, usage_pre, cancel).await;
|
||||
let res = disk_usage_eviction_task_iteration_impl(state, usage_pre, cancel).await;
|
||||
match res {
|
||||
Ok(outcome) => {
|
||||
debug!(?outcome, "disk_usage_eviction_iteration finished");
|
||||
@@ -273,7 +270,6 @@ struct LayerCount {
|
||||
|
||||
pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
|
||||
state: &State,
|
||||
storage: &GenericRemoteStorage,
|
||||
usage_pre: U,
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<IterationOutcome<U>> {
|
||||
@@ -330,9 +326,10 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
|
||||
// If we get far enough in the list that we start to evict layers that are below
|
||||
// the tenant's min-resident-size threshold, print a warning, and memorize the disk
|
||||
// usage at that point, in 'usage_planned_min_resident_size_respecting'.
|
||||
let mut batched: HashMap<_, Vec<Arc<dyn PersistentLayer>>> = HashMap::new();
|
||||
let mut batched: HashMap<_, Vec<_>> = HashMap::new();
|
||||
let mut warned = None;
|
||||
let mut usage_planned = usage_pre;
|
||||
let mut max_batch_size = 0;
|
||||
for (i, (partition, candidate)) in candidates.into_iter().enumerate() {
|
||||
if !usage_planned.has_pressure() {
|
||||
debug!(
|
||||
@@ -349,10 +346,18 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
|
||||
|
||||
usage_planned.add_available_bytes(candidate.layer.layer_desc().file_size);
|
||||
|
||||
batched
|
||||
.entry(TimelineKey(candidate.timeline))
|
||||
.or_default()
|
||||
.push(candidate.layer);
|
||||
// FIXME: batching makes no sense anymore because of no layermap locking, should just spawn
|
||||
// tasks to evict all seen layers until we have evicted enough
|
||||
|
||||
let batch = batched.entry(TimelineKey(candidate.timeline)).or_default();
|
||||
|
||||
// semaphore will later be used to limit eviction concurrency, and we can express at
|
||||
// most u32 number of permits. unlikely we would have u32::MAX layers to be evicted,
|
||||
// but fail gracefully by not making batches larger.
|
||||
if batch.len() < u32::MAX as usize {
|
||||
batch.push(candidate.layer);
|
||||
max_batch_size = max_batch_size.max(batch.len());
|
||||
}
|
||||
}
|
||||
|
||||
let usage_planned = match warned {
|
||||
@@ -369,64 +374,101 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
|
||||
|
||||
// phase2: evict victims batched by timeline
|
||||
|
||||
// After the loop, `usage_assumed` is the post-eviction usage,
|
||||
// according to internal accounting.
|
||||
let mut usage_assumed = usage_pre;
|
||||
let mut evictions_failed = LayerCount::default();
|
||||
let mut js = tokio::task::JoinSet::new();
|
||||
|
||||
// ratelimit to 1k files or any higher max batch size
|
||||
let limit = Arc::new(tokio::sync::Semaphore::new(1000.max(max_batch_size)));
|
||||
|
||||
for (timeline, batch) in batched {
|
||||
let tenant_id = timeline.tenant_id;
|
||||
let timeline_id = timeline.timeline_id;
|
||||
let batch_size = batch.len();
|
||||
let batch_size =
|
||||
u32::try_from(batch.len()).expect("batch size limited to u32::MAX during partitioning");
|
||||
|
||||
// I dislike naming of `available_permits` but it means current total amount of permits
|
||||
// because permits can be added
|
||||
assert!(batch_size as usize <= limit.available_permits());
|
||||
|
||||
debug!(%timeline_id, "evicting batch for timeline");
|
||||
|
||||
async {
|
||||
let results = timeline.evict_layers(storage, &batch, cancel.clone()).await;
|
||||
let evict = {
|
||||
let limit = limit.clone();
|
||||
let cancel = cancel.clone();
|
||||
async move {
|
||||
let mut evicted_bytes = 0;
|
||||
let mut evictions_failed = LayerCount::default();
|
||||
|
||||
match results {
|
||||
Err(e) => {
|
||||
warn!("failed to evict batch: {:#}", e);
|
||||
}
|
||||
Ok(results) => {
|
||||
assert_eq!(results.len(), batch.len());
|
||||
for (result, layer) in results.into_iter().zip(batch.iter()) {
|
||||
let file_size = layer.layer_desc().file_size;
|
||||
match result {
|
||||
Some(Ok(())) => {
|
||||
usage_assumed.add_available_bytes(file_size);
|
||||
}
|
||||
Some(Err(EvictionError::CannotEvictRemoteLayer)) => {
|
||||
unreachable!("get_local_layers_for_disk_usage_eviction finds only local layers")
|
||||
}
|
||||
Some(Err(EvictionError::FileNotFound)) => {
|
||||
evictions_failed.file_sizes += file_size;
|
||||
evictions_failed.count += 1;
|
||||
}
|
||||
Some(Err(
|
||||
e @ EvictionError::LayerNotFound(_)
|
||||
| e @ EvictionError::StatFailed(_),
|
||||
)) => {
|
||||
let e = utils::error::report_compact_sources(&e);
|
||||
warn!(%layer, "failed to evict layer: {e}");
|
||||
evictions_failed.file_sizes += file_size;
|
||||
evictions_failed.count += 1;
|
||||
}
|
||||
None => {
|
||||
assert!(cancel.is_cancelled());
|
||||
return;
|
||||
let Ok(_permit) = limit.acquire_many_owned(batch_size).await else {
|
||||
// semaphore closing means cancelled
|
||||
return (evicted_bytes, evictions_failed);
|
||||
};
|
||||
|
||||
let results = timeline.evict_layers(&batch).await;
|
||||
|
||||
match results {
|
||||
Ok(results) => {
|
||||
assert_eq!(results.len(), batch.len());
|
||||
for (result, layer) in results.into_iter().zip(batch.iter()) {
|
||||
let file_size = layer.layer_desc().file_size;
|
||||
match result {
|
||||
Some(Ok(())) => {
|
||||
evicted_bytes += file_size;
|
||||
}
|
||||
Some(Err(EvictionError::NotFound | EvictionError::Downloaded)) => {
|
||||
evictions_failed.file_sizes += file_size;
|
||||
evictions_failed.count += 1;
|
||||
}
|
||||
None => {
|
||||
assert!(cancel.is_cancelled());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
warn!("failed to evict batch: {:#}", e);
|
||||
}
|
||||
}
|
||||
(evicted_bytes, evictions_failed)
|
||||
}
|
||||
}
|
||||
.instrument(tracing::info_span!("evict_batch", %tenant_id, %timeline_id, batch_size))
|
||||
.await;
|
||||
.instrument(tracing::info_span!("evict_batch", %tenant_id, %timeline_id, batch_size));
|
||||
|
||||
if cancel.is_cancelled() {
|
||||
js.spawn(evict);
|
||||
|
||||
// spwaning multiple thousands of these is essentially blocking, so give already spawned a
|
||||
// chance of making progress
|
||||
tokio::task::yield_now().await;
|
||||
}
|
||||
|
||||
let join_all = async move {
|
||||
// After the evictions, `usage_assumed` is the post-eviction usage,
|
||||
// according to internal accounting.
|
||||
let mut usage_assumed = usage_pre;
|
||||
let mut evictions_failed = LayerCount::default();
|
||||
|
||||
while let Some(res) = js.join_next().await {
|
||||
match res {
|
||||
Ok((evicted_bytes, failed)) => {
|
||||
usage_assumed.add_available_bytes(evicted_bytes);
|
||||
evictions_failed.file_sizes += failed.file_sizes;
|
||||
evictions_failed.count += failed.count;
|
||||
}
|
||||
Err(je) if je.is_cancelled() => unreachable!("not used"),
|
||||
Err(je) if je.is_panic() => { /* already logged */ }
|
||||
Err(je) => tracing::error!("unknown JoinError: {je:?}"),
|
||||
}
|
||||
}
|
||||
(usage_assumed, evictions_failed)
|
||||
};
|
||||
|
||||
let (usage_assumed, evictions_failed) = tokio::select! {
|
||||
tuple = join_all => { tuple },
|
||||
_ = cancel.cancelled() => {
|
||||
// close the semaphore to stop any pending acquires
|
||||
limit.close();
|
||||
return Ok(IterationOutcome::Cancelled);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
Ok(IterationOutcome::Finished(IterationOutcomeFinished {
|
||||
before: usage_pre,
|
||||
@@ -441,7 +483,7 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
|
||||
#[derive(Clone)]
|
||||
struct EvictionCandidate {
|
||||
timeline: Arc<Timeline>,
|
||||
layer: Arc<dyn PersistentLayer>,
|
||||
layer: Layer,
|
||||
last_activity_ts: SystemTime,
|
||||
}
|
||||
|
||||
@@ -503,7 +545,7 @@ async fn collect_eviction_candidates(
|
||||
if cancel.is_cancelled() {
|
||||
return Ok(EvictionCandidates::Cancelled);
|
||||
}
|
||||
let tenant = match tenant::mgr::get_tenant(*tenant_id, true).await {
|
||||
let tenant = match tenant::mgr::get_tenant(*tenant_id, true) {
|
||||
Ok(tenant) => tenant,
|
||||
Err(e) => {
|
||||
// this can happen if tenant has lifecycle transition after we fetched it
|
||||
@@ -512,6 +554,11 @@ async fn collect_eviction_candidates(
|
||||
}
|
||||
};
|
||||
|
||||
if tenant.cancel.is_cancelled() {
|
||||
info!(%tenant_id, "Skipping tenant for eviction, it is shutting down");
|
||||
continue;
|
||||
}
|
||||
|
||||
// collect layers from all timelines in this tenant
|
||||
//
|
||||
// If one of the timelines becomes `!is_active()` during the iteration,
|
||||
|
||||
@@ -93,9 +93,16 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"503":
|
||||
description: Temporarily unavailable, please retry.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
delete:
|
||||
description: |
|
||||
Attempts to delete specified tenant. 500 and 409 errors should be retried until 404 is retrieved.
|
||||
Attempts to delete specified tenant. 500, 503 and 409 errors should be retried until 404 is retrieved.
|
||||
404 means that deletion successfully finished"
|
||||
responses:
|
||||
"400":
|
||||
@@ -134,6 +141,13 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"503":
|
||||
description: Temporarily unavailable, please retry.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
|
||||
/v1/tenant/{tenant_id}/timeline:
|
||||
parameters:
|
||||
@@ -178,6 +192,13 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"503":
|
||||
description: Temporarily unavailable, please retry.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
|
||||
/v1/tenant/{tenant_id}/timeline/{timeline_id}:
|
||||
parameters:
|
||||
@@ -226,6 +247,13 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"503":
|
||||
description: Temporarily unavailable, please retry.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
delete:
|
||||
description: "Attempts to delete specified timeline. 500 and 409 errors should be retried"
|
||||
responses:
|
||||
@@ -265,7 +293,74 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/PreconditionFailedError"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"503":
|
||||
description: Temporarily unavailable, please retry.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
/v1/tenant/{tenant_id}/timeline/{timeline_id}/get_timestamp_of_lsn:
|
||||
parameters:
|
||||
- name: tenant_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
format: hex
|
||||
- name: timeline_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
format: hex
|
||||
get:
|
||||
description: Get timestamp for a given LSN
|
||||
parameters:
|
||||
- name: lsn
|
||||
in: query
|
||||
required: true
|
||||
schema:
|
||||
type: integer
|
||||
description: A LSN to get the timestamp
|
||||
responses:
|
||||
"200":
|
||||
description: OK
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
type: string
|
||||
format: date-time
|
||||
"400":
|
||||
description: Error when no tenant id found in path, no timeline id or invalid timestamp
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"401":
|
||||
description: Unauthorized Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/UnauthorizedError"
|
||||
"403":
|
||||
description: Forbidden Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
"404":
|
||||
description: Timeline not found, or there is no timestamp information for the given lsn
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/NotFoundError"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
@@ -297,13 +392,19 @@ paths:
|
||||
type: string
|
||||
format: date-time
|
||||
description: A timestamp to get the LSN
|
||||
- name: version
|
||||
in: query
|
||||
required: false
|
||||
schema:
|
||||
type: integer
|
||||
description: The version of the endpoint to use
|
||||
responses:
|
||||
"200":
|
||||
description: OK
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
type: string
|
||||
$ref: "#/components/schemas/LsnByTimestampResponse"
|
||||
"400":
|
||||
description: Error when no tenant id found in path, no timeline id or invalid timestamp
|
||||
content:
|
||||
@@ -328,6 +429,13 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"503":
|
||||
description: Temporarily unavailable, please retry.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
/v1/tenant/{tenant_id}/timeline/{timeline_id}/do_gc:
|
||||
parameters:
|
||||
- name: tenant_id
|
||||
@@ -375,6 +483,13 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"503":
|
||||
description: Temporarily unavailable, please retry.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
/v1/tenant/{tenant_id}/attach:
|
||||
parameters:
|
||||
- name: tenant_id
|
||||
@@ -454,7 +569,17 @@ paths:
|
||||
schema:
|
||||
$ref: "#/components/schemas/NotFoundError"
|
||||
"409":
|
||||
description: Tenant download is already in progress
|
||||
description: |
|
||||
The tenant is already known to Pageserver in some way,
|
||||
and hence this `/attach` call has been rejected.
|
||||
|
||||
Some examples of how this can happen:
|
||||
- tenant was created on this pageserver
|
||||
- tenant attachment was started by an earlier call to `/attach`.
|
||||
|
||||
Callers should poll the tenant status's `attachment_status` field,
|
||||
like for status 202. See the longer description for `POST /attach`
|
||||
for details.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
@@ -465,6 +590,13 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"503":
|
||||
description: Temporarily unavailable, please retry.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
|
||||
/v1/tenant/{tenant_id}/detach:
|
||||
parameters:
|
||||
@@ -518,6 +650,13 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"503":
|
||||
description: Temporarily unavailable, please retry.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
|
||||
/v1/tenant/{tenant_id}/ignore:
|
||||
parameters:
|
||||
@@ -560,6 +699,13 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"503":
|
||||
description: Temporarily unavailable, please retry.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
|
||||
/v1/tenant/{tenant_id}/load:
|
||||
parameters:
|
||||
@@ -577,6 +723,12 @@ paths:
|
||||
|
||||
Errors if the tenant is absent on disk, already present in memory or fails to schedule its load.
|
||||
Scheduling a load does not mean that the tenant would load successfully, check tenant status to ensure load correctness.
|
||||
requestBody:
|
||||
required: false
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/TenantLoadRequest"
|
||||
responses:
|
||||
"202":
|
||||
description: Tenant scheduled to load successfully
|
||||
@@ -604,6 +756,13 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"503":
|
||||
description: Temporarily unavailable, please retry.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
|
||||
/v1/tenant/{tenant_id}/synthetic_size:
|
||||
parameters:
|
||||
@@ -641,6 +800,12 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"503":
|
||||
description: Temporarily unavailable, please retry.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
/v1/tenant/{tenant_id}/size:
|
||||
parameters:
|
||||
@@ -704,6 +869,13 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"503":
|
||||
description: Temporarily unavailable, please retry.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
|
||||
/v1/tenant/{tenant_id}/timeline/:
|
||||
parameters:
|
||||
@@ -780,6 +952,13 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"503":
|
||||
description: Temporarily unavailable, please retry.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
/v1/tenant/:
|
||||
get:
|
||||
description: Get tenants list
|
||||
@@ -810,6 +989,13 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"503":
|
||||
description: Temporarily unavailable, please retry.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
post:
|
||||
description: |
|
||||
Create a tenant. Returns new tenant id on success.
|
||||
@@ -860,6 +1046,13 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"503":
|
||||
description: Temporarily unavailable, please retry.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
|
||||
/v1/tenant/config:
|
||||
put:
|
||||
@@ -905,6 +1098,13 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"503":
|
||||
description: Temporarily unavailable, please retry.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
/v1/tenant/{tenant_id}/config/:
|
||||
parameters:
|
||||
- name: tenant_id
|
||||
@@ -954,6 +1154,13 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"503":
|
||||
description: Temporarily unavailable, please retry.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
components:
|
||||
securitySchemes:
|
||||
JWT:
|
||||
@@ -1012,6 +1219,15 @@ components:
|
||||
new_tenant_id:
|
||||
type: string
|
||||
format: hex
|
||||
generation:
|
||||
type: integer
|
||||
description: Attachment generation number.
|
||||
TenantLoadRequest:
|
||||
type: object
|
||||
properties:
|
||||
generation:
|
||||
type: integer
|
||||
description: Attachment generation number.
|
||||
TenantAttachRequest:
|
||||
type: object
|
||||
required:
|
||||
@@ -1199,6 +1415,19 @@ components:
|
||||
type: string
|
||||
format: hex
|
||||
|
||||
LsnByTimestampResponse:
|
||||
type: object
|
||||
required:
|
||||
- lsn
|
||||
- kind
|
||||
properties:
|
||||
lsn:
|
||||
type: string
|
||||
format: hex
|
||||
kind:
|
||||
type: string
|
||||
enum: [past, present, future, nodata]
|
||||
|
||||
Error:
|
||||
type: object
|
||||
required:
|
||||
@@ -1220,6 +1449,13 @@ components:
|
||||
properties:
|
||||
msg:
|
||||
type: string
|
||||
ServiceUnavailableError:
|
||||
type: object
|
||||
required:
|
||||
- msg
|
||||
properties:
|
||||
msg:
|
||||
type: string
|
||||
NotFoundError:
|
||||
type: object
|
||||
required:
|
||||
|
||||
@@ -2,10 +2,13 @@
|
||||
//! Management HTTP API
|
||||
//!
|
||||
use std::collections::HashMap;
|
||||
use std::str::FromStr;
|
||||
use std::sync::Arc;
|
||||
|
||||
use anyhow::{anyhow, Context, Result};
|
||||
use futures::TryFutureExt;
|
||||
use humantime::format_rfc3339;
|
||||
use hyper::header;
|
||||
use hyper::StatusCode;
|
||||
use hyper::{Body, Request, Response, Uri};
|
||||
use metrics::launch_timestamp::LaunchTimestamp;
|
||||
@@ -32,7 +35,8 @@ use crate::pgdatadir_mapping::LsnForTimestamp;
|
||||
use crate::task_mgr::TaskKind;
|
||||
use crate::tenant::config::{LocationConf, TenantConfOpt};
|
||||
use crate::tenant::mgr::{
|
||||
GetTenantError, SetNewTenantConfigError, TenantMapInsertError, TenantStateError,
|
||||
GetTenantError, SetNewTenantConfigError, TenantMapError, TenantMapInsertError, TenantSlotError,
|
||||
TenantSlotUpsertError, TenantStateError,
|
||||
};
|
||||
use crate::tenant::size::ModelInputs;
|
||||
use crate::tenant::storage_layer::LayerAccessStatsReset;
|
||||
@@ -76,7 +80,7 @@ impl State {
|
||||
disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
|
||||
deletion_queue_client: DeletionQueueClient,
|
||||
) -> anyhow::Result<Self> {
|
||||
let allowlist_routes = ["/v1/status", "/v1/doc", "/swagger.yml"]
|
||||
let allowlist_routes = ["/v1/status", "/v1/doc", "/swagger.yml", "/metrics"]
|
||||
.iter()
|
||||
.map(|v| v.parse().unwrap())
|
||||
.collect::<Vec<_>>();
|
||||
@@ -133,11 +137,9 @@ impl From<PageReconstructError> for ApiError {
|
||||
ApiError::InternalServerError(anyhow::anyhow!("request was cancelled"))
|
||||
}
|
||||
PageReconstructError::AncestorStopping(_) => {
|
||||
ApiError::ResourceUnavailable(format!("{pre}"))
|
||||
}
|
||||
PageReconstructError::WalRedo(pre) => {
|
||||
ApiError::InternalServerError(anyhow::Error::new(pre))
|
||||
ApiError::ResourceUnavailable(format!("{pre}").into())
|
||||
}
|
||||
PageReconstructError::WalRedo(pre) => ApiError::InternalServerError(pre),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -145,31 +147,59 @@ impl From<PageReconstructError> for ApiError {
|
||||
impl From<TenantMapInsertError> for ApiError {
|
||||
fn from(tmie: TenantMapInsertError) -> ApiError {
|
||||
match tmie {
|
||||
TenantMapInsertError::StillInitializing | TenantMapInsertError::ShuttingDown => {
|
||||
ApiError::ResourceUnavailable(format!("{tmie}"))
|
||||
}
|
||||
TenantMapInsertError::TenantAlreadyExists(id, state) => {
|
||||
ApiError::Conflict(format!("tenant {id} already exists, state: {state:?}"))
|
||||
}
|
||||
TenantMapInsertError::TenantExistsSecondary(id) => {
|
||||
ApiError::Conflict(format!("tenant {id} already exists as secondary"))
|
||||
}
|
||||
TenantMapInsertError::SlotError(e) => e.into(),
|
||||
TenantMapInsertError::SlotUpsertError(e) => e.into(),
|
||||
TenantMapInsertError::Other(e) => ApiError::InternalServerError(e),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<TenantSlotError> for ApiError {
|
||||
fn from(e: TenantSlotError) -> ApiError {
|
||||
use TenantSlotError::*;
|
||||
match e {
|
||||
NotFound(tenant_id) => {
|
||||
ApiError::NotFound(anyhow::anyhow!("NotFound: tenant {tenant_id}").into())
|
||||
}
|
||||
e @ (AlreadyExists(_, _) | Conflict(_)) => ApiError::Conflict(format!("{e}")),
|
||||
InProgress => {
|
||||
ApiError::ResourceUnavailable("Tenant is being modified concurrently".into())
|
||||
}
|
||||
MapState(e) => e.into(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<TenantSlotUpsertError> for ApiError {
|
||||
fn from(e: TenantSlotUpsertError) -> ApiError {
|
||||
use TenantSlotUpsertError::*;
|
||||
match e {
|
||||
InternalError(e) => ApiError::InternalServerError(anyhow::anyhow!("{e}")),
|
||||
MapState(e) => e.into(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<TenantMapError> for ApiError {
|
||||
fn from(e: TenantMapError) -> ApiError {
|
||||
use TenantMapError::*;
|
||||
match e {
|
||||
StillInitializing | ShuttingDown => {
|
||||
ApiError::ResourceUnavailable(format!("{e}").into())
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<TenantStateError> for ApiError {
|
||||
fn from(tse: TenantStateError) -> ApiError {
|
||||
match tse {
|
||||
TenantStateError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid).into()),
|
||||
TenantStateError::NotActive(_) => {
|
||||
ApiError::ResourceUnavailable("Tenant not yet active".into())
|
||||
}
|
||||
TenantStateError::IsStopping(_) => {
|
||||
ApiError::ResourceUnavailable("Tenant is stopping".into())
|
||||
}
|
||||
_ => ApiError::InternalServerError(anyhow::Error::new(tse)),
|
||||
TenantStateError::SlotError(e) => e.into(),
|
||||
TenantStateError::SlotUpsertError(e) => e.into(),
|
||||
TenantStateError::Other(e) => ApiError::InternalServerError(anyhow!(e)),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -190,6 +220,7 @@ impl From<GetTenantError> for ApiError {
|
||||
// (We can produce this variant only in `mgr::get_tenant(..., active=true)` calls).
|
||||
ApiError::ResourceUnavailable("Tenant not yet active".into())
|
||||
}
|
||||
GetTenantError::MapState(e) => ApiError::ResourceUnavailable(format!("{e}").into()),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -244,6 +275,9 @@ impl From<crate::tenant::delete::DeleteTenantError> for ApiError {
|
||||
Get(g) => ApiError::from(g),
|
||||
e @ AlreadyInProgress => ApiError::Conflict(e.to_string()),
|
||||
Timeline(t) => ApiError::from(t),
|
||||
NotAttached => ApiError::NotFound(anyhow::anyhow!("Tenant is not attached").into()),
|
||||
SlotError(e) => e.into(),
|
||||
SlotUpsertError(e) => e.into(),
|
||||
Other(o) => ApiError::InternalServerError(o),
|
||||
e @ InvalidState(_) => ApiError::PreconditionFailed(e.to_string().into_boxed_str()),
|
||||
}
|
||||
@@ -370,7 +404,7 @@ async fn timeline_create_handler(
|
||||
let state = get_state(&request);
|
||||
|
||||
async {
|
||||
let tenant = mgr::get_tenant(tenant_id, true).await?;
|
||||
let tenant = mgr::get_tenant(tenant_id, true)?;
|
||||
match tenant.create_timeline(
|
||||
new_timeline_id,
|
||||
request_data.ancestor_timeline_id.map(TimelineId::from),
|
||||
@@ -395,6 +429,12 @@ async fn timeline_create_handler(
|
||||
format!("{err:#}")
|
||||
))
|
||||
}
|
||||
Err(e @ tenant::CreateTimelineError::AncestorNotActive) => {
|
||||
json_response(StatusCode::SERVICE_UNAVAILABLE, HttpErrorBody::from_msg(e.to_string()))
|
||||
}
|
||||
Err(tenant::CreateTimelineError::ShuttingDown) => {
|
||||
json_response(StatusCode::SERVICE_UNAVAILABLE,HttpErrorBody::from_msg("tenant shutting down".to_string()))
|
||||
}
|
||||
Err(tenant::CreateTimelineError::Other(err)) => Err(ApiError::InternalServerError(err)),
|
||||
}
|
||||
}
|
||||
@@ -414,7 +454,7 @@ async fn timeline_list_handler(
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
|
||||
let response_data = async {
|
||||
let tenant = mgr::get_tenant(tenant_id, true).await?;
|
||||
let tenant = mgr::get_tenant(tenant_id, true)?;
|
||||
let timelines = tenant.list_timelines();
|
||||
|
||||
let mut response_data = Vec::with_capacity(timelines.len());
|
||||
@@ -453,7 +493,7 @@ async fn timeline_detail_handler(
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
|
||||
let timeline_info = async {
|
||||
let tenant = mgr::get_tenant(tenant_id, true).await?;
|
||||
let tenant = mgr::get_tenant(tenant_id, true)?;
|
||||
|
||||
let timeline = tenant
|
||||
.get_timeline(timeline_id, false)
|
||||
@@ -483,6 +523,8 @@ async fn get_lsn_by_timestamp_handler(
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let version: Option<u8> = parse_query_param(&request, "version")?;
|
||||
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
let timestamp_raw = must_get_query_param(&request, "timestamp")?;
|
||||
let timestamp = humantime::parse_rfc3339(×tamp_raw)
|
||||
@@ -494,13 +536,57 @@ async fn get_lsn_by_timestamp_handler(
|
||||
let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
|
||||
let result = timeline.find_lsn_for_timestamp(timestamp_pg, &ctx).await?;
|
||||
|
||||
let result = match result {
|
||||
LsnForTimestamp::Present(lsn) => format!("{lsn}"),
|
||||
LsnForTimestamp::Future(_lsn) => "future".into(),
|
||||
LsnForTimestamp::Past(_lsn) => "past".into(),
|
||||
LsnForTimestamp::NoData(_lsn) => "nodata".into(),
|
||||
};
|
||||
json_response(StatusCode::OK, result)
|
||||
if version.unwrap_or(0) > 1 {
|
||||
#[derive(serde::Serialize)]
|
||||
struct Result {
|
||||
lsn: Lsn,
|
||||
kind: &'static str,
|
||||
}
|
||||
let (lsn, kind) = match result {
|
||||
LsnForTimestamp::Present(lsn) => (lsn, "present"),
|
||||
LsnForTimestamp::Future(lsn) => (lsn, "future"),
|
||||
LsnForTimestamp::Past(lsn) => (lsn, "past"),
|
||||
LsnForTimestamp::NoData(lsn) => (lsn, "nodata"),
|
||||
};
|
||||
json_response(StatusCode::OK, Result { lsn, kind })
|
||||
} else {
|
||||
// FIXME: this is a temporary crutch not to break backwards compatibility
|
||||
// See https://github.com/neondatabase/neon/pull/5608
|
||||
let result = match result {
|
||||
LsnForTimestamp::Present(lsn) => format!("{lsn}"),
|
||||
LsnForTimestamp::Future(_lsn) => "future".into(),
|
||||
LsnForTimestamp::Past(_lsn) => "past".into(),
|
||||
LsnForTimestamp::NoData(_lsn) => "nodata".into(),
|
||||
};
|
||||
json_response(StatusCode::OK, result)
|
||||
}
|
||||
}
|
||||
|
||||
async fn get_timestamp_of_lsn_handler(
|
||||
request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
|
||||
let lsn_str = must_get_query_param(&request, "lsn")?;
|
||||
let lsn = Lsn::from_str(&lsn_str)
|
||||
.with_context(|| format!("Invalid LSN: {lsn_str:?}"))
|
||||
.map_err(ApiError::BadRequest)?;
|
||||
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
|
||||
let result = timeline.get_timestamp_for_lsn(lsn, &ctx).await?;
|
||||
|
||||
match result {
|
||||
Some(time) => {
|
||||
let time = format_rfc3339(postgres_ffi::from_pg_timestamp(time)).to_string();
|
||||
json_response(StatusCode::OK, time)
|
||||
}
|
||||
None => json_response(StatusCode::NOT_FOUND, ()),
|
||||
}
|
||||
}
|
||||
|
||||
async fn tenant_attach_handler(
|
||||
@@ -571,9 +657,14 @@ async fn tenant_detach_handler(
|
||||
|
||||
let state = get_state(&request);
|
||||
let conf = state.conf;
|
||||
mgr::detach_tenant(conf, tenant_id, detach_ignored.unwrap_or(false))
|
||||
.instrument(info_span!("tenant_detach", %tenant_id))
|
||||
.await?;
|
||||
mgr::detach_tenant(
|
||||
conf,
|
||||
tenant_id,
|
||||
detach_ignored.unwrap_or(false),
|
||||
&state.deletion_queue_client,
|
||||
)
|
||||
.instrument(info_span!("tenant_detach", %tenant_id))
|
||||
.await?;
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
@@ -636,7 +727,7 @@ async fn tenant_list_handler(
|
||||
.instrument(info_span!("tenant_list"))
|
||||
.await
|
||||
.map_err(|_| {
|
||||
ApiError::ResourceUnavailable("Tenant map is initializing or shutting down".to_string())
|
||||
ApiError::ResourceUnavailable("Tenant map is initializing or shutting down".into())
|
||||
})?
|
||||
.iter()
|
||||
.map(|(id, state)| TenantInfo {
|
||||
@@ -658,7 +749,7 @@ async fn tenant_status(
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let tenant_info = async {
|
||||
let tenant = mgr::get_tenant(tenant_id, false).await?;
|
||||
let tenant = mgr::get_tenant(tenant_id, false)?;
|
||||
|
||||
// Calculate total physical size of all timelines
|
||||
let mut current_physical_size = 0;
|
||||
@@ -721,7 +812,7 @@ async fn tenant_size_handler(
|
||||
let headers = request.headers();
|
||||
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
let tenant = mgr::get_tenant(tenant_id, true).await?;
|
||||
let tenant = mgr::get_tenant(tenant_id, true)?;
|
||||
|
||||
// this can be long operation
|
||||
let inputs = tenant
|
||||
@@ -734,6 +825,10 @@ async fn tenant_size_handler(
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
let mut sizes = None;
|
||||
let accepts_html = headers
|
||||
.get(header::ACCEPT)
|
||||
.map(|v| v == "text/html")
|
||||
.unwrap_or_default();
|
||||
if !inputs_only.unwrap_or(false) {
|
||||
let storage_model = inputs
|
||||
.calculate_model()
|
||||
@@ -741,21 +836,19 @@ async fn tenant_size_handler(
|
||||
let size = storage_model.calculate();
|
||||
|
||||
// If request header expects html, return html
|
||||
if headers["Accept"] == "text/html" {
|
||||
if accepts_html {
|
||||
return synthetic_size_html_response(inputs, storage_model, size);
|
||||
}
|
||||
sizes = Some(size);
|
||||
} else if headers["Accept"] == "text/html" {
|
||||
} else if accepts_html {
|
||||
return Err(ApiError::BadRequest(anyhow!(
|
||||
"inputs_only parameter is incompatible with html output request"
|
||||
)));
|
||||
}
|
||||
|
||||
/// The type resides in the pageserver not to expose `ModelInputs`.
|
||||
#[serde_with::serde_as]
|
||||
#[derive(serde::Serialize)]
|
||||
struct TenantHistorySize {
|
||||
#[serde_as(as = "serde_with::DisplayFromStr")]
|
||||
id: TenantId,
|
||||
/// Size is a mixture of WAL and logical size, so the unit is bytes.
|
||||
///
|
||||
@@ -896,7 +989,7 @@ fn synthetic_size_html_response(
|
||||
pub fn html_response(status: StatusCode, data: String) -> Result<Response<Body>, ApiError> {
|
||||
let response = Response::builder()
|
||||
.status(status)
|
||||
.header(hyper::header::CONTENT_TYPE, "text/html")
|
||||
.header(header::CONTENT_TYPE, "text/html")
|
||||
.body(Body::from(data.as_bytes().to_vec()))
|
||||
.map_err(|e| ApiError::InternalServerError(e.into()))?;
|
||||
Ok(response)
|
||||
@@ -976,7 +1069,7 @@ async fn get_tenant_config_handler(
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let tenant = mgr::get_tenant(tenant_id, false).await?;
|
||||
let tenant = mgr::get_tenant(tenant_id, false)?;
|
||||
|
||||
let response = HashMap::from([
|
||||
(
|
||||
@@ -1030,9 +1123,17 @@ async fn put_tenant_location_config_handler(
|
||||
// The `Detached` state is special, it doesn't upsert a tenant, it removes
|
||||
// its local disk content and drops it from memory.
|
||||
if let LocationConfigMode::Detached = request_data.config.mode {
|
||||
mgr::detach_tenant(conf, tenant_id, true)
|
||||
if let Err(e) = mgr::detach_tenant(conf, tenant_id, true, &state.deletion_queue_client)
|
||||
.instrument(info_span!("tenant_detach", %tenant_id))
|
||||
.await?;
|
||||
.await
|
||||
{
|
||||
match e {
|
||||
TenantStateError::SlotError(TenantSlotError::NotFound(_)) => {
|
||||
// This API is idempotent: a NotFound on a detach is fine.
|
||||
}
|
||||
_ => return Err(e.into()),
|
||||
}
|
||||
}
|
||||
return json_response(StatusCode::OK, ());
|
||||
}
|
||||
|
||||
@@ -1065,7 +1166,6 @@ async fn handle_tenant_break(
|
||||
let tenant_id: TenantId = parse_request_param(&r, "tenant_id")?;
|
||||
|
||||
let tenant = crate::tenant::mgr::get_tenant(tenant_id, true)
|
||||
.await
|
||||
.map_err(|_| ApiError::Conflict(String::from("no active tenant found")))?;
|
||||
|
||||
tenant.set_broken("broken from test".to_owned()).await;
|
||||
@@ -1138,7 +1238,7 @@ async fn timeline_compact_handler(
|
||||
timeline
|
||||
.compact(&cancel, &ctx)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
.map_err(|e| ApiError::InternalServerError(e.into()))?;
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
.instrument(info_span!("manual_compaction", %tenant_id, %timeline_id))
|
||||
@@ -1163,7 +1263,7 @@ async fn timeline_checkpoint_handler(
|
||||
timeline
|
||||
.compact(&cancel, &ctx)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
.map_err(|e| ApiError::InternalServerError(e.into()))?;
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
@@ -1236,11 +1336,141 @@ async fn deletion_queue_flush(
|
||||
}
|
||||
}
|
||||
|
||||
/// Try if `GetPage@Lsn` is successful, useful for manual debugging.
|
||||
async fn getpage_at_lsn_handler(
|
||||
request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
struct Key(crate::repository::Key);
|
||||
|
||||
impl std::str::FromStr for Key {
|
||||
type Err = anyhow::Error;
|
||||
|
||||
fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {
|
||||
crate::repository::Key::from_hex(s).map(Key)
|
||||
}
|
||||
}
|
||||
|
||||
let key: Key = parse_query_param(&request, "key")?
|
||||
.ok_or_else(|| ApiError::BadRequest(anyhow!("missing 'key' query parameter")))?;
|
||||
let lsn: Lsn = parse_query_param(&request, "lsn")?
|
||||
.ok_or_else(|| ApiError::BadRequest(anyhow!("missing 'lsn' query parameter")))?;
|
||||
|
||||
async {
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
|
||||
|
||||
let page = timeline.get(key.0, lsn, &ctx).await?;
|
||||
|
||||
Result::<_, ApiError>::Ok(
|
||||
Response::builder()
|
||||
.status(StatusCode::OK)
|
||||
.header(header::CONTENT_TYPE, "application/octet-stream")
|
||||
.body(hyper::Body::from(page))
|
||||
.unwrap(),
|
||||
)
|
||||
}
|
||||
.instrument(info_span!("timeline_get", %tenant_id, %timeline_id))
|
||||
.await
|
||||
}
|
||||
|
||||
async fn timeline_collect_keyspace(
|
||||
request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
struct Partitioning {
|
||||
keys: crate::keyspace::KeySpace,
|
||||
|
||||
at_lsn: Lsn,
|
||||
}
|
||||
|
||||
impl serde::Serialize for Partitioning {
|
||||
fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
|
||||
where
|
||||
S: serde::Serializer,
|
||||
{
|
||||
use serde::ser::SerializeMap;
|
||||
let mut map = serializer.serialize_map(Some(2))?;
|
||||
map.serialize_key("keys")?;
|
||||
map.serialize_value(&KeySpace(&self.keys))?;
|
||||
map.serialize_key("at_lsn")?;
|
||||
map.serialize_value(&WithDisplay(&self.at_lsn))?;
|
||||
map.end()
|
||||
}
|
||||
}
|
||||
|
||||
struct WithDisplay<'a, T>(&'a T);
|
||||
|
||||
impl<'a, T: std::fmt::Display> serde::Serialize for WithDisplay<'a, T> {
|
||||
fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
|
||||
where
|
||||
S: serde::Serializer,
|
||||
{
|
||||
serializer.collect_str(&self.0)
|
||||
}
|
||||
}
|
||||
|
||||
struct KeySpace<'a>(&'a crate::keyspace::KeySpace);
|
||||
|
||||
impl<'a> serde::Serialize for KeySpace<'a> {
|
||||
fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
|
||||
where
|
||||
S: serde::Serializer,
|
||||
{
|
||||
use serde::ser::SerializeSeq;
|
||||
let mut seq = serializer.serialize_seq(Some(self.0.ranges.len()))?;
|
||||
for kr in &self.0.ranges {
|
||||
seq.serialize_element(&KeyRange(kr))?;
|
||||
}
|
||||
seq.end()
|
||||
}
|
||||
}
|
||||
|
||||
struct KeyRange<'a>(&'a std::ops::Range<crate::repository::Key>);
|
||||
|
||||
impl<'a> serde::Serialize for KeyRange<'a> {
|
||||
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
|
||||
where
|
||||
S: serde::Serializer,
|
||||
{
|
||||
use serde::ser::SerializeTuple;
|
||||
let mut t = serializer.serialize_tuple(2)?;
|
||||
t.serialize_element(&WithDisplay(&self.0.start))?;
|
||||
t.serialize_element(&WithDisplay(&self.0.end))?;
|
||||
t.end()
|
||||
}
|
||||
}
|
||||
|
||||
let at_lsn: Option<Lsn> = parse_query_param(&request, "at_lsn")?;
|
||||
|
||||
async {
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
|
||||
let at_lsn = at_lsn.unwrap_or_else(|| timeline.get_last_record_lsn());
|
||||
let keys = timeline
|
||||
.collect_keyspace(at_lsn, &ctx)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
json_response(StatusCode::OK, Partitioning { keys, at_lsn })
|
||||
}
|
||||
.instrument(info_span!("timeline_collect_keyspace", %tenant_id, %timeline_id))
|
||||
.await
|
||||
}
|
||||
|
||||
async fn active_timeline_of_active_tenant(
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
) -> Result<Arc<Timeline>, ApiError> {
|
||||
let tenant = mgr::get_tenant(tenant_id, true).await?;
|
||||
let tenant = mgr::get_tenant(tenant_id, true)?;
|
||||
tenant
|
||||
.get_timeline(timeline_id, true)
|
||||
.map_err(|e| ApiError::NotFound(e.into()))
|
||||
@@ -1303,11 +1533,11 @@ async fn disk_usage_eviction_run(
|
||||
|
||||
let state = get_state(&r);
|
||||
|
||||
let Some(storage) = state.remote_storage.clone() else {
|
||||
if state.remote_storage.as_ref().is_none() {
|
||||
return Err(ApiError::InternalServerError(anyhow::anyhow!(
|
||||
"remote storage not configured, cannot run eviction iteration"
|
||||
)));
|
||||
};
|
||||
}
|
||||
|
||||
let state = state.disk_usage_eviction_state.clone();
|
||||
|
||||
@@ -1325,7 +1555,6 @@ async fn disk_usage_eviction_run(
|
||||
async move {
|
||||
let res = crate::disk_usage_eviction_task::disk_usage_eviction_task_iteration_impl(
|
||||
&state,
|
||||
&storage,
|
||||
usage,
|
||||
&child_cancel,
|
||||
)
|
||||
@@ -1538,6 +1767,10 @@ pub fn make_router(
|
||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/get_lsn_by_timestamp",
|
||||
|r| api_handler(r, get_lsn_by_timestamp_handler),
|
||||
)
|
||||
.get(
|
||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/get_timestamp_of_lsn",
|
||||
|r| api_handler(r, get_timestamp_of_lsn_handler),
|
||||
)
|
||||
.put("/v1/tenant/:tenant_id/timeline/:timeline_id/do_gc", |r| {
|
||||
api_handler(r, timeline_gc_handler)
|
||||
})
|
||||
@@ -1583,5 +1816,12 @@ pub fn make_router(
|
||||
.post("/v1/tracing/event", |r| {
|
||||
testing_api_handler("emit a tracing event", r, post_tracing_event_handler)
|
||||
})
|
||||
.get("/v1/tenant/:tenant_id/timeline/:timeline_id/getpage", |r| {
|
||||
testing_api_handler("getpage@lsn", r, getpage_at_lsn_handler)
|
||||
})
|
||||
.get(
|
||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/keyspace",
|
||||
|r| testing_api_handler("read out the keyspace", r, timeline_collect_keyspace),
|
||||
)
|
||||
.any(handler_404))
|
||||
}
|
||||
|
||||
@@ -61,14 +61,6 @@ pub async fn shutdown_pageserver(deletion_queue: Option<DeletionQueue>, exit_cod
|
||||
)
|
||||
.await;
|
||||
|
||||
// Shut down any page service tasks.
|
||||
timed(
|
||||
task_mgr::shutdown_tasks(Some(TaskKind::PageRequestHandler), None, None),
|
||||
"shutdown PageRequestHandlers",
|
||||
Duration::from_secs(1),
|
||||
)
|
||||
.await;
|
||||
|
||||
// Shut down all the tenants. This flushes everything to disk and kills
|
||||
// the checkpoint and GC tasks.
|
||||
timed(
|
||||
@@ -78,6 +70,15 @@ pub async fn shutdown_pageserver(deletion_queue: Option<DeletionQueue>, exit_cod
|
||||
)
|
||||
.await;
|
||||
|
||||
// Shut down any page service tasks: any in-progress work for particular timelines or tenants
|
||||
// should already have been canclled via mgr::shutdown_all_tenants
|
||||
timed(
|
||||
task_mgr::shutdown_tasks(Some(TaskKind::PageRequestHandler), None, None),
|
||||
"shutdown PageRequestHandlers",
|
||||
Duration::from_secs(1),
|
||||
)
|
||||
.await;
|
||||
|
||||
// Best effort to persist any outstanding deletions, to avoid leaking objects
|
||||
if let Some(mut deletion_queue) = deletion_queue {
|
||||
deletion_queue.shutdown(Duration::from_secs(5)).await;
|
||||
@@ -149,6 +150,10 @@ fn ends_with_suffix(path: &Utf8Path, suffix: &str) -> bool {
|
||||
}
|
||||
}
|
||||
|
||||
// FIXME: DO NOT ADD new query methods like this, which will have a next step of parsing timelineid
|
||||
// from the directory name. Instead create type "UninitMark(TimelineId)" and only parse it once
|
||||
// from the name.
|
||||
|
||||
pub fn is_uninit_mark(path: &Utf8Path) -> bool {
|
||||
ends_with_suffix(path, TIMELINE_UNINIT_MARK_SUFFIX)
|
||||
}
|
||||
@@ -173,6 +178,9 @@ fn is_walkdir_io_not_found(e: &walkdir::Error) -> bool {
|
||||
/// delaying is needed.
|
||||
#[derive(Clone)]
|
||||
pub struct InitializationOrder {
|
||||
/// Each initial tenant load task carries this until it is done loading timelines from remote storage
|
||||
pub initial_tenant_load_remote: Option<utils::completion::Completion>,
|
||||
|
||||
/// Each initial tenant load task carries this until completion.
|
||||
pub initial_tenant_load: Option<utils::completion::Completion>,
|
||||
|
||||
|
||||
@@ -691,10 +691,9 @@ impl StorageIoTime {
|
||||
.expect("failed to define a metric");
|
||||
let metrics = std::array::from_fn(|i| {
|
||||
let op = StorageIoOperation::from_repr(i).unwrap();
|
||||
let metric = storage_io_histogram_vec
|
||||
storage_io_histogram_vec
|
||||
.get_metric_with_label_values(&[op.as_str()])
|
||||
.unwrap();
|
||||
metric
|
||||
.unwrap()
|
||||
});
|
||||
Self { metrics }
|
||||
}
|
||||
@@ -963,10 +962,37 @@ static REMOTE_TIMELINE_CLIENT_BYTES_FINISHED_COUNTER: Lazy<IntCounterVec> = Lazy
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) struct TenantManagerMetrics {
|
||||
pub(crate) tenant_slots: UIntGauge,
|
||||
pub(crate) tenant_slot_writes: IntCounter,
|
||||
pub(crate) unexpected_errors: IntCounter,
|
||||
}
|
||||
|
||||
pub(crate) static TENANT_MANAGER: Lazy<TenantManagerMetrics> = Lazy::new(|| {
|
||||
TenantManagerMetrics {
|
||||
tenant_slots: register_uint_gauge!(
|
||||
"pageserver_tenant_manager_slots",
|
||||
"How many slots currently exist, including all attached, secondary and in-progress operations",
|
||||
)
|
||||
.expect("failed to define a metric"),
|
||||
tenant_slot_writes: register_int_counter!(
|
||||
"pageserver_tenant_manager_slot_writes",
|
||||
"Writes to a tenant slot, including all of create/attach/detach/delete"
|
||||
)
|
||||
.expect("failed to define a metric"),
|
||||
unexpected_errors: register_int_counter!(
|
||||
"pageserver_tenant_manager_unexpected_errors_total",
|
||||
"Number of unexpected conditions encountered: nonzero value indicates a non-fatal bug."
|
||||
)
|
||||
.expect("failed to define a metric"),
|
||||
}
|
||||
});
|
||||
|
||||
pub(crate) struct DeletionQueueMetrics {
|
||||
pub(crate) keys_submitted: IntCounter,
|
||||
pub(crate) keys_dropped: IntCounter,
|
||||
pub(crate) keys_executed: IntCounter,
|
||||
pub(crate) keys_validated: IntCounter,
|
||||
pub(crate) dropped_lsn_updates: IntCounter,
|
||||
pub(crate) unexpected_errors: IntCounter,
|
||||
pub(crate) remote_errors: IntCounterVec,
|
||||
@@ -988,7 +1014,13 @@ pub(crate) static DELETION_QUEUE: Lazy<DeletionQueueMetrics> = Lazy::new(|| {
|
||||
|
||||
keys_executed: register_int_counter!(
|
||||
"pageserver_deletion_queue_executed_total",
|
||||
"Number of objects deleted. Only includes objects that we actually deleted, sum with pageserver_deletion_queue_dropped_total for the total number of keys processed."
|
||||
"Number of objects deleted. Only includes objects that we actually deleted, sum with pageserver_deletion_queue_dropped_total for the total number of keys processed to completion"
|
||||
)
|
||||
.expect("failed to define a metric"),
|
||||
|
||||
keys_validated: register_int_counter!(
|
||||
"pageserver_deletion_queue_validated_total",
|
||||
"Number of keys validated for deletion. Sum with pageserver_deletion_queue_dropped_total for the total number of keys that have passed through the validation stage."
|
||||
)
|
||||
.expect("failed to define a metric"),
|
||||
|
||||
@@ -1061,6 +1093,26 @@ pub(crate) static TENANT_TASK_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
.expect("Failed to register tenant_task_events metric")
|
||||
});
|
||||
|
||||
pub(crate) static BACKGROUND_LOOP_SEMAPHORE_WAIT_START_COUNT: Lazy<IntCounterVec> =
|
||||
Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"pageserver_background_loop_semaphore_wait_start_count",
|
||||
"Counter for background loop concurrency-limiting semaphore acquire calls started",
|
||||
&["task"],
|
||||
)
|
||||
.unwrap()
|
||||
});
|
||||
|
||||
pub(crate) static BACKGROUND_LOOP_SEMAPHORE_WAIT_FINISH_COUNT: Lazy<IntCounterVec> =
|
||||
Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"pageserver_background_loop_semaphore_wait_finish_count",
|
||||
"Counter for background loop concurrency-limiting semaphore acquire calls finished",
|
||||
&["task"],
|
||||
)
|
||||
.unwrap()
|
||||
});
|
||||
|
||||
pub(crate) static BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"pageserver_background_loop_period_overrun_count",
|
||||
@@ -1362,28 +1414,23 @@ impl TimelineMetrics {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn record_new_file_metrics(&self, sz: u64) {
|
||||
pub(crate) fn record_new_file_metrics(&self, sz: u64) {
|
||||
self.resident_physical_size_add(sz);
|
||||
self.num_persistent_files_created.inc_by(1);
|
||||
self.persistent_bytes_written.inc_by(sz);
|
||||
}
|
||||
|
||||
pub fn resident_physical_size_sub(&self, sz: u64) {
|
||||
pub(crate) fn resident_physical_size_sub(&self, sz: u64) {
|
||||
self.resident_physical_size_gauge.sub(sz);
|
||||
crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(sz);
|
||||
}
|
||||
|
||||
pub fn resident_physical_size_add(&self, sz: u64) {
|
||||
pub(crate) fn resident_physical_size_add(&self, sz: u64) {
|
||||
self.resident_physical_size_gauge.add(sz);
|
||||
crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.add(sz);
|
||||
}
|
||||
|
||||
pub fn resident_physical_size_set(&self, sz: u64) {
|
||||
self.resident_physical_size_gauge.set(sz);
|
||||
crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.set(sz);
|
||||
}
|
||||
|
||||
pub fn resident_physical_size_get(&self) -> u64 {
|
||||
pub(crate) fn resident_physical_size_get(&self) -> u64 {
|
||||
self.resident_physical_size_gauge.get()
|
||||
}
|
||||
}
|
||||
@@ -1863,6 +1910,9 @@ pub fn preinitialize_metrics() {
|
||||
// Deletion queue stats
|
||||
Lazy::force(&DELETION_QUEUE);
|
||||
|
||||
// Tenant manager stats
|
||||
Lazy::force(&TENANT_MANAGER);
|
||||
|
||||
// countervecs
|
||||
[&BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT]
|
||||
.into_iter()
|
||||
|
||||
@@ -66,8 +66,7 @@
|
||||
//! inserted to the mapping, but you must hold the write-lock on the slot until
|
||||
//! the contents are valid. If you need to release the lock without initializing
|
||||
//! the contents, you must remove the mapping first. We make that easy for the
|
||||
//! callers with PageWriteGuard: when lock_for_write() returns an uninitialized
|
||||
//! page, the caller must explicitly call guard.mark_valid() after it has
|
||||
//! callers with PageWriteGuard: the caller must explicitly call guard.mark_valid() after it has
|
||||
//! initialized it. If the guard is dropped without calling mark_valid(), the
|
||||
//! mapping is automatically removed and the slot is marked free.
|
||||
//!
|
||||
@@ -286,23 +285,25 @@ impl AsRef<[u8; PAGE_SZ]> for PageReadGuard<'_> {
|
||||
///
|
||||
/// Counterintuitively, this is used even for a read, if the requested page is not
|
||||
/// currently found in the page cache. In that case, the caller of lock_for_read()
|
||||
/// is expected to fill in the page contents and call mark_valid(). Similarly
|
||||
/// lock_for_write() can return an invalid buffer that the caller is expected to
|
||||
/// to initialize.
|
||||
///
|
||||
/// is expected to fill in the page contents and call mark_valid().
|
||||
pub struct PageWriteGuard<'i> {
|
||||
inner: tokio::sync::RwLockWriteGuard<'i, SlotInner>,
|
||||
state: PageWriteGuardState<'i>,
|
||||
}
|
||||
|
||||
_permit: PinnedSlotsPermit,
|
||||
|
||||
// Are the page contents currently valid?
|
||||
// Used to mark pages as invalid that are assigned but not yet filled with data.
|
||||
valid: bool,
|
||||
enum PageWriteGuardState<'i> {
|
||||
Invalid {
|
||||
inner: tokio::sync::RwLockWriteGuard<'i, SlotInner>,
|
||||
_permit: PinnedSlotsPermit,
|
||||
},
|
||||
Downgraded,
|
||||
}
|
||||
|
||||
impl std::ops::DerefMut for PageWriteGuard<'_> {
|
||||
fn deref_mut(&mut self) -> &mut Self::Target {
|
||||
self.inner.buf
|
||||
match &mut self.state {
|
||||
PageWriteGuardState::Invalid { inner, _permit } => inner.buf,
|
||||
PageWriteGuardState::Downgraded => unreachable!(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -310,25 +311,28 @@ impl std::ops::Deref for PageWriteGuard<'_> {
|
||||
type Target = [u8; PAGE_SZ];
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
self.inner.buf
|
||||
match &self.state {
|
||||
PageWriteGuardState::Invalid { inner, _permit } => inner.buf,
|
||||
PageWriteGuardState::Downgraded => unreachable!(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl AsMut<[u8; PAGE_SZ]> for PageWriteGuard<'_> {
|
||||
fn as_mut(&mut self) -> &mut [u8; PAGE_SZ] {
|
||||
self.inner.buf
|
||||
}
|
||||
}
|
||||
|
||||
impl PageWriteGuard<'_> {
|
||||
impl<'a> PageWriteGuard<'a> {
|
||||
/// Mark that the buffer contents are now valid.
|
||||
pub fn mark_valid(&mut self) {
|
||||
assert!(self.inner.key.is_some());
|
||||
assert!(
|
||||
!self.valid,
|
||||
"mark_valid called on a buffer that was already valid"
|
||||
);
|
||||
self.valid = true;
|
||||
#[must_use]
|
||||
pub fn mark_valid(mut self) -> PageReadGuard<'a> {
|
||||
let prev = std::mem::replace(&mut self.state, PageWriteGuardState::Downgraded);
|
||||
match prev {
|
||||
PageWriteGuardState::Invalid { inner, _permit } => {
|
||||
assert!(inner.key.is_some());
|
||||
PageReadGuard {
|
||||
_permit: Arc::new(_permit),
|
||||
slot_guard: inner.downgrade(),
|
||||
}
|
||||
}
|
||||
PageWriteGuardState::Downgraded => unreachable!(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -339,11 +343,14 @@ impl Drop for PageWriteGuard<'_> {
|
||||
/// initializing it, remove the mapping from the page cache.
|
||||
///
|
||||
fn drop(&mut self) {
|
||||
assert!(self.inner.key.is_some());
|
||||
if !self.valid {
|
||||
let self_key = self.inner.key.as_ref().unwrap();
|
||||
PAGE_CACHE.get().unwrap().remove_mapping(self_key);
|
||||
self.inner.key = None;
|
||||
match &mut self.state {
|
||||
PageWriteGuardState::Invalid { inner, _permit } => {
|
||||
assert!(inner.key.is_some());
|
||||
let self_key = inner.key.as_ref().unwrap();
|
||||
PAGE_CACHE.get().unwrap().remove_mapping(self_key);
|
||||
inner.key = None;
|
||||
}
|
||||
PageWriteGuardState::Downgraded => {}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -354,12 +361,6 @@ pub enum ReadBufResult<'a> {
|
||||
NotFound(PageWriteGuard<'a>),
|
||||
}
|
||||
|
||||
/// lock_for_write() return value
|
||||
pub enum WriteBufResult<'a> {
|
||||
Found(PageWriteGuard<'a>),
|
||||
NotFound(PageWriteGuard<'a>),
|
||||
}
|
||||
|
||||
impl PageCache {
|
||||
//
|
||||
// Section 1.1: Public interface functions for looking up and memorizing materialized page
|
||||
@@ -446,20 +447,77 @@ impl PageCache {
|
||||
lsn,
|
||||
};
|
||||
|
||||
match self.lock_for_write(&cache_key).await? {
|
||||
WriteBufResult::Found(write_guard) => {
|
||||
// We already had it in cache. Another thread must've put it there
|
||||
// concurrently. Check that it had the same contents that we
|
||||
// replayed.
|
||||
assert!(*write_guard == img);
|
||||
let mut permit = Some(self.try_get_pinned_slot_permit().await?);
|
||||
loop {
|
||||
// First check if the key already exists in the cache.
|
||||
if let Some(slot_idx) = self.search_mapping_exact(&cache_key) {
|
||||
// The page was found in the mapping. Lock the slot, and re-check
|
||||
// that it's still what we expected (because we don't released the mapping
|
||||
// lock already, another thread could have evicted the page)
|
||||
let slot = &self.slots[slot_idx];
|
||||
let inner = slot.inner.write().await;
|
||||
if inner.key.as_ref() == Some(&cache_key) {
|
||||
slot.inc_usage_count();
|
||||
debug_assert!(
|
||||
{
|
||||
let guard = inner.permit.lock().unwrap();
|
||||
guard.upgrade().is_none()
|
||||
},
|
||||
"we hold a write lock, so, no one else should have a permit"
|
||||
);
|
||||
debug_assert_eq!(inner.buf.len(), img.len());
|
||||
// We already had it in cache. Another thread must've put it there
|
||||
// concurrently. Check that it had the same contents that we
|
||||
// replayed.
|
||||
assert!(inner.buf == img);
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
WriteBufResult::NotFound(mut write_guard) => {
|
||||
write_guard.copy_from_slice(img);
|
||||
write_guard.mark_valid();
|
||||
}
|
||||
}
|
||||
debug_assert!(permit.is_some());
|
||||
|
||||
Ok(())
|
||||
// Not found. Find a victim buffer
|
||||
let (slot_idx, mut inner) = self
|
||||
.find_victim(permit.as_ref().unwrap())
|
||||
.await
|
||||
.context("Failed to find evict victim")?;
|
||||
|
||||
// Insert mapping for this. At this point, we may find that another
|
||||
// thread did the same thing concurrently. In that case, we evicted
|
||||
// our victim buffer unnecessarily. Put it into the free list and
|
||||
// continue with the slot that the other thread chose.
|
||||
if let Some(_existing_slot_idx) = self.try_insert_mapping(&cache_key, slot_idx) {
|
||||
// TODO: put to free list
|
||||
|
||||
// We now just loop back to start from beginning. This is not
|
||||
// optimal, we'll perform the lookup in the mapping again, which
|
||||
// is not really necessary because we already got
|
||||
// 'existing_slot_idx'. But this shouldn't happen often enough
|
||||
// to matter much.
|
||||
continue;
|
||||
}
|
||||
|
||||
// Make the slot ready
|
||||
let slot = &self.slots[slot_idx];
|
||||
inner.key = Some(cache_key.clone());
|
||||
slot.set_usage_count(1);
|
||||
// Create a write guard for the slot so we go through the expected motions.
|
||||
debug_assert!(
|
||||
{
|
||||
let guard = inner.permit.lock().unwrap();
|
||||
guard.upgrade().is_none()
|
||||
},
|
||||
"we hold a write lock, so, no one else should have a permit"
|
||||
);
|
||||
let mut write_guard = PageWriteGuard {
|
||||
state: PageWriteGuardState::Invalid {
|
||||
_permit: permit.take().unwrap(),
|
||||
inner,
|
||||
},
|
||||
};
|
||||
write_guard.copy_from_slice(img);
|
||||
let _ = write_guard.mark_valid();
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
|
||||
// Section 1.2: Public interface functions for working with immutable file pages.
|
||||
@@ -638,99 +696,10 @@ impl PageCache {
|
||||
);
|
||||
|
||||
return Ok(ReadBufResult::NotFound(PageWriteGuard {
|
||||
_permit: permit.take().unwrap(),
|
||||
inner,
|
||||
valid: false,
|
||||
}));
|
||||
}
|
||||
}
|
||||
|
||||
/// Look up a page in the cache and lock it in write mode. If it's not
|
||||
/// found, returns None.
|
||||
///
|
||||
/// When locking a page for writing, the search criteria is always "exact".
|
||||
async fn try_lock_for_write(
|
||||
&self,
|
||||
cache_key: &CacheKey,
|
||||
permit: &mut Option<PinnedSlotsPermit>,
|
||||
) -> Option<PageWriteGuard> {
|
||||
if let Some(slot_idx) = self.search_mapping_for_write(cache_key) {
|
||||
// The page was found in the mapping. Lock the slot, and re-check
|
||||
// that it's still what we expected (because we don't released the mapping
|
||||
// lock already, another thread could have evicted the page)
|
||||
let slot = &self.slots[slot_idx];
|
||||
let inner = slot.inner.write().await;
|
||||
if inner.key.as_ref() == Some(cache_key) {
|
||||
slot.inc_usage_count();
|
||||
debug_assert!(
|
||||
{
|
||||
let guard = inner.permit.lock().unwrap();
|
||||
guard.upgrade().is_none()
|
||||
},
|
||||
"we hold a write lock, so, no one else should have a permit"
|
||||
);
|
||||
return Some(PageWriteGuard {
|
||||
state: PageWriteGuardState::Invalid {
|
||||
_permit: permit.take().unwrap(),
|
||||
inner,
|
||||
valid: true,
|
||||
});
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
/// Return a write-locked buffer for given block.
|
||||
///
|
||||
/// Similar to lock_for_read(), but the returned buffer is write-locked and
|
||||
/// may be modified by the caller even if it's already found in the cache.
|
||||
async fn lock_for_write(&self, cache_key: &CacheKey) -> anyhow::Result<WriteBufResult> {
|
||||
let mut permit = Some(self.try_get_pinned_slot_permit().await?);
|
||||
loop {
|
||||
// First check if the key already exists in the cache.
|
||||
if let Some(write_guard) = self.try_lock_for_write(cache_key, &mut permit).await {
|
||||
debug_assert!(permit.is_none());
|
||||
return Ok(WriteBufResult::Found(write_guard));
|
||||
}
|
||||
debug_assert!(permit.is_some());
|
||||
|
||||
// Not found. Find a victim buffer
|
||||
let (slot_idx, mut inner) = self
|
||||
.find_victim(permit.as_ref().unwrap())
|
||||
.await
|
||||
.context("Failed to find evict victim")?;
|
||||
|
||||
// Insert mapping for this. At this point, we may find that another
|
||||
// thread did the same thing concurrently. In that case, we evicted
|
||||
// our victim buffer unnecessarily. Put it into the free list and
|
||||
// continue with the slot that the other thread chose.
|
||||
if let Some(_existing_slot_idx) = self.try_insert_mapping(cache_key, slot_idx) {
|
||||
// TODO: put to free list
|
||||
|
||||
// We now just loop back to start from beginning. This is not
|
||||
// optimal, we'll perform the lookup in the mapping again, which
|
||||
// is not really necessary because we already got
|
||||
// 'existing_slot_idx'. But this shouldn't happen often enough
|
||||
// to matter much.
|
||||
continue;
|
||||
}
|
||||
|
||||
// Make the slot ready
|
||||
let slot = &self.slots[slot_idx];
|
||||
inner.key = Some(cache_key.clone());
|
||||
slot.set_usage_count(1);
|
||||
|
||||
debug_assert!(
|
||||
{
|
||||
let guard = inner.permit.lock().unwrap();
|
||||
guard.upgrade().is_none()
|
||||
},
|
||||
"we hold a write lock, so, no one else should have a permit"
|
||||
);
|
||||
|
||||
return Ok(WriteBufResult::NotFound(PageWriteGuard {
|
||||
_permit: permit.take().unwrap(),
|
||||
inner,
|
||||
valid: false,
|
||||
}));
|
||||
}
|
||||
}
|
||||
@@ -775,7 +744,7 @@ impl PageCache {
|
||||
///
|
||||
/// Like 'search_mapping, but performs an "exact" search. Used for
|
||||
/// allocating a new buffer.
|
||||
fn search_mapping_for_write(&self, key: &CacheKey) -> Option<usize> {
|
||||
fn search_mapping_exact(&self, key: &CacheKey) -> Option<usize> {
|
||||
match key {
|
||||
CacheKey::MaterializedPage { hash_key, lsn } => {
|
||||
let map = self.materialized_page_map.read().unwrap();
|
||||
|
||||
@@ -14,7 +14,6 @@ use async_compression::tokio::write::GzipEncoder;
|
||||
use bytes::Buf;
|
||||
use bytes::Bytes;
|
||||
use futures::Stream;
|
||||
use pageserver_api::models::TenantState;
|
||||
use pageserver_api::models::{
|
||||
PagestreamBeMessage, PagestreamDbSizeRequest, PagestreamDbSizeResponse,
|
||||
PagestreamErrorResponse, PagestreamExistsRequest, PagestreamExistsResponse,
|
||||
@@ -35,6 +34,7 @@ use std::time::Duration;
|
||||
use tokio::io::AsyncWriteExt;
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
use tokio_util::io::StreamReader;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::field;
|
||||
use tracing::*;
|
||||
use utils::id::ConnectionId;
|
||||
@@ -54,78 +54,19 @@ use crate::metrics;
|
||||
use crate::metrics::LIVE_CONNECTIONS_COUNT;
|
||||
use crate::task_mgr;
|
||||
use crate::task_mgr::TaskKind;
|
||||
use crate::tenant;
|
||||
use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
|
||||
use crate::tenant::mgr;
|
||||
use crate::tenant::mgr::GetTenantError;
|
||||
use crate::tenant::{Tenant, Timeline};
|
||||
use crate::tenant::mgr::get_active_tenant_with_timeout;
|
||||
use crate::tenant::mgr::GetActiveTenantError;
|
||||
use crate::tenant::Timeline;
|
||||
use crate::trace::Tracer;
|
||||
|
||||
use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
|
||||
use postgres_ffi::BLCKSZ;
|
||||
|
||||
fn copyin_stream<IO>(pgb: &mut PostgresBackend<IO>) -> impl Stream<Item = io::Result<Bytes>> + '_
|
||||
where
|
||||
IO: AsyncRead + AsyncWrite + Unpin,
|
||||
{
|
||||
async_stream::try_stream! {
|
||||
loop {
|
||||
let msg = tokio::select! {
|
||||
biased;
|
||||
|
||||
_ = task_mgr::shutdown_watcher() => {
|
||||
// We were requested to shut down.
|
||||
let msg = "pageserver is shutting down";
|
||||
let _ = pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, None));
|
||||
Err(QueryError::Other(anyhow::anyhow!(msg)))
|
||||
}
|
||||
|
||||
msg = pgb.read_message() => { msg.map_err(QueryError::from)}
|
||||
};
|
||||
|
||||
match msg {
|
||||
Ok(Some(message)) => {
|
||||
let copy_data_bytes = match message {
|
||||
FeMessage::CopyData(bytes) => bytes,
|
||||
FeMessage::CopyDone => { break },
|
||||
FeMessage::Sync => continue,
|
||||
FeMessage::Terminate => {
|
||||
let msg = "client terminated connection with Terminate message during COPY";
|
||||
let query_error = QueryError::Disconnected(ConnectionError::Io(io::Error::new(io::ErrorKind::ConnectionReset, msg)));
|
||||
// error can't happen here, ErrorResponse serialization should be always ok
|
||||
pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, Some(query_error.pg_error_code()))).map_err(|e| e.into_io_error())?;
|
||||
Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?;
|
||||
break;
|
||||
}
|
||||
m => {
|
||||
let msg = format!("unexpected message {m:?}");
|
||||
// error can't happen here, ErrorResponse serialization should be always ok
|
||||
pgb.write_message_noflush(&BeMessage::ErrorResponse(&msg, None)).map_err(|e| e.into_io_error())?;
|
||||
Err(io::Error::new(io::ErrorKind::Other, msg))?;
|
||||
break;
|
||||
}
|
||||
};
|
||||
|
||||
yield copy_data_bytes;
|
||||
}
|
||||
Ok(None) => {
|
||||
let msg = "client closed connection during COPY";
|
||||
let query_error = QueryError::Disconnected(ConnectionError::Io(io::Error::new(io::ErrorKind::ConnectionReset, msg)));
|
||||
// error can't happen here, ErrorResponse serialization should be always ok
|
||||
pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, Some(query_error.pg_error_code()))).map_err(|e| e.into_io_error())?;
|
||||
pgb.flush().await?;
|
||||
Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?;
|
||||
}
|
||||
Err(QueryError::Disconnected(ConnectionError::Io(io_error))) => {
|
||||
Err(io_error)?;
|
||||
}
|
||||
Err(other) => {
|
||||
Err(io::Error::new(io::ErrorKind::Other, other.to_string()))?;
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
}
|
||||
// How long we may block waiting for a [`TenantSlot::InProgress`]` and/or a [`Tenant`] which
|
||||
// is not yet in state [`TenantState::Active`].
|
||||
const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(5000);
|
||||
|
||||
/// Read the end of a tar archive.
|
||||
///
|
||||
@@ -184,6 +125,7 @@ pub async fn libpq_listener_main(
|
||||
listener: TcpListener,
|
||||
auth_type: AuthType,
|
||||
listener_ctx: RequestContext,
|
||||
cancel: CancellationToken,
|
||||
) -> anyhow::Result<()> {
|
||||
listener.set_nonblocking(true)?;
|
||||
let tokio_listener = tokio::net::TcpListener::from_std(listener)?;
|
||||
@@ -192,7 +134,7 @@ pub async fn libpq_listener_main(
|
||||
while let Some(res) = tokio::select! {
|
||||
biased;
|
||||
|
||||
_ = task_mgr::shutdown_watcher() => {
|
||||
_ = cancel.cancelled() => {
|
||||
// We were requested to shut down.
|
||||
None
|
||||
}
|
||||
@@ -336,6 +278,95 @@ impl PageServerHandler {
|
||||
}
|
||||
}
|
||||
|
||||
/// Wrap PostgresBackend::flush to respect our CancellationToken: it is important to use
|
||||
/// this rather than naked flush() in order to shut down promptly. Without this, we would
|
||||
/// block shutdown of a tenant if a postgres client was failing to consume bytes we send
|
||||
/// in the flush.
|
||||
async fn flush_cancellable<IO>(
|
||||
&self,
|
||||
pgb: &mut PostgresBackend<IO>,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<(), QueryError>
|
||||
where
|
||||
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
|
||||
{
|
||||
tokio::select!(
|
||||
flush_r = pgb.flush() => {
|
||||
Ok(flush_r?)
|
||||
},
|
||||
_ = cancel.cancelled() => {
|
||||
Err(QueryError::Shutdown)
|
||||
}
|
||||
)
|
||||
}
|
||||
|
||||
fn copyin_stream<'a, IO>(
|
||||
&'a self,
|
||||
pgb: &'a mut PostgresBackend<IO>,
|
||||
cancel: &'a CancellationToken,
|
||||
) -> impl Stream<Item = io::Result<Bytes>> + 'a
|
||||
where
|
||||
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
|
||||
{
|
||||
async_stream::try_stream! {
|
||||
loop {
|
||||
let msg = tokio::select! {
|
||||
biased;
|
||||
|
||||
_ = cancel.cancelled() => {
|
||||
// We were requested to shut down.
|
||||
let msg = "pageserver is shutting down";
|
||||
let _ = pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, None));
|
||||
Err(QueryError::Shutdown)
|
||||
}
|
||||
|
||||
msg = pgb.read_message() => { msg.map_err(QueryError::from)}
|
||||
};
|
||||
|
||||
match msg {
|
||||
Ok(Some(message)) => {
|
||||
let copy_data_bytes = match message {
|
||||
FeMessage::CopyData(bytes) => bytes,
|
||||
FeMessage::CopyDone => { break },
|
||||
FeMessage::Sync => continue,
|
||||
FeMessage::Terminate => {
|
||||
let msg = "client terminated connection with Terminate message during COPY";
|
||||
let query_error = QueryError::Disconnected(ConnectionError::Io(io::Error::new(io::ErrorKind::ConnectionReset, msg)));
|
||||
// error can't happen here, ErrorResponse serialization should be always ok
|
||||
pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, Some(query_error.pg_error_code()))).map_err(|e| e.into_io_error())?;
|
||||
Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?;
|
||||
break;
|
||||
}
|
||||
m => {
|
||||
let msg = format!("unexpected message {m:?}");
|
||||
// error can't happen here, ErrorResponse serialization should be always ok
|
||||
pgb.write_message_noflush(&BeMessage::ErrorResponse(&msg, None)).map_err(|e| e.into_io_error())?;
|
||||
Err(io::Error::new(io::ErrorKind::Other, msg))?;
|
||||
break;
|
||||
}
|
||||
};
|
||||
|
||||
yield copy_data_bytes;
|
||||
}
|
||||
Ok(None) => {
|
||||
let msg = "client closed connection during COPY";
|
||||
let query_error = QueryError::Disconnected(ConnectionError::Io(io::Error::new(io::ErrorKind::ConnectionReset, msg)));
|
||||
// error can't happen here, ErrorResponse serialization should be always ok
|
||||
pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, Some(query_error.pg_error_code()))).map_err(|e| e.into_io_error())?;
|
||||
self.flush_cancellable(pgb, cancel).await.map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
|
||||
Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?;
|
||||
}
|
||||
Err(QueryError::Disconnected(ConnectionError::Io(io_error))) => {
|
||||
Err(io_error)?;
|
||||
}
|
||||
Err(other) => {
|
||||
Err(io::Error::new(io::ErrorKind::Other, other.to_string()))?;
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[instrument(skip_all)]
|
||||
async fn handle_pagerequests<IO>(
|
||||
&self,
|
||||
@@ -349,12 +380,13 @@ impl PageServerHandler {
|
||||
{
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
|
||||
// NOTE: pagerequests handler exits when connection is closed,
|
||||
// so there is no need to reset the association
|
||||
task_mgr::associate_with(Some(tenant_id), Some(timeline_id));
|
||||
|
||||
// Make request tracer if needed
|
||||
let tenant = get_active_tenant_with_timeout(tenant_id, &ctx).await?;
|
||||
let tenant = mgr::get_active_tenant_with_timeout(
|
||||
tenant_id,
|
||||
ACTIVE_TENANT_TIMEOUT,
|
||||
&task_mgr::shutdown_token(),
|
||||
)
|
||||
.await?;
|
||||
let mut tracer = if tenant.get_trace_read_requests() {
|
||||
let connection_id = ConnectionId::generate();
|
||||
let path = tenant
|
||||
@@ -370,9 +402,14 @@ impl PageServerHandler {
|
||||
.get_timeline(timeline_id, true)
|
||||
.map_err(|e| anyhow::anyhow!(e))?;
|
||||
|
||||
// Avoid starting new requests if the timeline has already started shutting down,
|
||||
// and block timeline shutdown until this request is complete, or drops out due
|
||||
// to cancellation.
|
||||
let _timeline_guard = timeline.gate.enter().map_err(|_| QueryError::Shutdown)?;
|
||||
|
||||
// switch client to COPYBOTH
|
||||
pgb.write_message_noflush(&BeMessage::CopyBothResponse)?;
|
||||
pgb.flush().await?;
|
||||
self.flush_cancellable(pgb, &timeline.cancel).await?;
|
||||
|
||||
let metrics = metrics::SmgrQueryTimePerTimeline::new(&tenant_id, &timeline_id);
|
||||
|
||||
@@ -380,10 +417,10 @@ impl PageServerHandler {
|
||||
let msg = tokio::select! {
|
||||
biased;
|
||||
|
||||
_ = task_mgr::shutdown_watcher() => {
|
||||
_ = timeline.cancel.cancelled() => {
|
||||
// We were requested to shut down.
|
||||
info!("shutdown request received in page handler");
|
||||
break;
|
||||
return Err(QueryError::Shutdown)
|
||||
}
|
||||
|
||||
msg = pgb.read_message() => { msg }
|
||||
@@ -455,9 +492,20 @@ impl PageServerHandler {
|
||||
}
|
||||
};
|
||||
|
||||
if let Err(e) = &response {
|
||||
if timeline.cancel.is_cancelled() {
|
||||
// If we fail to fulfil a request during shutdown, which may be _because_ of
|
||||
// shutdown, then do not send the error to the client. Instead just drop the
|
||||
// connection.
|
||||
span.in_scope(|| info!("dropped response during shutdown: {e:#}"));
|
||||
return Err(QueryError::Shutdown);
|
||||
}
|
||||
}
|
||||
|
||||
let response = response.unwrap_or_else(|e| {
|
||||
// print the all details to the log with {:#}, but for the client the
|
||||
// error message is enough
|
||||
// error message is enough. Do not log if shutting down, as the anyhow::Error
|
||||
// here includes cancellation which is not an error.
|
||||
span.in_scope(|| error!("error reading relation or page version: {:#}", e));
|
||||
PagestreamBeMessage::Error(PagestreamErrorResponse {
|
||||
message: e.to_string(),
|
||||
@@ -465,7 +513,7 @@ impl PageServerHandler {
|
||||
});
|
||||
|
||||
pgb.write_message_noflush(&BeMessage::CopyData(&response.serialize()))?;
|
||||
pgb.flush().await?;
|
||||
self.flush_cancellable(pgb, &timeline.cancel).await?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
@@ -487,10 +535,14 @@ impl PageServerHandler {
|
||||
{
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
|
||||
task_mgr::associate_with(Some(tenant_id), Some(timeline_id));
|
||||
// Create empty timeline
|
||||
info!("creating new timeline");
|
||||
let tenant = get_active_tenant_with_timeout(tenant_id, &ctx).await?;
|
||||
let tenant = get_active_tenant_with_timeout(
|
||||
tenant_id,
|
||||
ACTIVE_TENANT_TIMEOUT,
|
||||
&task_mgr::shutdown_token(),
|
||||
)
|
||||
.await?;
|
||||
let timeline = tenant
|
||||
.create_empty_timeline(timeline_id, base_lsn, pg_version, &ctx)
|
||||
.await?;
|
||||
@@ -508,9 +560,9 @@ impl PageServerHandler {
|
||||
// Import basebackup provided via CopyData
|
||||
info!("importing basebackup");
|
||||
pgb.write_message_noflush(&BeMessage::CopyInResponse)?;
|
||||
pgb.flush().await?;
|
||||
self.flush_cancellable(pgb, &tenant.cancel).await?;
|
||||
|
||||
let mut copyin_reader = pin!(StreamReader::new(copyin_stream(pgb)));
|
||||
let mut copyin_reader = pin!(StreamReader::new(self.copyin_stream(pgb, &tenant.cancel)));
|
||||
timeline
|
||||
.import_basebackup_from_tar(
|
||||
&mut copyin_reader,
|
||||
@@ -547,9 +599,10 @@ impl PageServerHandler {
|
||||
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
|
||||
{
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
task_mgr::associate_with(Some(tenant_id), Some(timeline_id));
|
||||
|
||||
let timeline = get_active_tenant_timeline(tenant_id, timeline_id, &ctx).await?;
|
||||
let timeline = self
|
||||
.get_active_tenant_timeline(tenant_id, timeline_id)
|
||||
.await?;
|
||||
let last_record_lsn = timeline.get_last_record_lsn();
|
||||
if last_record_lsn != start_lsn {
|
||||
return Err(QueryError::Other(
|
||||
@@ -563,8 +616,8 @@ impl PageServerHandler {
|
||||
// Import wal provided via CopyData
|
||||
info!("importing wal");
|
||||
pgb.write_message_noflush(&BeMessage::CopyInResponse)?;
|
||||
pgb.flush().await?;
|
||||
let mut copyin_reader = pin!(StreamReader::new(copyin_stream(pgb)));
|
||||
self.flush_cancellable(pgb, &timeline.cancel).await?;
|
||||
let mut copyin_reader = pin!(StreamReader::new(self.copyin_stream(pgb, &timeline.cancel)));
|
||||
import_wal_from_tar(&timeline, &mut copyin_reader, start_lsn, end_lsn, &ctx).await?;
|
||||
info!("wal import complete");
|
||||
|
||||
@@ -757,7 +810,9 @@ impl PageServerHandler {
|
||||
let started = std::time::Instant::now();
|
||||
|
||||
// check that the timeline exists
|
||||
let timeline = get_active_tenant_timeline(tenant_id, timeline_id, &ctx).await?;
|
||||
let timeline = self
|
||||
.get_active_tenant_timeline(tenant_id, timeline_id)
|
||||
.await?;
|
||||
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
|
||||
if let Some(lsn) = lsn {
|
||||
// Backup was requested at a particular LSN. Wait for it to arrive.
|
||||
@@ -772,7 +827,7 @@ impl PageServerHandler {
|
||||
|
||||
// switch client to COPYOUT
|
||||
pgb.write_message_noflush(&BeMessage::CopyOutResponse)?;
|
||||
pgb.flush().await?;
|
||||
self.flush_cancellable(pgb, &timeline.cancel).await?;
|
||||
|
||||
// Send a tarball of the latest layer on the timeline. Compress if not
|
||||
// fullbackup. TODO Compress in that case too (tests need to be updated)
|
||||
@@ -824,7 +879,7 @@ impl PageServerHandler {
|
||||
}
|
||||
|
||||
pgb.write_message_noflush(&BeMessage::CopyDone)?;
|
||||
pgb.flush().await?;
|
||||
self.flush_cancellable(pgb, &timeline.cancel).await?;
|
||||
|
||||
let basebackup_after = started
|
||||
.elapsed()
|
||||
@@ -856,6 +911,25 @@ impl PageServerHandler {
|
||||
.expect("claims presence already checked");
|
||||
check_permission(claims, tenant_id)
|
||||
}
|
||||
|
||||
/// Shorthand for getting a reference to a Timeline of an Active tenant.
|
||||
async fn get_active_tenant_timeline(
|
||||
&self,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
) -> Result<Arc<Timeline>, GetActiveTimelineError> {
|
||||
let tenant = get_active_tenant_with_timeout(
|
||||
tenant_id,
|
||||
ACTIVE_TENANT_TIMEOUT,
|
||||
&task_mgr::shutdown_token(),
|
||||
)
|
||||
.await
|
||||
.map_err(GetActiveTimelineError::Tenant)?;
|
||||
let timeline = tenant
|
||||
.get_timeline(timeline_id, true)
|
||||
.map_err(|e| GetActiveTimelineError::Timeline(anyhow::anyhow!(e)))?;
|
||||
Ok(timeline)
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
@@ -1013,7 +1087,9 @@ where
|
||||
.record("timeline_id", field::display(timeline_id));
|
||||
|
||||
self.check_permission(Some(tenant_id))?;
|
||||
let timeline = get_active_tenant_timeline(tenant_id, timeline_id, &ctx).await?;
|
||||
let timeline = self
|
||||
.get_active_tenant_timeline(tenant_id, timeline_id)
|
||||
.await?;
|
||||
|
||||
let end_of_timeline = timeline.get_last_record_rlsn();
|
||||
|
||||
@@ -1197,7 +1273,12 @@ where
|
||||
|
||||
self.check_permission(Some(tenant_id))?;
|
||||
|
||||
let tenant = get_active_tenant_with_timeout(tenant_id, &ctx).await?;
|
||||
let tenant = get_active_tenant_with_timeout(
|
||||
tenant_id,
|
||||
ACTIVE_TENANT_TIMEOUT,
|
||||
&task_mgr::shutdown_token(),
|
||||
)
|
||||
.await?;
|
||||
pgb.write_message_noflush(&BeMessage::RowDescription(&[
|
||||
RowDescriptor::int8_col(b"checkpoint_distance"),
|
||||
RowDescriptor::int8_col(b"checkpoint_timeout"),
|
||||
@@ -1243,67 +1324,13 @@ where
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
enum GetActiveTenantError {
|
||||
#[error(
|
||||
"Timed out waiting {wait_time:?} for tenant active state. Latest state: {latest_state:?}"
|
||||
)]
|
||||
WaitForActiveTimeout {
|
||||
latest_state: TenantState,
|
||||
wait_time: Duration,
|
||||
},
|
||||
#[error(transparent)]
|
||||
NotFound(GetTenantError),
|
||||
#[error(transparent)]
|
||||
WaitTenantActive(tenant::WaitToBecomeActiveError),
|
||||
}
|
||||
|
||||
impl From<GetActiveTenantError> for QueryError {
|
||||
fn from(e: GetActiveTenantError) -> Self {
|
||||
match e {
|
||||
GetActiveTenantError::WaitForActiveTimeout { .. } => QueryError::Disconnected(
|
||||
ConnectionError::Io(io::Error::new(io::ErrorKind::TimedOut, e.to_string())),
|
||||
),
|
||||
GetActiveTenantError::WaitTenantActive(e) => QueryError::Other(anyhow::Error::new(e)),
|
||||
GetActiveTenantError::NotFound(e) => QueryError::Other(anyhow::Error::new(e)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Get active tenant.
|
||||
///
|
||||
/// If the tenant is Loading, waits for it to become Active, for up to 30 s. That
|
||||
/// ensures that queries don't fail immediately after pageserver startup, because
|
||||
/// all tenants are still loading.
|
||||
async fn get_active_tenant_with_timeout(
|
||||
tenant_id: TenantId,
|
||||
_ctx: &RequestContext, /* require get a context to support cancellation in the future */
|
||||
) -> Result<Arc<Tenant>, GetActiveTenantError> {
|
||||
let tenant = match mgr::get_tenant(tenant_id, false).await {
|
||||
Ok(tenant) => tenant,
|
||||
Err(e @ GetTenantError::NotFound(_)) => return Err(GetActiveTenantError::NotFound(e)),
|
||||
Err(GetTenantError::NotActive(_)) => {
|
||||
unreachable!("we're calling get_tenant with active_only=false")
|
||||
}
|
||||
Err(GetTenantError::Broken(_)) => {
|
||||
unreachable!("we're calling get_tenant with active_only=false")
|
||||
}
|
||||
};
|
||||
let wait_time = Duration::from_secs(30);
|
||||
match tokio::time::timeout(wait_time, tenant.wait_to_become_active()).await {
|
||||
Ok(Ok(())) => Ok(tenant),
|
||||
// no .context(), the error message is good enough and some tests depend on it
|
||||
Ok(Err(e)) => Err(GetActiveTenantError::WaitTenantActive(e)),
|
||||
Err(_) => {
|
||||
let latest_state = tenant.current_state();
|
||||
if latest_state == TenantState::Active {
|
||||
Ok(tenant)
|
||||
} else {
|
||||
Err(GetActiveTenantError::WaitForActiveTimeout {
|
||||
latest_state,
|
||||
wait_time,
|
||||
})
|
||||
}
|
||||
e => QueryError::Other(anyhow::anyhow!(e)),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1324,18 +1351,3 @@ impl From<GetActiveTimelineError> for QueryError {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Shorthand for getting a reference to a Timeline of an Active tenant.
|
||||
async fn get_active_tenant_timeline(
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Arc<Timeline>, GetActiveTimelineError> {
|
||||
let tenant = get_active_tenant_with_timeout(tenant_id, ctx)
|
||||
.await
|
||||
.map_err(GetActiveTimelineError::Tenant)?;
|
||||
let timeline = tenant
|
||||
.get_timeline(timeline_id, true)
|
||||
.map_err(|e| GetActiveTimelineError::Timeline(anyhow::anyhow!(e)))?;
|
||||
Ok(timeline)
|
||||
}
|
||||
|
||||
@@ -19,6 +19,7 @@ use postgres_ffi::BLCKSZ;
|
||||
use postgres_ffi::{Oid, TimestampTz, TransactionId};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::{hash_map, HashMap, HashSet};
|
||||
use std::ops::ControlFlow;
|
||||
use std::ops::Range;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{debug, trace, warn};
|
||||
@@ -43,6 +44,17 @@ pub enum CalculateLogicalSizeError {
|
||||
Other(#[from] anyhow::Error),
|
||||
}
|
||||
|
||||
impl From<PageReconstructError> for CalculateLogicalSizeError {
|
||||
fn from(pre: PageReconstructError) -> Self {
|
||||
match pre {
|
||||
PageReconstructError::AncestorStopping(_) | PageReconstructError::Cancelled => {
|
||||
Self::Cancelled
|
||||
}
|
||||
_ => Self::Other(pre.into()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum RelationError {
|
||||
#[error("Relation Already Exists")]
|
||||
@@ -370,7 +382,6 @@ impl Timeline {
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Subroutine of find_lsn_for_timestamp(). Returns true, if there are any
|
||||
/// commits that committed after 'search_timestamp', at LSN 'probe_lsn'.
|
||||
///
|
||||
@@ -385,6 +396,50 @@ impl Timeline {
|
||||
found_larger: &mut bool,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<bool, PageReconstructError> {
|
||||
self.map_all_timestamps(probe_lsn, ctx, |timestamp| {
|
||||
if timestamp >= search_timestamp {
|
||||
*found_larger = true;
|
||||
return ControlFlow::Break(true);
|
||||
} else {
|
||||
*found_smaller = true;
|
||||
}
|
||||
ControlFlow::Continue(())
|
||||
})
|
||||
.await
|
||||
}
|
||||
|
||||
/// Obtain the possible timestamp range for the given lsn.
|
||||
///
|
||||
/// If the lsn has no timestamps, returns None. returns `(min, max, median)` if it has timestamps.
|
||||
pub async fn get_timestamp_for_lsn(
|
||||
&self,
|
||||
probe_lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Option<TimestampTz>, PageReconstructError> {
|
||||
let mut max: Option<TimestampTz> = None;
|
||||
self.map_all_timestamps(probe_lsn, ctx, |timestamp| {
|
||||
if let Some(max_prev) = max {
|
||||
max = Some(max_prev.max(timestamp));
|
||||
} else {
|
||||
max = Some(timestamp);
|
||||
}
|
||||
ControlFlow::Continue(())
|
||||
})
|
||||
.await?;
|
||||
|
||||
Ok(max)
|
||||
}
|
||||
|
||||
/// Runs the given function on all the timestamps for a given lsn
|
||||
///
|
||||
/// The return value is either given by the closure, or set to the `Default`
|
||||
/// impl's output.
|
||||
async fn map_all_timestamps<T: Default>(
|
||||
&self,
|
||||
probe_lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
mut f: impl FnMut(TimestampTz) -> ControlFlow<T>,
|
||||
) -> Result<T, PageReconstructError> {
|
||||
for segno in self
|
||||
.list_slru_segments(SlruKind::Clog, probe_lsn, ctx)
|
||||
.await?
|
||||
@@ -402,16 +457,14 @@ impl Timeline {
|
||||
timestamp_bytes.copy_from_slice(&clog_page[BLCKSZ as usize..]);
|
||||
let timestamp = TimestampTz::from_be_bytes(timestamp_bytes);
|
||||
|
||||
if timestamp >= search_timestamp {
|
||||
*found_larger = true;
|
||||
return Ok(true);
|
||||
} else {
|
||||
*found_smaller = true;
|
||||
match f(timestamp) {
|
||||
ControlFlow::Break(b) => return Ok(b),
|
||||
ControlFlow::Continue(()) => (),
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(false)
|
||||
Ok(Default::default())
|
||||
}
|
||||
|
||||
/// Get a list of SLRU segments
|
||||
@@ -499,6 +552,24 @@ impl Timeline {
|
||||
self.get(CHECKPOINT_KEY, lsn, ctx).await
|
||||
}
|
||||
|
||||
pub async fn list_aux_files(
|
||||
&self,
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<HashMap<String, Bytes>, PageReconstructError> {
|
||||
match self.get(AUX_FILES_KEY, lsn, ctx).await {
|
||||
Ok(buf) => match AuxFilesDirectory::des(&buf).context("deserialization failure") {
|
||||
Ok(dir) => Ok(dir.files),
|
||||
Err(e) => Err(PageReconstructError::from(e)),
|
||||
},
|
||||
Err(e) => {
|
||||
// This is expected: historical databases do not have the key.
|
||||
debug!("Failed to get info about AUX files: {}", e);
|
||||
Ok(HashMap::new())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Does the same as get_current_logical_size but counted on demand.
|
||||
/// Used to initialize the logical size tracking on startup.
|
||||
///
|
||||
@@ -513,7 +584,7 @@ impl Timeline {
|
||||
crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
|
||||
// Fetch list of database dirs and iterate them
|
||||
let buf = self.get(DBDIR_KEY, lsn, ctx).await.context("read dbdir")?;
|
||||
let buf = self.get(DBDIR_KEY, lsn, ctx).await?;
|
||||
let dbdir = DbDirectory::des(&buf).context("deserialize db directory")?;
|
||||
|
||||
let mut total_size: u64 = 0;
|
||||
@@ -527,10 +598,7 @@ impl Timeline {
|
||||
return Err(CalculateLogicalSizeError::Cancelled);
|
||||
}
|
||||
let relsize_key = rel_size_to_key(rel);
|
||||
let mut buf = self
|
||||
.get(relsize_key, lsn, ctx)
|
||||
.await
|
||||
.with_context(|| format!("read relation size of {rel:?}"))?;
|
||||
let mut buf = self.get(relsize_key, lsn, ctx).await?;
|
||||
let relsize = buf.get_u32_le();
|
||||
|
||||
total_size += relsize as u64;
|
||||
@@ -616,7 +684,9 @@ impl Timeline {
|
||||
|
||||
result.add_key(CONTROLFILE_KEY);
|
||||
result.add_key(CHECKPOINT_KEY);
|
||||
|
||||
if self.get(AUX_FILES_KEY, lsn, ctx).await.is_ok() {
|
||||
result.add_key(AUX_FILES_KEY);
|
||||
}
|
||||
Ok(result.to_keyspace())
|
||||
}
|
||||
|
||||
@@ -692,6 +762,12 @@ impl<'a> DatadirModification<'a> {
|
||||
})?;
|
||||
self.put(DBDIR_KEY, Value::Image(buf.into()));
|
||||
|
||||
// Create AuxFilesDirectory
|
||||
let buf = AuxFilesDirectory::ser(&AuxFilesDirectory {
|
||||
files: HashMap::new(),
|
||||
})?;
|
||||
self.put(AUX_FILES_KEY, Value::Image(Bytes::from(buf)));
|
||||
|
||||
let buf = TwoPhaseDirectory::ser(&TwoPhaseDirectory {
|
||||
xids: HashSet::new(),
|
||||
})?;
|
||||
@@ -796,6 +872,12 @@ impl<'a> DatadirModification<'a> {
|
||||
// 'true', now write the updated 'dbdirs' map back.
|
||||
let buf = DbDirectory::ser(&dbdir)?;
|
||||
self.put(DBDIR_KEY, Value::Image(buf.into()));
|
||||
|
||||
// Create AuxFilesDirectory as well
|
||||
let buf = AuxFilesDirectory::ser(&AuxFilesDirectory {
|
||||
files: HashMap::new(),
|
||||
})?;
|
||||
self.put(AUX_FILES_KEY, Value::Image(Bytes::from(buf)));
|
||||
}
|
||||
if r.is_none() {
|
||||
// Create RelDirectory
|
||||
@@ -1120,6 +1202,37 @@ impl<'a> DatadirModification<'a> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn put_file(
|
||||
&mut self,
|
||||
path: &str,
|
||||
content: &[u8],
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut dir = match self.get(AUX_FILES_KEY, ctx).await {
|
||||
Ok(buf) => AuxFilesDirectory::des(&buf)?,
|
||||
Err(e) => {
|
||||
// This is expected: historical databases do not have the key.
|
||||
debug!("Failed to get info about AUX files: {}", e);
|
||||
AuxFilesDirectory {
|
||||
files: HashMap::new(),
|
||||
}
|
||||
}
|
||||
};
|
||||
let path = path.to_string();
|
||||
if content.is_empty() {
|
||||
dir.files.remove(&path);
|
||||
} else {
|
||||
dir.files.insert(path, Bytes::copy_from_slice(content));
|
||||
}
|
||||
self.put(
|
||||
AUX_FILES_KEY,
|
||||
Value::Image(Bytes::from(
|
||||
AuxFilesDirectory::ser(&dir).context("serialize")?,
|
||||
)),
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
///
|
||||
/// Flush changes accumulated so far to the underlying repository.
|
||||
///
|
||||
@@ -1255,6 +1368,11 @@ struct RelDirectory {
|
||||
rels: HashSet<(Oid, u8)>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize, Default)]
|
||||
struct AuxFilesDirectory {
|
||||
files: HashMap<String, Bytes>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
struct RelSizeEntry {
|
||||
nblocks: u32,
|
||||
@@ -1303,10 +1421,12 @@ static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]);
|
||||
// 02 pg_twophase
|
||||
//
|
||||
// 03 misc
|
||||
// controlfile
|
||||
// Controlfile
|
||||
// checkpoint
|
||||
// pg_version
|
||||
//
|
||||
// 04 aux files
|
||||
//
|
||||
// Below is a full list of the keyspace allocation:
|
||||
//
|
||||
// DbDir:
|
||||
@@ -1344,6 +1464,11 @@ static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]);
|
||||
//
|
||||
// Checkpoint:
|
||||
// 03 00000000 00000000 00000000 00 00000001
|
||||
//
|
||||
// AuxFiles:
|
||||
// 03 00000000 00000000 00000000 00 00000002
|
||||
//
|
||||
|
||||
//-- Section 01: relation data and metadata
|
||||
|
||||
const DBDIR_KEY: Key = Key {
|
||||
@@ -1567,6 +1692,15 @@ const CHECKPOINT_KEY: Key = Key {
|
||||
field6: 1,
|
||||
};
|
||||
|
||||
const AUX_FILES_KEY: Key = Key {
|
||||
field1: 0x03,
|
||||
field2: 0,
|
||||
field3: 0,
|
||||
field4: 0,
|
||||
field5: 0,
|
||||
field6: 2,
|
||||
};
|
||||
|
||||
// Reverse mappings for a few Keys.
|
||||
// These are needed by WAL redo manager.
|
||||
|
||||
|
||||
@@ -299,10 +299,6 @@ pub enum TaskKind {
|
||||
|
||||
#[derive(Default)]
|
||||
struct MutableTaskState {
|
||||
/// Tenant and timeline that this task is associated with.
|
||||
tenant_id: Option<TenantId>,
|
||||
timeline_id: Option<TimelineId>,
|
||||
|
||||
/// Handle for waiting for the task to exit. It can be None, if the
|
||||
/// the task has already exited.
|
||||
join_handle: Option<JoinHandle<()>>,
|
||||
@@ -319,6 +315,11 @@ struct PageServerTask {
|
||||
// To request task shutdown, just cancel this token.
|
||||
cancel: CancellationToken,
|
||||
|
||||
/// Tasks may optionally be launched for a particular tenant/timeline, enabling
|
||||
/// later cancelling tasks for that tenant/timeline in [`shutdown_tasks`]
|
||||
tenant_id: Option<TenantId>,
|
||||
timeline_id: Option<TimelineId>,
|
||||
|
||||
mutable: Mutex<MutableTaskState>,
|
||||
}
|
||||
|
||||
@@ -344,11 +345,9 @@ where
|
||||
kind,
|
||||
name: name.to_string(),
|
||||
cancel: cancel.clone(),
|
||||
mutable: Mutex::new(MutableTaskState {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
join_handle: None,
|
||||
}),
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
mutable: Mutex::new(MutableTaskState { join_handle: None }),
|
||||
});
|
||||
|
||||
TASKS.lock().unwrap().insert(task_id, Arc::clone(&task));
|
||||
@@ -418,8 +417,6 @@ async fn task_finish(
|
||||
|
||||
let mut shutdown_process = false;
|
||||
{
|
||||
let task_mut = task.mutable.lock().unwrap();
|
||||
|
||||
match result {
|
||||
Ok(Ok(())) => {
|
||||
debug!("Task '{}' exited normally", task_name);
|
||||
@@ -428,13 +425,13 @@ async fn task_finish(
|
||||
if shutdown_process_on_error {
|
||||
error!(
|
||||
"Shutting down: task '{}' tenant_id: {:?}, timeline_id: {:?} exited with error: {:?}",
|
||||
task_name, task_mut.tenant_id, task_mut.timeline_id, err
|
||||
task_name, task.tenant_id, task.timeline_id, err
|
||||
);
|
||||
shutdown_process = true;
|
||||
} else {
|
||||
error!(
|
||||
"Task '{}' tenant_id: {:?}, timeline_id: {:?} exited with error: {:?}",
|
||||
task_name, task_mut.tenant_id, task_mut.timeline_id, err
|
||||
task_name, task.tenant_id, task.timeline_id, err
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -442,13 +439,13 @@ async fn task_finish(
|
||||
if shutdown_process_on_error {
|
||||
error!(
|
||||
"Shutting down: task '{}' tenant_id: {:?}, timeline_id: {:?} panicked: {:?}",
|
||||
task_name, task_mut.tenant_id, task_mut.timeline_id, err
|
||||
task_name, task.tenant_id, task.timeline_id, err
|
||||
);
|
||||
shutdown_process = true;
|
||||
} else {
|
||||
error!(
|
||||
"Task '{}' tenant_id: {:?}, timeline_id: {:?} panicked: {:?}",
|
||||
task_name, task_mut.tenant_id, task_mut.timeline_id, err
|
||||
task_name, task.tenant_id, task.timeline_id, err
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -460,17 +457,6 @@ async fn task_finish(
|
||||
}
|
||||
}
|
||||
|
||||
// expected to be called from the task of the given id.
|
||||
pub fn associate_with(tenant_id: Option<TenantId>, timeline_id: Option<TimelineId>) {
|
||||
CURRENT_TASK.with(|ct| {
|
||||
let mut task_mut = ct.mutable.lock().unwrap();
|
||||
task_mut.tenant_id = tenant_id;
|
||||
task_mut.timeline_id = timeline_id;
|
||||
});
|
||||
}
|
||||
|
||||
/// Is there a task running that matches the criteria
|
||||
|
||||
/// Signal and wait for tasks to shut down.
|
||||
///
|
||||
///
|
||||
@@ -493,17 +479,16 @@ pub async fn shutdown_tasks(
|
||||
{
|
||||
let tasks = TASKS.lock().unwrap();
|
||||
for task in tasks.values() {
|
||||
let task_mut = task.mutable.lock().unwrap();
|
||||
if (kind.is_none() || Some(task.kind) == kind)
|
||||
&& (tenant_id.is_none() || task_mut.tenant_id == tenant_id)
|
||||
&& (timeline_id.is_none() || task_mut.timeline_id == timeline_id)
|
||||
&& (tenant_id.is_none() || task.tenant_id == tenant_id)
|
||||
&& (timeline_id.is_none() || task.timeline_id == timeline_id)
|
||||
{
|
||||
task.cancel.cancel();
|
||||
victim_tasks.push((
|
||||
Arc::clone(task),
|
||||
task.kind,
|
||||
task_mut.tenant_id,
|
||||
task_mut.timeline_id,
|
||||
task.tenant_id,
|
||||
task.timeline_id,
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -186,26 +186,21 @@ impl FileBlockReader {
|
||||
ctx: &RequestContext,
|
||||
) -> Result<BlockLease, std::io::Error> {
|
||||
let cache = page_cache::get();
|
||||
loop {
|
||||
match cache
|
||||
.read_immutable_buf(self.file_id, blknum, ctx)
|
||||
.await
|
||||
.map_err(|e| {
|
||||
std::io::Error::new(
|
||||
std::io::ErrorKind::Other,
|
||||
format!("Failed to read immutable buf: {e:#}"),
|
||||
)
|
||||
})? {
|
||||
ReadBufResult::Found(guard) => break Ok(guard.into()),
|
||||
ReadBufResult::NotFound(mut write_guard) => {
|
||||
// Read the page from disk into the buffer
|
||||
self.fill_buffer(write_guard.deref_mut(), blknum).await?;
|
||||
write_guard.mark_valid();
|
||||
|
||||
// Swap for read lock
|
||||
continue;
|
||||
}
|
||||
};
|
||||
match cache
|
||||
.read_immutable_buf(self.file_id, blknum, ctx)
|
||||
.await
|
||||
.map_err(|e| {
|
||||
std::io::Error::new(
|
||||
std::io::ErrorKind::Other,
|
||||
format!("Failed to read immutable buf: {e:#}"),
|
||||
)
|
||||
})? {
|
||||
ReadBufResult::Found(guard) => Ok(guard.into()),
|
||||
ReadBufResult::NotFound(mut write_guard) => {
|
||||
// Read the page from disk into the buffer
|
||||
self.fill_buffer(write_guard.deref_mut(), blknum).await?;
|
||||
Ok(write_guard.mark_valid().into())
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3,10 +3,10 @@ use std::sync::Arc;
|
||||
use anyhow::Context;
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use pageserver_api::models::TenantState;
|
||||
use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath};
|
||||
use remote_storage::{GenericRemoteStorage, RemotePath};
|
||||
use tokio::sync::OwnedMutexGuard;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{error, info, instrument, warn, Instrument, Span};
|
||||
use tracing::{error, instrument, warn, Instrument, Span};
|
||||
|
||||
use utils::{
|
||||
backoff, completion, crashsafe, fs_ext,
|
||||
@@ -21,26 +21,33 @@ use crate::{
|
||||
};
|
||||
|
||||
use super::{
|
||||
mgr::{GetTenantError, TenantsMap},
|
||||
mgr::{GetTenantError, TenantSlotError, TenantSlotUpsertError, TenantsMap},
|
||||
remote_timeline_client::{FAILED_REMOTE_OP_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD},
|
||||
span,
|
||||
timeline::delete::DeleteTimelineFlow,
|
||||
tree_sort_timelines, DeleteTimelineError, Tenant,
|
||||
tree_sort_timelines, DeleteTimelineError, Tenant, TenantPreload,
|
||||
};
|
||||
|
||||
const SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS: u32 = 3;
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum DeleteTenantError {
|
||||
pub(crate) enum DeleteTenantError {
|
||||
#[error("GetTenant {0}")]
|
||||
Get(#[from] GetTenantError),
|
||||
|
||||
#[error("Tenant not attached")]
|
||||
NotAttached,
|
||||
|
||||
#[error("Invalid state {0}. Expected Active or Broken")]
|
||||
InvalidState(TenantState),
|
||||
|
||||
#[error("Tenant deletion is already in progress")]
|
||||
AlreadyInProgress,
|
||||
|
||||
#[error("Tenant map slot error {0}")]
|
||||
SlotError(#[from] TenantSlotError),
|
||||
|
||||
#[error("Tenant map slot upsert error {0}")]
|
||||
SlotUpsertError(#[from] TenantSlotUpsertError),
|
||||
|
||||
#[error("Timeline {0}")]
|
||||
Timeline(#[from] DeleteTimelineError),
|
||||
|
||||
@@ -60,7 +67,7 @@ fn remote_tenant_delete_mark_path(
|
||||
.context("Failed to strip workdir prefix")
|
||||
.and_then(RemotePath::new)
|
||||
.context("tenant path")?;
|
||||
Ok(tenant_remote_path.join(Utf8Path::new("deleted")))
|
||||
Ok(tenant_remote_path.join(Utf8Path::new("timelines/deleted")))
|
||||
}
|
||||
|
||||
async fn create_remote_delete_mark(
|
||||
@@ -150,7 +157,8 @@ async fn ensure_timelines_dir_empty(timelines_path: &Utf8Path) -> Result<(), Del
|
||||
// Assert timelines dir is empty.
|
||||
if !fs_ext::is_directory_empty(timelines_path).await? {
|
||||
// Display first 10 items in directory
|
||||
let list = &fs_ext::list_dir(timelines_path).await.context("list_dir")?[..10];
|
||||
let list = fs_ext::list_dir(timelines_path).await.context("list_dir")?;
|
||||
let list = &list.into_iter().take(10).collect::<Vec<_>>();
|
||||
return Err(DeleteTenantError::Other(anyhow::anyhow!(
|
||||
"Timelines directory is not empty after all timelines deletion: {list:?}"
|
||||
)));
|
||||
@@ -239,32 +247,6 @@ async fn cleanup_remaining_fs_traces(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) async fn remote_delete_mark_exists(
|
||||
conf: &PageServerConf,
|
||||
tenant_id: &TenantId,
|
||||
remote_storage: &GenericRemoteStorage,
|
||||
) -> anyhow::Result<bool> {
|
||||
// If remote storage is there we rely on it
|
||||
let remote_mark_path = remote_tenant_delete_mark_path(conf, tenant_id).context("path")?;
|
||||
|
||||
let result = backoff::retry(
|
||||
|| async { remote_storage.download(&remote_mark_path).await },
|
||||
|e| matches!(e, DownloadError::NotFound),
|
||||
SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS,
|
||||
SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS,
|
||||
"fetch_tenant_deletion_mark",
|
||||
// TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066)
|
||||
backoff::Cancel::new(CancellationToken::new(), || unreachable!()),
|
||||
)
|
||||
.await;
|
||||
|
||||
match result {
|
||||
Ok(_) => Ok(true),
|
||||
Err(DownloadError::NotFound) => Ok(false),
|
||||
Err(e) => Err(anyhow::anyhow!(e)).context("remote_delete_mark_exists")?,
|
||||
}
|
||||
}
|
||||
|
||||
/// Orchestrates tenant shut down of all tasks, removes its in-memory structures,
|
||||
/// and deletes its data from both disk and s3.
|
||||
/// The sequence of steps:
|
||||
@@ -276,10 +258,9 @@ pub(crate) async fn remote_delete_mark_exists(
|
||||
/// 6. Remove remote mark
|
||||
/// 7. Cleanup remaining fs traces, tenant dir, config, timelines dir, local delete mark
|
||||
/// It is resumable from any step in case a crash/restart occurs.
|
||||
/// There are three entrypoints to the process:
|
||||
/// There are two entrypoints to the process:
|
||||
/// 1. [`DeleteTenantFlow::run`] this is the main one called by a management api handler.
|
||||
/// 2. [`DeleteTenantFlow::resume_from_load`] is called during restarts when local or remote deletion marks are still there.
|
||||
/// 3. [`DeleteTenantFlow::resume_from_attach`] is called when deletion is resumed tenant is found to be deleted during attach process.
|
||||
/// 2. [`DeleteTenantFlow::resume_from_attach`] is called when deletion is resumed tenant is found to be deleted during attach process.
|
||||
/// Note the only other place that messes around timeline delete mark is the `Tenant::spawn_load` function.
|
||||
#[derive(Default)]
|
||||
pub enum DeleteTenantFlow {
|
||||
@@ -301,12 +282,12 @@ impl DeleteTenantFlow {
|
||||
pub(crate) async fn run(
|
||||
conf: &'static PageServerConf,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
tenants: &'static tokio::sync::RwLock<TenantsMap>,
|
||||
tenant_id: TenantId,
|
||||
tenants: &'static std::sync::RwLock<TenantsMap>,
|
||||
tenant: Arc<Tenant>,
|
||||
) -> Result<(), DeleteTenantError> {
|
||||
span::debug_assert_current_span_has_tenant_id();
|
||||
|
||||
let (tenant, mut guard) = Self::prepare(tenants, tenant_id).await?;
|
||||
let mut guard = Self::prepare(&tenant).await?;
|
||||
|
||||
if let Err(e) = Self::run_inner(&mut guard, conf, remote_storage.as_ref(), &tenant).await {
|
||||
tenant.set_broken(format!("{e:#}")).await;
|
||||
@@ -376,9 +357,9 @@ impl DeleteTenantFlow {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn should_resume_deletion(
|
||||
pub(crate) async fn should_resume_deletion(
|
||||
conf: &'static PageServerConf,
|
||||
remote_storage: Option<&GenericRemoteStorage>,
|
||||
remote_mark_exists: bool,
|
||||
tenant: &Tenant,
|
||||
) -> Result<Option<DeletionGuard>, DeleteTenantError> {
|
||||
let acquire = |t: &Tenant| {
|
||||
@@ -389,66 +370,25 @@ impl DeleteTenantFlow {
|
||||
)
|
||||
};
|
||||
|
||||
let tenant_id = tenant.tenant_id;
|
||||
// Check local mark first, if its there there is no need to go to s3 to check whether remote one exists.
|
||||
if conf.tenant_deleted_mark_file_path(&tenant_id).exists() {
|
||||
if remote_mark_exists {
|
||||
return Ok(acquire(tenant));
|
||||
}
|
||||
|
||||
let remote_storage = match remote_storage {
|
||||
Some(remote_storage) => remote_storage,
|
||||
None => return Ok(None),
|
||||
};
|
||||
|
||||
if remote_delete_mark_exists(conf, &tenant_id, remote_storage).await? {
|
||||
let tenant_id = tenant.tenant_id;
|
||||
// Check local mark first, if its there there is no need to go to s3 to check whether remote one exists.
|
||||
if conf.tenant_deleted_mark_file_path(&tenant_id).exists() {
|
||||
Ok(acquire(tenant))
|
||||
} else {
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) async fn resume_from_load(
|
||||
guard: DeletionGuard,
|
||||
tenant: &Arc<Tenant>,
|
||||
init_order: Option<&InitializationOrder>,
|
||||
tenants: &'static tokio::sync::RwLock<TenantsMap>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), DeleteTenantError> {
|
||||
let (_, progress) = completion::channel();
|
||||
|
||||
tenant
|
||||
.set_stopping(progress, true, false)
|
||||
.await
|
||||
.expect("cant be stopping or broken");
|
||||
|
||||
// Do not consume valuable resources during the load phase, continue deletion once init phase is complete.
|
||||
let background_jobs_can_start = init_order.as_ref().map(|x| &x.background_jobs_can_start);
|
||||
if let Some(background) = background_jobs_can_start {
|
||||
info!("waiting for backgound jobs barrier");
|
||||
background.clone().wait().await;
|
||||
info!("ready for backgound jobs barrier");
|
||||
}
|
||||
|
||||
// Tenant may not be loadable if we fail late in cleanup_remaining_fs_traces (e g remove timelines dir)
|
||||
let timelines_path = tenant.conf.timelines_path(&tenant.tenant_id);
|
||||
if timelines_path.exists() {
|
||||
tenant.load(init_order, ctx).await.context("load")?;
|
||||
}
|
||||
|
||||
Self::background(
|
||||
guard,
|
||||
tenant.conf,
|
||||
tenant.remote_storage.clone(),
|
||||
tenants,
|
||||
tenant,
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
pub(crate) async fn resume_from_attach(
|
||||
guard: DeletionGuard,
|
||||
tenant: &Arc<Tenant>,
|
||||
tenants: &'static tokio::sync::RwLock<TenantsMap>,
|
||||
preload: Option<TenantPreload>,
|
||||
tenants: &'static std::sync::RwLock<TenantsMap>,
|
||||
init_order: Option<InitializationOrder>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), DeleteTenantError> {
|
||||
let (_, progress) = completion::channel();
|
||||
@@ -458,7 +398,10 @@ impl DeleteTenantFlow {
|
||||
.await
|
||||
.expect("cant be stopping or broken");
|
||||
|
||||
tenant.attach(ctx).await.context("attach")?;
|
||||
tenant
|
||||
.attach(init_order, preload, ctx)
|
||||
.await
|
||||
.context("attach")?;
|
||||
|
||||
Self::background(
|
||||
guard,
|
||||
@@ -471,15 +414,8 @@ impl DeleteTenantFlow {
|
||||
}
|
||||
|
||||
async fn prepare(
|
||||
tenants: &tokio::sync::RwLock<TenantsMap>,
|
||||
tenant_id: TenantId,
|
||||
) -> Result<(Arc<Tenant>, tokio::sync::OwnedMutexGuard<Self>), DeleteTenantError> {
|
||||
let m = tenants.read().await;
|
||||
|
||||
let tenant = m
|
||||
.get(&tenant_id)
|
||||
.ok_or(GetTenantError::NotFound(tenant_id))?;
|
||||
|
||||
tenant: &Arc<Tenant>,
|
||||
) -> Result<tokio::sync::OwnedMutexGuard<Self>, DeleteTenantError> {
|
||||
// FIXME: unsure about active only. Our init jobs may not be cancellable properly,
|
||||
// so at least for now allow deletions only for active tenants. TODO recheck
|
||||
// Broken and Stopping is needed for retries.
|
||||
@@ -513,14 +449,14 @@ impl DeleteTenantFlow {
|
||||
)));
|
||||
}
|
||||
|
||||
Ok((Arc::clone(tenant), guard))
|
||||
Ok(guard)
|
||||
}
|
||||
|
||||
fn schedule_background(
|
||||
guard: OwnedMutexGuard<Self>,
|
||||
conf: &'static PageServerConf,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
tenants: &'static tokio::sync::RwLock<TenantsMap>,
|
||||
tenants: &'static std::sync::RwLock<TenantsMap>,
|
||||
tenant: Arc<Tenant>,
|
||||
) {
|
||||
let tenant_id = tenant.tenant_id;
|
||||
@@ -553,7 +489,7 @@ impl DeleteTenantFlow {
|
||||
mut guard: OwnedMutexGuard<Self>,
|
||||
conf: &PageServerConf,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
tenants: &'static tokio::sync::RwLock<TenantsMap>,
|
||||
tenants: &'static std::sync::RwLock<TenantsMap>,
|
||||
tenant: &Arc<Tenant>,
|
||||
) -> Result<(), DeleteTenantError> {
|
||||
// Tree sort timelines, schedule delete for them. Mention retries from the console side.
|
||||
@@ -601,10 +537,18 @@ impl DeleteTenantFlow {
|
||||
.await
|
||||
.context("cleanup_remaining_fs_traces")?;
|
||||
|
||||
let mut locked = tenants.write().await;
|
||||
if locked.remove(&tenant.tenant_id).is_none() {
|
||||
warn!("Tenant got removed from tenants map during deletion");
|
||||
};
|
||||
{
|
||||
let mut locked = tenants.write().unwrap();
|
||||
if locked.remove(&tenant.tenant_id).is_none() {
|
||||
warn!("Tenant got removed from tenants map during deletion");
|
||||
};
|
||||
|
||||
// FIXME: we should not be modifying this from outside of mgr.rs.
|
||||
// This will go away when we simplify deletion (https://github.com/neondatabase/neon/issues/5080)
|
||||
crate::metrics::TENANT_MANAGER
|
||||
.tenant_slots
|
||||
.set(locked.len() as u64);
|
||||
}
|
||||
|
||||
*guard = Self::Finished;
|
||||
|
||||
|
||||
@@ -573,10 +573,10 @@ impl<const L: usize> BuildNode<L> {
|
||||
BuildNode {
|
||||
num_children: 0,
|
||||
level,
|
||||
prefix: Vec::new(),
|
||||
prefix: Vec::with_capacity(16),
|
||||
suffix_len: 0,
|
||||
keys: Vec::new(),
|
||||
values: Vec::new(),
|
||||
keys: Vec::with_capacity(5024),
|
||||
values: Vec::with_capacity(3140),
|
||||
size: NODE_HDR_SIZE,
|
||||
}
|
||||
}
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user