mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-14 03:30:36 +00:00
Compare commits
157 Commits
startup-no
...
extension_
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
38b0db734e | ||
|
|
316082d770 | ||
|
|
8b586ea748 | ||
|
|
b4abbfe6fb | ||
|
|
06763949c0 | ||
|
|
a6c9a4abe7 | ||
|
|
0875c2284c | ||
|
|
2089d02f94 | ||
|
|
a6f142c4f6 | ||
|
|
ce0da2889e | ||
|
|
ceebc8e870 | ||
|
|
1d407f937f | ||
|
|
b402dfd9c7 | ||
|
|
5e882d8359 | ||
|
|
b999b27655 | ||
|
|
ebe503e6f8 | ||
|
|
604d2643d4 | ||
|
|
7357b7cad5 | ||
|
|
a2e154f07b | ||
|
|
7667fdc5c8 | ||
|
|
85d02acdb4 | ||
|
|
5f986875bf | ||
|
|
1104de0b9b | ||
|
|
1faf69a698 | ||
|
|
44a441080d | ||
|
|
c215389f1c | ||
|
|
b1477b4448 | ||
|
|
a500bb06fb | ||
|
|
a8f848b5de | ||
|
|
15456625c2 | ||
|
|
ca59330df8 | ||
|
|
36bb5ad527 | ||
|
|
6532daf528 | ||
|
|
8bc128e474 | ||
|
|
c3994541eb | ||
|
|
7cdcc8a500 | ||
|
|
a3f0dd2d30 | ||
|
|
76718472be | ||
|
|
4201f4695f | ||
|
|
5776df15da | ||
|
|
31aa0283b0 | ||
|
|
fd3dfe9d52 | ||
|
|
c07b6ffbdc | ||
|
|
6c3605fc24 | ||
|
|
d96d51a3b7 | ||
|
|
a010b2108a | ||
|
|
384e3ab1a8 | ||
|
|
4259464f72 | ||
|
|
152206211b | ||
|
|
9c35c06c58 | ||
|
|
245b4c9d72 | ||
|
|
3d0f74fc0c | ||
|
|
ce55f70cac | ||
|
|
053d592ddb | ||
|
|
b85416b58d | ||
|
|
195838436c | ||
|
|
9313045de6 | ||
|
|
44ac7a45be | ||
|
|
e35e8a7dcb | ||
|
|
a79b0d69c4 | ||
|
|
2f618f46be | ||
|
|
d475e901e5 | ||
|
|
bf3b83b504 | ||
|
|
94781e8710 | ||
|
|
4b83a206bf | ||
|
|
f984f9e7d3 | ||
|
|
6b42464c23 | ||
|
|
605c30e5c5 | ||
|
|
0b11d8e836 | ||
|
|
7602483af9 | ||
|
|
5e1e859ab8 | ||
|
|
85a7511700 | ||
|
|
89b8ea132e | ||
|
|
bfbae98f24 | ||
|
|
02a1d4d8c1 | ||
|
|
d3aa8a48ea | ||
|
|
4a35f29301 | ||
|
|
559e318328 | ||
|
|
a4d236b02f | ||
|
|
e4da76f021 | ||
|
|
8b9f72e117 | ||
|
|
bb414e5a0a | ||
|
|
32c03bc784 | ||
|
|
c99e203094 | ||
|
|
870740c949 | ||
|
|
75d583c04a | ||
|
|
e7b9259675 | ||
|
|
356f7d3a7e | ||
|
|
0f6b05337e | ||
|
|
2e81d280c8 | ||
|
|
f9700c8bb9 | ||
|
|
e6137d45d2 | ||
|
|
ab1d903600 | ||
|
|
bfd670b9a7 | ||
|
|
5e96ab43ea | ||
|
|
890061d371 | ||
|
|
b4c5beff9f | ||
|
|
90e1f629e8 | ||
|
|
6b74d1a76a | ||
|
|
2023e22ed3 | ||
|
|
036fda392f | ||
|
|
557abc18f3 | ||
|
|
3b06a5bc54 | ||
|
|
a936b8a92b | ||
|
|
c7bea52849 | ||
|
|
1b7ab6d468 | ||
|
|
e07d5d00e9 | ||
|
|
15d3d007eb | ||
|
|
77157c7741 | ||
|
|
b9b1b3596c | ||
|
|
1b947fc8af | ||
|
|
78082d0b9f | ||
|
|
190c3ba610 | ||
|
|
14d495ae14 | ||
|
|
91a809332f | ||
|
|
472cc17b7a | ||
|
|
214ecacfc4 | ||
|
|
76413a0fb8 | ||
|
|
e60b70b475 | ||
|
|
2252c5c282 | ||
|
|
7465c644b9 | ||
|
|
94f315d490 | ||
|
|
cd3faa8c0c | ||
|
|
a7a0c3cd27 | ||
|
|
ee9a5bae43 | ||
|
|
bb931f2ce0 | ||
|
|
9484b96d7c | ||
|
|
ebee8247b5 | ||
|
|
3164ad7052 | ||
|
|
a0b3990411 | ||
|
|
4385e0c291 | ||
|
|
3693d1f431 | ||
|
|
8013f9630d | ||
|
|
34f22e9b12 | ||
|
|
fdf7a67ed2 | ||
|
|
1299df87d2 | ||
|
|
754ceaefac | ||
|
|
143fa0da42 | ||
|
|
4936ab6842 | ||
|
|
939593d0d3 | ||
|
|
2011cc05cd | ||
|
|
b0286e3c46 | ||
|
|
4f3f817384 | ||
|
|
b50475b567 | ||
|
|
e4f05ce0a2 | ||
|
|
8d106708d7 | ||
|
|
f450369b20 | ||
|
|
aad918fb56 | ||
|
|
86dd8c96d3 | ||
|
|
6a65c4a4fe | ||
|
|
e9072ee178 | ||
|
|
7e17979d7a | ||
|
|
227271ccad | ||
|
|
fbf0367e27 | ||
|
|
a21b55fe0b | ||
|
|
add51e1372 | ||
|
|
cdce04d721 |
9
.github/workflows/benchmarking.yml
vendored
9
.github/workflows/benchmarking.yml
vendored
@@ -180,7 +180,8 @@ jobs:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
||||
options: --init
|
||||
|
||||
timeout-minutes: 360 # 6h
|
||||
# Increase timeout to 8h, default timeout is 6h
|
||||
timeout-minutes: 480
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
@@ -321,8 +322,6 @@ jobs:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
||||
options: --init
|
||||
|
||||
timeout-minutes: 360 # 6h
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
|
||||
@@ -414,8 +413,6 @@ jobs:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
||||
options: --init
|
||||
|
||||
timeout-minutes: 360 # 6h
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
|
||||
@@ -501,8 +498,6 @@ jobs:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
||||
options: --init
|
||||
|
||||
timeout-minutes: 360 # 6h
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
|
||||
|
||||
61
.github/workflows/build_and_test.yml
vendored
61
.github/workflows/build_and_test.yml
vendored
@@ -623,51 +623,6 @@ jobs:
|
||||
- name: Cleanup ECR folder
|
||||
run: rm -rf ~/.ecr
|
||||
|
||||
|
||||
neon-image-depot:
|
||||
# For testing this will run side-by-side for a few merges.
|
||||
# This action is not really optimized yet, but gets the job done
|
||||
runs-on: [ self-hosted, gen3, large ]
|
||||
needs: [ tag ]
|
||||
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
|
||||
permissions:
|
||||
contents: read
|
||||
id-token: write
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
submodules: true
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Setup go
|
||||
uses: actions/setup-go@v3
|
||||
with:
|
||||
go-version: '1.19'
|
||||
|
||||
- name: Set up Depot CLI
|
||||
uses: depot/setup-action@v1
|
||||
|
||||
- name: Install Crane & ECR helper
|
||||
run: go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@69c85dc22db6511932bbf119e1a0cc5c90c69a7f # v0.6.0
|
||||
|
||||
- name: Configure ECR login
|
||||
run: |
|
||||
mkdir /github/home/.docker/
|
||||
echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json
|
||||
|
||||
- name: Build and push
|
||||
uses: depot/build-push-action@v1
|
||||
with:
|
||||
# if no depot.json file is at the root of your repo, you must specify the project id
|
||||
project: nrdv0s4kcs
|
||||
push: true
|
||||
tags: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:depot-${{needs.tag.outputs.build-tag}}
|
||||
build-args: |
|
||||
GIT_VERSION=${{ github.sha }}
|
||||
REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
|
||||
|
||||
compute-tools-image:
|
||||
runs-on: [ self-hosted, gen3, large ]
|
||||
needs: [ tag ]
|
||||
@@ -704,6 +659,7 @@ jobs:
|
||||
--cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache
|
||||
--context .
|
||||
--build-arg GIT_VERSION=${{ github.sha }}
|
||||
--build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}}
|
||||
--build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
|
||||
--dockerfile Dockerfile.compute-tools
|
||||
--destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}}
|
||||
@@ -761,6 +717,7 @@ jobs:
|
||||
--context .
|
||||
--build-arg GIT_VERSION=${{ github.sha }}
|
||||
--build-arg PG_VERSION=${{ matrix.version }}
|
||||
--build-arg BUILD_TAG=${{needs.tag.outputs.build-tag}}
|
||||
--build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
|
||||
--dockerfile Dockerfile.compute-node
|
||||
--destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
|
||||
@@ -959,6 +916,20 @@ jobs:
|
||||
exit 1
|
||||
fi
|
||||
|
||||
- name: Create git tag
|
||||
if: github.ref_name == 'release'
|
||||
uses: actions/github-script@v6
|
||||
with:
|
||||
# Retry script for 5XX server errors: https://github.com/actions/github-script#retries
|
||||
retries: 5
|
||||
script: |
|
||||
github.rest.git.createRef({
|
||||
owner: context.repo.owner,
|
||||
repo: context.repo.repo,
|
||||
ref: "refs/tags/${{ needs.tag.outputs.build-tag }}",
|
||||
sha: context.sha,
|
||||
})
|
||||
|
||||
promote-compatibility-data:
|
||||
runs-on: [ self-hosted, gen3, small ]
|
||||
container:
|
||||
|
||||
1
.github/workflows/release.yml
vendored
1
.github/workflows/release.yml
vendored
@@ -3,6 +3,7 @@ name: Create Release Branch
|
||||
on:
|
||||
schedule:
|
||||
- cron: '0 10 * * 2'
|
||||
workflow_dispatch:
|
||||
|
||||
jobs:
|
||||
create_release_branch:
|
||||
|
||||
21
Cargo.lock
generated
21
Cargo.lock
generated
@@ -924,12 +924,14 @@ dependencies = [
|
||||
"opentelemetry",
|
||||
"postgres",
|
||||
"regex",
|
||||
"remote_storage",
|
||||
"reqwest",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"tar",
|
||||
"tokio",
|
||||
"tokio-postgres",
|
||||
"toml_edit",
|
||||
"tracing",
|
||||
"tracing-opentelemetry",
|
||||
"tracing-subscriber",
|
||||
@@ -997,6 +999,7 @@ dependencies = [
|
||||
"tar",
|
||||
"thiserror",
|
||||
"toml",
|
||||
"tracing",
|
||||
"url",
|
||||
"utils",
|
||||
"workspace_hack",
|
||||
@@ -2349,9 +2352,9 @@ checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575"
|
||||
|
||||
[[package]]
|
||||
name = "openssl"
|
||||
version = "0.10.52"
|
||||
version = "0.10.55"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "01b8574602df80f7b85fdfc5392fa884a4e3b3f4f35402c070ab34c3d3f78d56"
|
||||
checksum = "345df152bc43501c5eb9e4654ff05f794effb78d4efe3d53abc158baddc0703d"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
"cfg-if",
|
||||
@@ -2381,9 +2384,9 @@ checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf"
|
||||
|
||||
[[package]]
|
||||
name = "openssl-sys"
|
||||
version = "0.9.87"
|
||||
version = "0.9.90"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8e17f59264b2809d77ae94f0e1ebabc434773f370d6ca667bd223ea10e06cc7e"
|
||||
checksum = "374533b0e45f3a7ced10fcaeccca020e66656bc03dac384f852e4e5a7a8104a6"
|
||||
dependencies = [
|
||||
"cc",
|
||||
"libc",
|
||||
@@ -2770,7 +2773,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "postgres"
|
||||
version = "0.19.4"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=f6ec31df3bcce89cb34f300f17c8a8c031c5ee8c#f6ec31df3bcce89cb34f300f17c8a8c031c5ee8c"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=1aaedab101b23f7612042850d8f2036810fa7c7f#1aaedab101b23f7612042850d8f2036810fa7c7f"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"fallible-iterator",
|
||||
@@ -2783,7 +2786,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "postgres-native-tls"
|
||||
version = "0.5.0"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=f6ec31df3bcce89cb34f300f17c8a8c031c5ee8c#f6ec31df3bcce89cb34f300f17c8a8c031c5ee8c"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=1aaedab101b23f7612042850d8f2036810fa7c7f#1aaedab101b23f7612042850d8f2036810fa7c7f"
|
||||
dependencies = [
|
||||
"native-tls",
|
||||
"tokio",
|
||||
@@ -2794,7 +2797,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "postgres-protocol"
|
||||
version = "0.6.4"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=f6ec31df3bcce89cb34f300f17c8a8c031c5ee8c#f6ec31df3bcce89cb34f300f17c8a8c031c5ee8c"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=1aaedab101b23f7612042850d8f2036810fa7c7f#1aaedab101b23f7612042850d8f2036810fa7c7f"
|
||||
dependencies = [
|
||||
"base64 0.20.0",
|
||||
"byteorder",
|
||||
@@ -2812,7 +2815,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "postgres-types"
|
||||
version = "0.2.4"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=f6ec31df3bcce89cb34f300f17c8a8c031c5ee8c#f6ec31df3bcce89cb34f300f17c8a8c031c5ee8c"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=1aaedab101b23f7612042850d8f2036810fa7c7f#1aaedab101b23f7612042850d8f2036810fa7c7f"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"fallible-iterator",
|
||||
@@ -4272,7 +4275,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "tokio-postgres"
|
||||
version = "0.7.7"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=f6ec31df3bcce89cb34f300f17c8a8c031c5ee8c#f6ec31df3bcce89cb34f300f17c8a8c031c5ee8c"
|
||||
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=1aaedab101b23f7612042850d8f2036810fa7c7f#1aaedab101b23f7612042850d8f2036810fa7c7f"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"byteorder",
|
||||
|
||||
12
Cargo.toml
12
Cargo.toml
@@ -140,11 +140,11 @@ env_logger = "0.10"
|
||||
log = "0.4"
|
||||
|
||||
## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
|
||||
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="f6ec31df3bcce89cb34f300f17c8a8c031c5ee8c" }
|
||||
postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="f6ec31df3bcce89cb34f300f17c8a8c031c5ee8c" }
|
||||
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="f6ec31df3bcce89cb34f300f17c8a8c031c5ee8c" }
|
||||
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="f6ec31df3bcce89cb34f300f17c8a8c031c5ee8c" }
|
||||
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="f6ec31df3bcce89cb34f300f17c8a8c031c5ee8c" }
|
||||
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
|
||||
postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
|
||||
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
|
||||
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
|
||||
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
|
||||
tokio-tar = { git = "https://github.com/neondatabase/tokio-tar.git", rev="404df61437de0feef49ba2ccdbdd94eb8ad6e142" }
|
||||
|
||||
## Other git libraries
|
||||
@@ -180,7 +180,7 @@ tonic-build = "0.9"
|
||||
|
||||
# This is only needed for proxy's tests.
|
||||
# TODO: we should probably fork `tokio-postgres-rustls` instead.
|
||||
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="f6ec31df3bcce89cb34f300f17c8a8c031c5ee8c" }
|
||||
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" }
|
||||
|
||||
# Changes the MAX_THREADS limit from 4096 to 32768.
|
||||
# This is a temporary workaround for using tracing from many threads in safekeepers code,
|
||||
|
||||
@@ -2,6 +2,7 @@ ARG PG_VERSION
|
||||
ARG REPOSITORY=neondatabase
|
||||
ARG IMAGE=rust
|
||||
ARG TAG=pinned
|
||||
ARG BUILD_TAG
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
@@ -67,7 +68,7 @@ RUN apt update && \
|
||||
RUN wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar.gz -O SFCGAL.tar.gz && \
|
||||
echo "4e39b3b2adada6254a7bdba6d297bb28e1a9835a9f879b74f37e2dab70203232 SFCGAL.tar.gz" | sha256sum --check && \
|
||||
mkdir sfcgal-src && cd sfcgal-src && tar xvzf ../SFCGAL.tar.gz --strip-components=1 -C . && \
|
||||
cmake . && make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
cmake -DCMAKE_BUILD_TYPE=Release . && make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
DESTDIR=/sfcgal make install -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make clean && cp -R /sfcgal/* /
|
||||
|
||||
@@ -95,7 +96,7 @@ RUN wget https://github.com/pgRouting/pgrouting/archive/v3.4.2.tar.gz -O pgrouti
|
||||
mkdir pgrouting-src && cd pgrouting-src && tar xvzf ../pgrouting.tar.gz --strip-components=1 -C . && \
|
||||
mkdir build && \
|
||||
cd build && \
|
||||
cmake .. && \
|
||||
cmake -DCMAKE_BUILD_TYPE=Release .. && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrouting.control
|
||||
@@ -355,7 +356,7 @@ RUN apt-get update && \
|
||||
wget https://github.com/timescale/timescaledb/archive/refs/tags/2.10.1.tar.gz -O timescaledb.tar.gz && \
|
||||
echo "6fca72a6ed0f6d32d2b3523951ede73dc5f9b0077b38450a029a5f411fdb8c73 timescaledb.tar.gz" | sha256sum --check && \
|
||||
mkdir timescaledb-src && cd timescaledb-src && tar xvzf ../timescaledb.tar.gz --strip-components=1 -C . && \
|
||||
./bootstrap -DSEND_TELEMETRY_DEFAULT:BOOL=OFF -DUSE_TELEMETRY:BOOL=OFF -DAPACHE_ONLY:BOOL=ON && \
|
||||
./bootstrap -DSEND_TELEMETRY_DEFAULT:BOOL=OFF -DUSE_TELEMETRY:BOOL=OFF -DAPACHE_ONLY:BOOL=ON -DCMAKE_BUILD_TYPE=Release && \
|
||||
cd build && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make install -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
@@ -410,7 +411,7 @@ RUN apt-get update && \
|
||||
mkdir kq_imcx-src && cd kq_imcx-src && tar xvzf ../kq_imcx.tar.gz --strip-components=1 -C . && \
|
||||
mkdir build && \
|
||||
cd build && \
|
||||
cmake .. && \
|
||||
cmake -DCMAKE_BUILD_TYPE=Release .. && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/kq_imcx.control
|
||||
@@ -432,6 +433,71 @@ RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.5.2.tar.gz -O
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_cron.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "rdkit-pg-build"
|
||||
# compile rdkit extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS rdkit-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y \
|
||||
cmake \
|
||||
libboost-iostreams1.74-dev \
|
||||
libboost-regex1.74-dev \
|
||||
libboost-serialization1.74-dev \
|
||||
libboost-system1.74-dev \
|
||||
libeigen3-dev \
|
||||
libfreetype6-dev
|
||||
|
||||
ENV PATH "/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH"
|
||||
RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_1.tar.gz -O rdkit.tar.gz && \
|
||||
echo "db346afbd0ba52c843926a2a62f8a38c7b774ffab37eaf382d789a824f21996c rdkit.tar.gz" | sha256sum --check && \
|
||||
mkdir rdkit-src && cd rdkit-src && tar xvzf ../rdkit.tar.gz --strip-components=1 -C . && \
|
||||
cmake \
|
||||
-D RDK_BUILD_CAIRO_SUPPORT=OFF \
|
||||
-D RDK_BUILD_INCHI_SUPPORT=ON \
|
||||
-D RDK_BUILD_AVALON_SUPPORT=ON \
|
||||
-D RDK_BUILD_PYTHON_WRAPPERS=OFF \
|
||||
-D RDK_BUILD_DESCRIPTORS3D=OFF \
|
||||
-D RDK_BUILD_FREESASA_SUPPORT=OFF \
|
||||
-D RDK_BUILD_COORDGEN_SUPPORT=ON \
|
||||
-D RDK_BUILD_MOLINTERCHANGE_SUPPORT=OFF \
|
||||
-D RDK_BUILD_YAEHMOP_SUPPORT=OFF \
|
||||
-D RDK_BUILD_STRUCTCHECKER_SUPPORT=OFF \
|
||||
-D RDK_USE_URF=OFF \
|
||||
-D RDK_BUILD_PGSQL=ON \
|
||||
-D RDK_PGSQL_STATIC=ON \
|
||||
-D PostgreSQL_CONFIG=pg_config \
|
||||
-D PostgreSQL_INCLUDE_DIR=`pg_config --includedir` \
|
||||
-D PostgreSQL_TYPE_INCLUDE_DIR=`pg_config --includedir-server` \
|
||||
-D PostgreSQL_LIBRARY_DIR=`pg_config --libdir` \
|
||||
-D RDK_INSTALL_INTREE=OFF \
|
||||
-D CMAKE_BUILD_TYPE=Release \
|
||||
. && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/rdkit.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "pg-uuidv7-pg-build"
|
||||
# compile pg_uuidv7 extension
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS pg-uuidv7-pg-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
ENV PATH "/usr/local/pgsql/bin/:$PATH"
|
||||
RUN wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.0.1.tar.gz -O pg_uuidv7.tar.gz && \
|
||||
echo "0d0759ab01b7fb23851ecffb0bce27822e1868a4a5819bfd276101c716637a7a pg_uuidv7.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_uuidv7-src && cd pg_uuidv7-src && tar xvzf ../pg_uuidv7.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_uuidv7.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "rust extensions"
|
||||
@@ -564,6 +630,8 @@ COPY --from=pg-hint-plan-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=kq-imcx-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg-cron-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg-pgx-ulid-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=rdkit-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg-uuidv7-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY pgxn/ pgxn/
|
||||
|
||||
RUN make -j $(getconf _NPROCESSORS_ONLN) \
|
||||
@@ -585,6 +653,9 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) \
|
||||
#
|
||||
#########################################################################################
|
||||
FROM $REPOSITORY/$IMAGE:$TAG AS compute-tools
|
||||
ARG BUILD_TAG
|
||||
ENV BUILD_TAG=$BUILD_TAG
|
||||
|
||||
USER nonroot
|
||||
# Copy entire project to get Cargo.* files with proper dependencies for the whole project
|
||||
COPY --chown=nonroot . .
|
||||
@@ -637,14 +708,19 @@ COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-deb
|
||||
# libgeos, libgdal, libsfcgal1, libproj and libprotobuf-c1 for PostGIS
|
||||
# libxml2, libxslt1.1 for xml2
|
||||
# libzstd1 for zstd
|
||||
# libboost*, libfreetype6, and zlib1g for rdkit
|
||||
RUN apt update && \
|
||||
apt install --no-install-recommends -y \
|
||||
gdb \
|
||||
locales \
|
||||
libicu67 \
|
||||
liblz4-1 \
|
||||
libreadline8 \
|
||||
libboost-iostreams1.74.0 \
|
||||
libboost-regex1.74.0 \
|
||||
libboost-serialization1.74.0 \
|
||||
libboost-system1.74.0 \
|
||||
libossp-uuid16 \
|
||||
libfreetype6 \
|
||||
libgeos-c1v5 \
|
||||
libgdal28 \
|
||||
libproj19 \
|
||||
@@ -654,7 +730,9 @@ RUN apt update && \
|
||||
libxslt1.1 \
|
||||
libzstd1 \
|
||||
libcurl4-openssl-dev \
|
||||
procps && \
|
||||
locales \
|
||||
procps \
|
||||
zlib1g && \
|
||||
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
|
||||
localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8
|
||||
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
ARG REPOSITORY=neondatabase
|
||||
ARG IMAGE=rust
|
||||
ARG TAG=pinned
|
||||
ARG BUILD_TAG
|
||||
|
||||
FROM $REPOSITORY/$IMAGE:$TAG AS rust-build
|
||||
WORKDIR /home/nonroot
|
||||
@@ -16,6 +17,8 @@ ENV CACHEPOT_S3_KEY_PREFIX=cachepot
|
||||
ARG CACHEPOT_BUCKET=neon-github-dev
|
||||
#ARG AWS_ACCESS_KEY_ID
|
||||
#ARG AWS_SECRET_ACCESS_KEY
|
||||
ARG BUILD_TAG
|
||||
ENV BUILD_TAG=$BUILD_TAG
|
||||
|
||||
COPY . .
|
||||
|
||||
|
||||
@@ -30,3 +30,5 @@ url.workspace = true
|
||||
compute_api.workspace = true
|
||||
utils.workspace = true
|
||||
workspace_hack.workspace = true
|
||||
toml_edit.workspace = true
|
||||
remote_storage = { version = "0.1", path = "../libs/remote_storage/" }
|
||||
|
||||
@@ -27,7 +27,8 @@
|
||||
//! compute_ctl -D /var/db/postgres/compute \
|
||||
//! -C 'postgresql://cloud_admin@localhost/postgres' \
|
||||
//! -S /var/db/postgres/specs/current.json \
|
||||
//! -b /usr/local/bin/postgres
|
||||
//! -b /usr/local/bin/postgres \
|
||||
//! -r {"bucket": "my-bucket", "region": "eu-central-1"}
|
||||
//! ```
|
||||
//!
|
||||
use std::collections::HashMap;
|
||||
@@ -35,7 +36,7 @@ use std::fs::File;
|
||||
use std::panic;
|
||||
use std::path::Path;
|
||||
use std::process::exit;
|
||||
use std::sync::{mpsc, Arc, Condvar, Mutex};
|
||||
use std::sync::{mpsc, Arc, Condvar, Mutex, OnceLock};
|
||||
use std::{thread, time::Duration};
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
@@ -48,16 +49,37 @@ use compute_api::responses::ComputeStatus;
|
||||
|
||||
use compute_tools::compute::{ComputeNode, ComputeState, ParsedSpec};
|
||||
use compute_tools::configurator::launch_configurator;
|
||||
use compute_tools::extension_server::{get_pg_version, init_remote_storage};
|
||||
use compute_tools::http::api::launch_http_server;
|
||||
use compute_tools::logger::*;
|
||||
use compute_tools::monitor::launch_monitor;
|
||||
use compute_tools::params::*;
|
||||
use compute_tools::spec::*;
|
||||
|
||||
const BUILD_TAG_DEFAULT: &str = "local";
|
||||
|
||||
fn main() -> Result<()> {
|
||||
init_tracing_and_logging(DEFAULT_LOG_LEVEL)?;
|
||||
|
||||
let build_tag = option_env!("BUILD_TAG").unwrap_or(BUILD_TAG_DEFAULT);
|
||||
|
||||
info!("build_tag: {build_tag}");
|
||||
|
||||
let matches = cli().get_matches();
|
||||
let pgbin_default = String::from("postgres");
|
||||
let pgbin = matches.get_one::<String>("pgbin").unwrap_or(&pgbin_default);
|
||||
|
||||
let remote_ext_config = matches.get_one::<String>("remote-ext-config");
|
||||
let ext_remote_storage = match remote_ext_config {
|
||||
Some(x) => match init_remote_storage(x) {
|
||||
Ok(y) => Some(y),
|
||||
Err(e) => {
|
||||
dbg!("Error {:?}, setting remote storage to None", e);
|
||||
None
|
||||
}
|
||||
},
|
||||
None => None,
|
||||
};
|
||||
|
||||
let http_port = *matches
|
||||
.get_one::<u16>("http-port")
|
||||
@@ -122,9 +144,6 @@ fn main() -> Result<()> {
|
||||
let compute_id = matches.get_one::<String>("compute-id");
|
||||
let control_plane_uri = matches.get_one::<String>("control-plane-uri");
|
||||
|
||||
// Try to use just 'postgres' if no path is provided
|
||||
let pgbin = matches.get_one::<String>("pgbin").unwrap();
|
||||
|
||||
let spec;
|
||||
let mut live_config_allowed = false;
|
||||
match spec_json {
|
||||
@@ -162,6 +181,7 @@ fn main() -> Result<()> {
|
||||
|
||||
let mut new_state = ComputeState::new();
|
||||
let spec_set;
|
||||
|
||||
if let Some(spec) = spec {
|
||||
let pspec = ParsedSpec::try_from(spec).map_err(|msg| anyhow::anyhow!(msg))?;
|
||||
new_state.pspec = Some(pspec);
|
||||
@@ -173,9 +193,13 @@ fn main() -> Result<()> {
|
||||
connstr: Url::parse(connstr).context("cannot parse connstr as a URL")?,
|
||||
pgdata: pgdata.to_string(),
|
||||
pgbin: pgbin.to_string(),
|
||||
pgversion: get_pg_version(pgbin),
|
||||
live_config_allowed,
|
||||
state: Mutex::new(new_state),
|
||||
state_changed: Condvar::new(),
|
||||
ext_remote_storage,
|
||||
available_libraries: OnceLock::new(),
|
||||
available_extensions: OnceLock::new(),
|
||||
};
|
||||
let compute = Arc::new(compute_node);
|
||||
|
||||
@@ -184,6 +208,8 @@ fn main() -> Result<()> {
|
||||
let _http_handle =
|
||||
launch_http_server(http_port, &compute).expect("cannot launch http endpoint thread");
|
||||
|
||||
let extension_server_port: u16 = http_port;
|
||||
|
||||
if !spec_set {
|
||||
// No spec provided, hang waiting for it.
|
||||
info!("no compute spec provided, waiting");
|
||||
@@ -224,7 +250,7 @@ fn main() -> Result<()> {
|
||||
// Start Postgres
|
||||
let mut delay_exit = false;
|
||||
let mut exit_code = None;
|
||||
let pg = match compute.start_compute() {
|
||||
let pg = match compute.start_compute(extension_server_port) {
|
||||
Ok(pg) => Some(pg),
|
||||
Err(err) => {
|
||||
error!("could not start the compute node: {:?}", err);
|
||||
@@ -343,6 +369,12 @@ fn cli() -> clap::Command {
|
||||
.long("control-plane-uri")
|
||||
.value_name("CONTROL_PLANE_API_BASE_URI"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("remote-ext-config")
|
||||
.short('r')
|
||||
.long("remote-ext-config")
|
||||
.value_name("REMOTE_EXT_CONFIG"),
|
||||
)
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -1,13 +1,15 @@
|
||||
use std::collections::HashMap;
|
||||
use std::fs;
|
||||
use std::os::unix::fs::PermissionsExt;
|
||||
use std::path::Path;
|
||||
use std::process::{Command, Stdio};
|
||||
use std::str::FromStr;
|
||||
use std::sync::{Condvar, Mutex};
|
||||
use std::sync::{Condvar, Mutex, OnceLock};
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use chrono::{DateTime, Utc};
|
||||
use postgres::{Client, NoTls};
|
||||
use tokio;
|
||||
use tokio_postgres;
|
||||
use tracing::{info, instrument, warn};
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
@@ -16,9 +18,11 @@ use utils::lsn::Lsn;
|
||||
use compute_api::responses::{ComputeMetrics, ComputeStatus};
|
||||
use compute_api::spec::{ComputeMode, ComputeSpec};
|
||||
|
||||
use crate::config;
|
||||
use remote_storage::{GenericRemoteStorage, RemotePath};
|
||||
|
||||
use crate::pg_helpers::*;
|
||||
use crate::spec::*;
|
||||
use crate::{config, extension_server};
|
||||
|
||||
/// Compute node info shared across several `compute_ctl` threads.
|
||||
pub struct ComputeNode {
|
||||
@@ -26,6 +30,7 @@ pub struct ComputeNode {
|
||||
pub connstr: url::Url,
|
||||
pub pgdata: String,
|
||||
pub pgbin: String,
|
||||
pub pgversion: String,
|
||||
/// We should only allow live re- / configuration of the compute node if
|
||||
/// it uses 'pull model', i.e. it can go to control-plane and fetch
|
||||
/// the latest configuration. Otherwise, there could be a case:
|
||||
@@ -45,6 +50,10 @@ pub struct ComputeNode {
|
||||
pub state: Mutex<ComputeState>,
|
||||
/// `Condvar` to allow notifying waiters about state changes.
|
||||
pub state_changed: Condvar,
|
||||
/// S3 extensions configuration variables
|
||||
pub ext_remote_storage: Option<GenericRemoteStorage>,
|
||||
pub available_libraries: OnceLock<HashMap<String, RemotePath>>,
|
||||
pub available_extensions: OnceLock<HashMap<String, Vec<RemotePath>>>,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
@@ -133,6 +142,84 @@ impl TryFrom<ComputeSpec> for ParsedSpec {
|
||||
}
|
||||
}
|
||||
|
||||
/// Create special neon_superuser role, that's a slightly nerfed version of a real superuser
|
||||
/// that we give to customers
|
||||
fn create_neon_superuser(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
||||
let roles = spec
|
||||
.cluster
|
||||
.roles
|
||||
.iter()
|
||||
.map(|r| format!("'{}'", escape_literal(&r.name)))
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let dbs = spec
|
||||
.cluster
|
||||
.databases
|
||||
.iter()
|
||||
.map(|db| format!("'{}'", escape_literal(&db.name)))
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let roles_decl = if roles.is_empty() {
|
||||
String::from("roles text[] := NULL;")
|
||||
} else {
|
||||
format!(
|
||||
r#"
|
||||
roles text[] := ARRAY(SELECT rolname
|
||||
FROM pg_catalog.pg_roles
|
||||
WHERE rolname IN ({}));"#,
|
||||
roles.join(", ")
|
||||
)
|
||||
};
|
||||
|
||||
let database_decl = if dbs.is_empty() {
|
||||
String::from("dbs text[] := NULL;")
|
||||
} else {
|
||||
format!(
|
||||
r#"
|
||||
dbs text[] := ARRAY(SELECT datname
|
||||
FROM pg_catalog.pg_database
|
||||
WHERE datname IN ({}));"#,
|
||||
dbs.join(", ")
|
||||
)
|
||||
};
|
||||
|
||||
// ALL PRIVILEGES grants CREATE, CONNECT, and TEMPORARY on all databases
|
||||
// (see https://www.postgresql.org/docs/current/ddl-priv.html)
|
||||
let query = format!(
|
||||
r#"
|
||||
DO $$
|
||||
DECLARE
|
||||
r text;
|
||||
{}
|
||||
{}
|
||||
BEGIN
|
||||
IF NOT EXISTS (
|
||||
SELECT FROM pg_catalog.pg_roles WHERE rolname = 'neon_superuser')
|
||||
THEN
|
||||
CREATE ROLE neon_superuser CREATEDB CREATEROLE NOLOGIN IN ROLE pg_read_all_data, pg_write_all_data;
|
||||
IF array_length(roles, 1) IS NOT NULL THEN
|
||||
EXECUTE format('GRANT neon_superuser TO %s',
|
||||
array_to_string(ARRAY(SELECT quote_ident(x) FROM unnest(roles) as x), ', '));
|
||||
FOREACH r IN ARRAY roles LOOP
|
||||
EXECUTE format('ALTER ROLE %s CREATEROLE CREATEDB', quote_ident(r));
|
||||
END LOOP;
|
||||
END IF;
|
||||
IF array_length(dbs, 1) IS NOT NULL THEN
|
||||
EXECUTE format('GRANT ALL PRIVILEGES ON DATABASE %s TO neon_superuser',
|
||||
array_to_string(ARRAY(SELECT quote_ident(x) FROM unnest(dbs) as x), ', '));
|
||||
END IF;
|
||||
END IF;
|
||||
END
|
||||
$$;"#,
|
||||
roles_decl, database_decl,
|
||||
);
|
||||
info!("Neon superuser created:\n{}", &query);
|
||||
client
|
||||
.simple_query(&query)
|
||||
.map_err(|e| anyhow::anyhow!(e).context(query))?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
impl ComputeNode {
|
||||
pub fn set_status(&self, status: ComputeStatus) {
|
||||
let mut state = self.state.lock().unwrap();
|
||||
@@ -245,14 +332,22 @@ impl ComputeNode {
|
||||
/// Do all the preparations like PGDATA directory creation, configuration,
|
||||
/// safekeepers sync, basebackup, etc.
|
||||
#[instrument(skip(self, compute_state))]
|
||||
pub fn prepare_pgdata(&self, compute_state: &ComputeState) -> Result<()> {
|
||||
pub fn prepare_pgdata(
|
||||
&self,
|
||||
compute_state: &ComputeState,
|
||||
extension_server_port: u16,
|
||||
) -> Result<()> {
|
||||
let pspec = compute_state.pspec.as_ref().expect("spec must be set");
|
||||
let spec = &pspec.spec;
|
||||
let pgdata_path = Path::new(&self.pgdata);
|
||||
|
||||
// Remove/create an empty pgdata directory and put configuration there.
|
||||
self.create_pgdata()?;
|
||||
config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &pspec.spec)?;
|
||||
config::write_postgres_conf(
|
||||
&pgdata_path.join("postgresql.conf"),
|
||||
&pspec.spec,
|
||||
Some(extension_server_port),
|
||||
)?;
|
||||
|
||||
// Syncing safekeepers is only safe with primary nodes: if a primary
|
||||
// is already connected it will be kicked out, so a secondary (standby)
|
||||
@@ -347,6 +442,8 @@ impl ComputeNode {
|
||||
.map_err(|_| anyhow::anyhow!("invalid connstr"))?;
|
||||
|
||||
let mut client = Client::connect(zenith_admin_connstr.as_str(), NoTls)?;
|
||||
// Disable forwarding so that users don't get a cloud_admin role
|
||||
client.simple_query("SET neon.forward_ddl = false")?;
|
||||
client.simple_query("CREATE USER cloud_admin WITH SUPERUSER")?;
|
||||
client.simple_query("GRANT zenith_admin TO cloud_admin")?;
|
||||
drop(client);
|
||||
@@ -357,24 +454,21 @@ impl ComputeNode {
|
||||
Ok(client) => client,
|
||||
};
|
||||
|
||||
// Proceed with post-startup configuration. Note, that order of operations is important.
|
||||
// Disable DDL forwarding because control plane already knows about these roles/databases.
|
||||
client.simple_query("SET neon.forward_ddl = false")?;
|
||||
|
||||
// Proceed with post-startup configuration. Note, that order of operations is important.
|
||||
let spec = &compute_state.pspec.as_ref().expect("spec must be set").spec;
|
||||
create_neon_superuser(spec, &mut client)?;
|
||||
handle_roles(spec, &mut client)?;
|
||||
handle_databases(spec, &mut client)?;
|
||||
handle_role_deletions(spec, self.connstr.as_str(), &mut client)?;
|
||||
handle_grants(spec, self.connstr.as_str(), &mut client)?;
|
||||
handle_grants(spec, self.connstr.as_str())?;
|
||||
handle_extensions(spec, &mut client)?;
|
||||
|
||||
// 'Close' connection
|
||||
drop(client);
|
||||
|
||||
info!(
|
||||
"finished configuration of compute for project {}",
|
||||
spec.cluster.cluster_id.as_deref().unwrap_or("None")
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -395,7 +489,7 @@ impl ComputeNode {
|
||||
|
||||
// Write new config
|
||||
let pgdata_path = Path::new(&self.pgdata);
|
||||
config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &spec)?;
|
||||
config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &spec, None)?;
|
||||
|
||||
let mut client = Client::connect(self.connstr.as_str(), NoTls)?;
|
||||
self.pg_reload_conf(&mut client)?;
|
||||
@@ -407,7 +501,7 @@ impl ComputeNode {
|
||||
handle_roles(&spec, &mut client)?;
|
||||
handle_databases(&spec, &mut client)?;
|
||||
handle_role_deletions(&spec, self.connstr.as_str(), &mut client)?;
|
||||
handle_grants(&spec, self.connstr.as_str(), &mut client)?;
|
||||
handle_grants(&spec, self.connstr.as_str())?;
|
||||
handle_extensions(&spec, &mut client)?;
|
||||
}
|
||||
|
||||
@@ -425,24 +519,26 @@ impl ComputeNode {
|
||||
}
|
||||
|
||||
#[instrument(skip(self))]
|
||||
pub fn start_compute(&self) -> Result<std::process::Child> {
|
||||
pub fn start_compute(&self, extension_server_port: u16) -> Result<std::process::Child> {
|
||||
let compute_state = self.state.lock().unwrap().clone();
|
||||
let spec = compute_state.pspec.as_ref().expect("spec must be set");
|
||||
let pspec = compute_state.pspec.as_ref().expect("spec must be set");
|
||||
info!(
|
||||
"starting compute for project {}, operation {}, tenant {}, timeline {}",
|
||||
spec.spec.cluster.cluster_id.as_deref().unwrap_or("None"),
|
||||
spec.spec.operation_uuid.as_deref().unwrap_or("None"),
|
||||
spec.tenant_id,
|
||||
spec.timeline_id,
|
||||
pspec.spec.cluster.cluster_id.as_deref().unwrap_or("None"),
|
||||
pspec.spec.operation_uuid.as_deref().unwrap_or("None"),
|
||||
pspec.tenant_id,
|
||||
pspec.timeline_id,
|
||||
);
|
||||
|
||||
self.prepare_pgdata(&compute_state)?;
|
||||
self.prepare_external_extensions(&compute_state)?;
|
||||
|
||||
self.prepare_pgdata(&compute_state, extension_server_port)?;
|
||||
|
||||
let start_time = Utc::now();
|
||||
|
||||
let pg = self.start_postgres(spec.storage_auth_token.clone())?;
|
||||
let pg = self.start_postgres(pspec.storage_auth_token.clone())?;
|
||||
|
||||
if spec.spec.mode == ComputeMode::Primary {
|
||||
if pspec.spec.mode == ComputeMode::Primary && !pspec.spec.skip_pg_catalog_updates {
|
||||
self.apply_config(&compute_state)?;
|
||||
}
|
||||
|
||||
@@ -462,6 +558,11 @@ impl ComputeNode {
|
||||
}
|
||||
self.set_status(ComputeStatus::Running);
|
||||
|
||||
info!(
|
||||
"finished configuration of compute for project {}",
|
||||
pspec.spec.cluster.cluster_id.as_deref().unwrap_or("None")
|
||||
);
|
||||
|
||||
Ok(pg)
|
||||
}
|
||||
|
||||
@@ -567,4 +668,131 @@ LIMIT 100",
|
||||
"{{\"pg_stat_statements\": []}}".to_string()
|
||||
}
|
||||
}
|
||||
|
||||
// If remote extension storage is configured,
|
||||
// download extension control files
|
||||
// and shared preload libraries.
|
||||
#[tokio::main]
|
||||
pub async fn prepare_external_extensions(&self, compute_state: &ComputeState) -> Result<()> {
|
||||
if let Some(ref ext_remote_storage) = self.ext_remote_storage {
|
||||
let pspec = compute_state.pspec.as_ref().expect("spec must be set");
|
||||
// download preload shared libraries before postgres start (if any)
|
||||
let spec = &pspec.spec;
|
||||
|
||||
// 1. parse private extension paths from spec
|
||||
let private_ext_prefixes = match &spec.private_extensions {
|
||||
Some(private_extensions) => private_extensions.clone(),
|
||||
None => Vec::new(),
|
||||
};
|
||||
|
||||
info!("private_ext_prefixes: {:?}", &private_ext_prefixes);
|
||||
|
||||
// 2. parse shared_preload_libraries from spec
|
||||
let mut libs_vec = Vec::new();
|
||||
|
||||
if let Some(libs) = spec.cluster.settings.find("shared_preload_libraries") {
|
||||
libs_vec = libs
|
||||
.split(&[',', '\'', ' '])
|
||||
.filter(|s| *s != "neon" && !s.is_empty())
|
||||
.map(str::to_string)
|
||||
.collect();
|
||||
}
|
||||
|
||||
info!(
|
||||
"shared_preload_libraries parsed from spec.cluster.settings: {:?}",
|
||||
libs_vec
|
||||
);
|
||||
|
||||
// also parse shared_preload_libraries from provided postgresql.conf
|
||||
// that is used in neon_local and python tests
|
||||
if let Some(conf) = &spec.cluster.postgresql_conf {
|
||||
let conf_lines = conf.split('\n').collect::<Vec<&str>>();
|
||||
|
||||
let mut shared_preload_libraries_line = "";
|
||||
for line in conf_lines {
|
||||
if line.starts_with("shared_preload_libraries") {
|
||||
shared_preload_libraries_line = line;
|
||||
}
|
||||
}
|
||||
|
||||
let mut preload_libs_vec = Vec::new();
|
||||
if let Some(libs) = shared_preload_libraries_line.split("='").nth(1) {
|
||||
preload_libs_vec = libs
|
||||
.split(&[',', '\'', ' '])
|
||||
.filter(|s| *s != "neon" && !s.is_empty())
|
||||
.map(str::to_string)
|
||||
.collect();
|
||||
}
|
||||
|
||||
info!(
|
||||
"shared_preload_libraries parsed from spec.cluster.postgresql_conf: {:?}",
|
||||
preload_libs_vec
|
||||
);
|
||||
|
||||
libs_vec.extend(preload_libs_vec);
|
||||
}
|
||||
|
||||
// download extension control files & shared_preload_libraries
|
||||
|
||||
let available_extensions = extension_server::get_available_extensions(
|
||||
ext_remote_storage,
|
||||
&self.pgbin,
|
||||
&self.pgversion,
|
||||
&private_ext_prefixes,
|
||||
)
|
||||
.await?;
|
||||
self.available_extensions
|
||||
.set(available_extensions)
|
||||
.expect("available_extensions.set error");
|
||||
|
||||
info!("Libraries to download: {:?}", &libs_vec);
|
||||
let available_libraries = extension_server::get_available_libraries(
|
||||
ext_remote_storage,
|
||||
&self.pgbin,
|
||||
&self.pgversion,
|
||||
&private_ext_prefixes,
|
||||
&libs_vec,
|
||||
)
|
||||
.await?;
|
||||
self.available_libraries
|
||||
.set(available_libraries)
|
||||
.expect("available_libraries.set error");
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn download_extension_sql_files(&self, filename: String) -> Result<()> {
|
||||
match &self.ext_remote_storage {
|
||||
None => anyhow::bail!("No remote extension storage"),
|
||||
Some(remote_storage) => {
|
||||
extension_server::download_extension_sql_files(
|
||||
&filename,
|
||||
remote_storage,
|
||||
&self.pgbin,
|
||||
self.available_extensions
|
||||
.get()
|
||||
.context("available_extensions broke")?,
|
||||
)
|
||||
.await
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn download_library_file(&self, filename: String) -> Result<()> {
|
||||
match &self.ext_remote_storage {
|
||||
None => anyhow::bail!("No remote extension storage"),
|
||||
Some(remote_storage) => {
|
||||
extension_server::download_library_file(
|
||||
&filename,
|
||||
remote_storage,
|
||||
&self.pgbin,
|
||||
self.available_libraries
|
||||
.get()
|
||||
.context("available_libraries broke")?,
|
||||
)
|
||||
.await
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -33,7 +33,11 @@ pub fn line_in_file(path: &Path, line: &str) -> Result<bool> {
|
||||
}
|
||||
|
||||
/// Create or completely rewrite configuration file specified by `path`
|
||||
pub fn write_postgres_conf(path: &Path, spec: &ComputeSpec) -> Result<()> {
|
||||
pub fn write_postgres_conf(
|
||||
path: &Path,
|
||||
spec: &ComputeSpec,
|
||||
extension_server_port: Option<u16>,
|
||||
) -> Result<()> {
|
||||
// File::create() destroys the file content if it exists.
|
||||
let mut file = File::create(path)?;
|
||||
|
||||
@@ -95,5 +99,9 @@ pub fn write_postgres_conf(path: &Path, spec: &ComputeSpec) -> Result<()> {
|
||||
writeln!(file, "# Managed by compute_ctl: end")?;
|
||||
}
|
||||
|
||||
if let Some(port) = extension_server_port {
|
||||
writeln!(file, "neon.extension_server_port={}", port)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
379
compute_tools/src/extension_server.rs
Normal file
379
compute_tools/src/extension_server.rs
Normal file
@@ -0,0 +1,379 @@
|
||||
// Download extension files from the extension store
|
||||
// and put them in the right place in the postgres directory
|
||||
use anyhow::{self, bail, Context, Result};
|
||||
use remote_storage::*;
|
||||
use serde_json::{self, Value};
|
||||
use std::collections::HashMap;
|
||||
use std::fs::File;
|
||||
use std::io::{BufWriter, Write};
|
||||
use std::num::{NonZeroU32, NonZeroUsize};
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::str;
|
||||
use tokio::io::AsyncReadExt;
|
||||
use tracing::info;
|
||||
|
||||
const SHARE_EXT_PATH: &str = "share/postgresql/extension";
|
||||
|
||||
fn get_pg_config(argument: &str, pgbin: &str) -> String {
|
||||
// gives the result of `pg_config [argument]`
|
||||
// where argument is a flag like `--version` or `--sharedir`
|
||||
let pgconfig = pgbin.replace("postgres", "pg_config");
|
||||
let config_output = std::process::Command::new(pgconfig)
|
||||
.arg(argument)
|
||||
.output()
|
||||
.expect("pg_config error");
|
||||
std::str::from_utf8(&config_output.stdout)
|
||||
.expect("pg_config error")
|
||||
.trim()
|
||||
.to_string()
|
||||
}
|
||||
|
||||
pub fn get_pg_version(pgbin: &str) -> String {
|
||||
// pg_config --version returns a (platform specific) human readable string
|
||||
// such as "PostgreSQL 15.4". We parse this to v14/v15
|
||||
let human_version = get_pg_config("--version", pgbin);
|
||||
if human_version.contains("15") {
|
||||
return "v15".to_string();
|
||||
} else if human_version.contains("14") {
|
||||
return "v14".to_string();
|
||||
}
|
||||
panic!("Unsuported postgres version {human_version}");
|
||||
}
|
||||
|
||||
async fn download_helper(
|
||||
remote_storage: &GenericRemoteStorage,
|
||||
remote_from_path: &RemotePath,
|
||||
remote_from_prefix: Option<&Path>,
|
||||
download_location: &Path,
|
||||
) -> anyhow::Result<()> {
|
||||
// downloads file at remote_from_path to download_location/[file_name]
|
||||
|
||||
// we cannot use remote_from_path.object_name() here
|
||||
// because extension files can be in subdirectories of the extension store.
|
||||
//
|
||||
// To handle this, we use remote_from_prefix to strip the prefix from the path
|
||||
// this gives us the relative path of the file in the extension store,
|
||||
// and we use this relative path to construct the local path.
|
||||
//
|
||||
let local_path = match remote_from_prefix {
|
||||
Some(prefix) => {
|
||||
let p = remote_from_path
|
||||
.get_path()
|
||||
.strip_prefix(prefix)
|
||||
.expect("bad prefix");
|
||||
|
||||
download_location.join(p)
|
||||
}
|
||||
None => download_location.join(remote_from_path.object_name().expect("bad object")),
|
||||
};
|
||||
|
||||
if local_path.exists() {
|
||||
info!("File {:?} already exists. Skipping download", &local_path);
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
info!(
|
||||
"Downloading {:?} to location {:?}",
|
||||
&remote_from_path, &local_path
|
||||
);
|
||||
let mut download = remote_storage.download(remote_from_path).await?;
|
||||
let mut write_data_buffer = Vec::new();
|
||||
download
|
||||
.download_stream
|
||||
.read_to_end(&mut write_data_buffer)
|
||||
.await?;
|
||||
if remote_from_prefix.is_some() {
|
||||
if let Some(prefix) = local_path.parent() {
|
||||
info!(
|
||||
"Downloading file with prefix. Create directory {:?}",
|
||||
prefix
|
||||
);
|
||||
// if directory already exists, this is a no-op
|
||||
std::fs::create_dir_all(prefix)?;
|
||||
}
|
||||
}
|
||||
|
||||
let mut output_file = BufWriter::new(File::create(local_path)?);
|
||||
output_file.write_all(&write_data_buffer)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// download extension control files
|
||||
//
|
||||
// if private_ext_prefixes is provided - search also in private extension paths
|
||||
//
|
||||
pub async fn get_available_extensions(
|
||||
remote_storage: &GenericRemoteStorage,
|
||||
pgbin: &str,
|
||||
pg_version: &str,
|
||||
private_ext_prefixes: &Vec<String>,
|
||||
) -> anyhow::Result<HashMap<String, Vec<RemotePath>>> {
|
||||
let local_sharedir = Path::new(&get_pg_config("--sharedir", pgbin)).join("extension");
|
||||
|
||||
let mut paths: Vec<RemotePath> = Vec::new();
|
||||
// public extensions
|
||||
paths.push(RemotePath::new(
|
||||
&Path::new(pg_version).join(SHARE_EXT_PATH),
|
||||
)?);
|
||||
// private extensions
|
||||
for private_prefix in private_ext_prefixes {
|
||||
paths.push(RemotePath::new(
|
||||
&Path::new(pg_version)
|
||||
.join(private_prefix)
|
||||
.join(SHARE_EXT_PATH),
|
||||
)?);
|
||||
}
|
||||
|
||||
let all_available_files = list_files_in_prefixes_for_extensions(remote_storage, &paths).await?;
|
||||
|
||||
info!(
|
||||
"list of available_extension files {:?}",
|
||||
&all_available_files
|
||||
);
|
||||
|
||||
// download all control files
|
||||
for (obj_name, obj_paths) in &all_available_files {
|
||||
for obj_path in obj_paths {
|
||||
if obj_name.ends_with("control") {
|
||||
download_helper(remote_storage, obj_path, None, &local_sharedir).await?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(all_available_files)
|
||||
}
|
||||
|
||||
// Download requested shared_preload_libraries
|
||||
//
|
||||
// Note that tenant_id is not optional here, because we only download libraries
|
||||
// after we know the tenant spec and the tenant_id.
|
||||
//
|
||||
// return list of all library files to use it in the future searches
|
||||
pub async fn get_available_libraries(
|
||||
remote_storage: &GenericRemoteStorage,
|
||||
pgbin: &str,
|
||||
pg_version: &str,
|
||||
private_ext_prefixes: &Vec<String>,
|
||||
preload_libraries: &Vec<String>,
|
||||
) -> anyhow::Result<HashMap<String, RemotePath>> {
|
||||
let local_libdir: PathBuf = Path::new(&get_pg_config("--pkglibdir", pgbin)).into();
|
||||
// Construct a hashmap of all available libraries
|
||||
// example (key, value) pair: test_lib0.so, v14/lib/test_lib0.so
|
||||
|
||||
let mut paths: Vec<RemotePath> = Vec::new();
|
||||
// public libraries
|
||||
paths.push(
|
||||
RemotePath::new(&Path::new(&pg_version).join("lib/"))
|
||||
.expect("The hard coded path here is valid"),
|
||||
);
|
||||
// private libraries
|
||||
for private_prefix in private_ext_prefixes {
|
||||
paths.push(
|
||||
RemotePath::new(&Path::new(&pg_version).join(private_prefix).join("lib"))
|
||||
.expect("The hard coded path here is valid"),
|
||||
);
|
||||
}
|
||||
|
||||
let all_available_libraries = list_files_in_prefixes(remote_storage, &paths).await?;
|
||||
|
||||
info!("list of library files {:?}", &all_available_libraries);
|
||||
|
||||
// download all requested libraries
|
||||
for lib_name in preload_libraries {
|
||||
// add file extension if it isn't in the filename
|
||||
let lib_name_with_ext = enforce_so_end(lib_name);
|
||||
info!("looking for library {:?}", &lib_name_with_ext);
|
||||
|
||||
match all_available_libraries.get(&*lib_name_with_ext) {
|
||||
Some(remote_path) => {
|
||||
download_helper(remote_storage, remote_path, None, &local_libdir).await?
|
||||
}
|
||||
None => {
|
||||
let file_path = local_libdir.join(&lib_name_with_ext);
|
||||
if file_path.exists() {
|
||||
info!("File {:?} already exists. Skipping download", &file_path);
|
||||
} else {
|
||||
bail!("Shared library file {lib_name} is not found in the extension store")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(all_available_libraries)
|
||||
}
|
||||
|
||||
// download all sqlfiles (and possibly data files) for a given extension name
|
||||
//
|
||||
pub async fn download_extension_sql_files(
|
||||
ext_name: &str,
|
||||
remote_storage: &GenericRemoteStorage,
|
||||
pgbin: &str,
|
||||
all_available_files: &HashMap<String, Vec<RemotePath>>,
|
||||
) -> Result<()> {
|
||||
let local_sharedir = Path::new(&get_pg_config("--sharedir", pgbin)).join("extension");
|
||||
let mut downloaded_something = false;
|
||||
|
||||
if let Some(files) = all_available_files.get(ext_name) {
|
||||
for file in files {
|
||||
if file.extension().context("bad file name")? != "control" {
|
||||
// find files prefix to handle cases when extension files are stored
|
||||
// in a directory with the same name as the extension
|
||||
// example:
|
||||
// share/postgresql/extension/extension_name/extension_name--1.0.sql
|
||||
let index = file
|
||||
.get_path()
|
||||
.to_str()
|
||||
.context("invalid path")?
|
||||
.find(ext_name)
|
||||
.context("invalid path")?;
|
||||
|
||||
let prefix_str =
|
||||
file.get_path().to_str().context("invalid path")?[..index].to_string();
|
||||
let remote_from_prefix = if prefix_str.is_empty() {
|
||||
None
|
||||
} else {
|
||||
Some(Path::new(&prefix_str))
|
||||
};
|
||||
|
||||
download_helper(remote_storage, file, remote_from_prefix, &local_sharedir).await?;
|
||||
downloaded_something = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
if !downloaded_something {
|
||||
bail!("Files for extension {ext_name} are not found in the extension store");
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// appends an .so suffix to libname if it does not already have one
|
||||
fn enforce_so_end(libname: &str) -> String {
|
||||
if !libname.ends_with(".so") {
|
||||
format!("{}.so", libname)
|
||||
} else {
|
||||
libname.to_string()
|
||||
}
|
||||
}
|
||||
|
||||
// download shared library file
|
||||
pub async fn download_library_file(
|
||||
lib_name: &str,
|
||||
remote_storage: &GenericRemoteStorage,
|
||||
pgbin: &str,
|
||||
all_available_libraries: &HashMap<String, RemotePath>,
|
||||
) -> Result<()> {
|
||||
let local_libdir: PathBuf = Path::new(&get_pg_config("--pkglibdir", pgbin)).into();
|
||||
let lib_name_with_ext = enforce_so_end(lib_name);
|
||||
info!("looking for library {:?}", &lib_name_with_ext);
|
||||
match all_available_libraries.get(&*lib_name_with_ext) {
|
||||
Some(remote_path) => {
|
||||
download_helper(remote_storage, remote_path, None, &local_libdir).await?
|
||||
}
|
||||
None => bail!("Shared library file {lib_name} is not found in the extension store"),
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn init_remote_storage(remote_ext_config: &str) -> anyhow::Result<GenericRemoteStorage> {
|
||||
let remote_ext_config: serde_json::Value = serde_json::from_str(remote_ext_config)?;
|
||||
let remote_ext_bucket = match &remote_ext_config["bucket"] {
|
||||
Value::String(x) => x,
|
||||
_ => bail!("remote_ext_config missing bucket"),
|
||||
};
|
||||
let remote_ext_region = match &remote_ext_config["region"] {
|
||||
Value::String(x) => x,
|
||||
_ => bail!("remote_ext_config missing region"),
|
||||
};
|
||||
let remote_ext_endpoint = match &remote_ext_config["endpoint"] {
|
||||
Value::String(x) => Some(x.clone()),
|
||||
_ => None,
|
||||
};
|
||||
let remote_ext_prefix = match &remote_ext_config["prefix"] {
|
||||
Value::String(x) => Some(x.clone()),
|
||||
_ => None,
|
||||
};
|
||||
|
||||
// load will not be large, so default parameters are fine
|
||||
let config = S3Config {
|
||||
bucket_name: remote_ext_bucket.to_string(),
|
||||
bucket_region: remote_ext_region.to_string(),
|
||||
prefix_in_bucket: remote_ext_prefix,
|
||||
endpoint: remote_ext_endpoint,
|
||||
concurrency_limit: NonZeroUsize::new(100).expect("100 != 0"),
|
||||
max_keys_per_list_response: None,
|
||||
};
|
||||
let config = RemoteStorageConfig {
|
||||
max_concurrent_syncs: NonZeroUsize::new(100).expect("100 != 0"),
|
||||
max_sync_errors: NonZeroU32::new(100).expect("100 != 0"),
|
||||
storage: RemoteStorageKind::AwsS3(config),
|
||||
};
|
||||
GenericRemoteStorage::from_config(&config)
|
||||
}
|
||||
|
||||
// helper to collect all files in the given prefixes
|
||||
// returns hashmap of (file_name, file_remote_path)
|
||||
async fn list_files_in_prefixes(
|
||||
remote_storage: &GenericRemoteStorage,
|
||||
paths: &Vec<RemotePath>,
|
||||
) -> Result<HashMap<String, RemotePath>> {
|
||||
let mut res = HashMap::new();
|
||||
|
||||
for path in paths {
|
||||
for file in remote_storage.list_files(Some(path)).await? {
|
||||
res.insert(
|
||||
file.object_name().expect("bad object").to_owned(),
|
||||
file.to_owned(),
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(res)
|
||||
}
|
||||
|
||||
// helper to extract extension name
|
||||
// extension files can be in subdirectories of the extension store.
|
||||
// examples of layout:
|
||||
//
|
||||
// share/postgresql/extension/extension_name--1.0.sql
|
||||
//
|
||||
// or
|
||||
//
|
||||
// share/postgresql/extension/extension_name/extension_name--1.0.sql
|
||||
// share/postgresql/extension/extension_name/extra_data.csv
|
||||
//
|
||||
// Note: we **assume** that the extension files is in one of these formats.
|
||||
// If it is not, this code will not download it.
|
||||
fn get_ext_name(path: &str) -> Result<&str> {
|
||||
let path_suffix: Vec<&str> = path.split(&format!("{SHARE_EXT_PATH}/")).collect();
|
||||
|
||||
let path_suffix = path_suffix.last().expect("bad ext name");
|
||||
// the order of these is important
|
||||
// otherwise we'll return incorrect extension name
|
||||
// for path like share/postgresql/extension/extension_name/extension_name--1.0.sql
|
||||
for index in ["/", "--"] {
|
||||
if let Some(index) = path_suffix.find(index) {
|
||||
return Ok(&path_suffix[..index]);
|
||||
}
|
||||
}
|
||||
Ok(path_suffix)
|
||||
}
|
||||
|
||||
// helper to collect files of given prefixes for extensions
|
||||
// and group them by extension
|
||||
// returns a hashmap of (extension_name, Vector of remote paths for all files needed for this extension)
|
||||
async fn list_files_in_prefixes_for_extensions(
|
||||
remote_storage: &GenericRemoteStorage,
|
||||
paths: &Vec<RemotePath>,
|
||||
) -> Result<HashMap<String, Vec<RemotePath>>> {
|
||||
let mut result = HashMap::new();
|
||||
for path in paths {
|
||||
for file in remote_storage.list_files(Some(path)).await? {
|
||||
let file_ext_name = get_ext_name(file.get_path().to_str().context("invalid path")?)?;
|
||||
let ext_file_list = result
|
||||
.entry(file_ext_name.to_string())
|
||||
.or_insert(Vec::new());
|
||||
ext_file_list.push(file.to_owned());
|
||||
}
|
||||
}
|
||||
Ok(result)
|
||||
}
|
||||
@@ -121,8 +121,62 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
|
||||
}
|
||||
}
|
||||
|
||||
// download extension files from S3 on demand
|
||||
(&Method::POST, route) if route.starts_with("/extension_server/") => {
|
||||
info!("serving {:?} POST request", route);
|
||||
info!("req.uri {:?}", req.uri());
|
||||
|
||||
let mut is_library = false;
|
||||
|
||||
if let Some(params) = req.uri().query() {
|
||||
info!("serving {:?} POST request with params: {}", route, params);
|
||||
|
||||
if params == "is_library=true" {
|
||||
is_library = true;
|
||||
} else {
|
||||
let mut resp = Response::new(Body::from("Wrong request parameters"));
|
||||
*resp.status_mut() = StatusCode::BAD_REQUEST;
|
||||
return resp;
|
||||
}
|
||||
}
|
||||
|
||||
let filename = route.split('/').last().unwrap().to_string();
|
||||
|
||||
info!(
|
||||
"serving /extension_server POST request, filename: {:?} is_library: {}",
|
||||
filename, is_library
|
||||
);
|
||||
|
||||
if is_library {
|
||||
match compute.download_library_file(filename.to_string()).await {
|
||||
Ok(_) => Response::new(Body::from("OK")),
|
||||
Err(e) => {
|
||||
error!("library download failed: {}", e);
|
||||
let mut resp = Response::new(Body::from(e.to_string()));
|
||||
*resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
|
||||
resp
|
||||
}
|
||||
}
|
||||
} else {
|
||||
match compute
|
||||
.download_extension_sql_files(filename.to_string())
|
||||
.await
|
||||
{
|
||||
Ok(_) => Response::new(Body::from("OK")),
|
||||
Err(e) => {
|
||||
error!("extension download failed: {}", e);
|
||||
let mut resp = Response::new(Body::from(e.to_string()));
|
||||
*resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR;
|
||||
resp
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Return the `404 Not Found` for any other routes.
|
||||
_ => {
|
||||
method => {
|
||||
info!("404 Not Found for {:?}", method);
|
||||
|
||||
let mut not_found = Response::new(Body::from("404 Not Found"));
|
||||
*not_found.status_mut() = StatusCode::NOT_FOUND;
|
||||
not_found
|
||||
|
||||
@@ -139,6 +139,34 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/GenericError"
|
||||
/extension_server:
|
||||
post:
|
||||
tags:
|
||||
- Extension
|
||||
summary: Download extension from S3 to local folder.
|
||||
description: ""
|
||||
operationId: downloadExtension
|
||||
responses:
|
||||
200:
|
||||
description: Extension downloaded
|
||||
content:
|
||||
text/plain:
|
||||
schema:
|
||||
type: string
|
||||
description: Error text or 'OK' if download succeeded.
|
||||
example: "OK"
|
||||
400:
|
||||
description: Request is invalid.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/GenericError"
|
||||
500:
|
||||
description: Extension download request failed.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/GenericError"
|
||||
|
||||
components:
|
||||
securitySchemes:
|
||||
|
||||
@@ -9,6 +9,7 @@ pub mod http;
|
||||
#[macro_use]
|
||||
pub mod logger;
|
||||
pub mod compute;
|
||||
pub mod extension_server;
|
||||
pub mod monitor;
|
||||
pub mod params;
|
||||
pub mod pg_helpers;
|
||||
|
||||
@@ -17,7 +17,7 @@ use compute_api::spec::{Database, GenericOption, GenericOptions, PgIdent, Role};
|
||||
const POSTGRES_WAIT_TIMEOUT: Duration = Duration::from_millis(60 * 1000); // milliseconds
|
||||
|
||||
/// Escape a string for including it in a SQL literal
|
||||
fn escape_literal(s: &str) -> String {
|
||||
pub fn escape_literal(s: &str) -> String {
|
||||
s.replace('\'', "''").replace('\\', "\\\\")
|
||||
}
|
||||
|
||||
|
||||
@@ -124,7 +124,7 @@ pub fn get_spec_from_control_plane(
|
||||
pub fn handle_configuration(spec: &ComputeSpec, pgdata_path: &Path) -> Result<()> {
|
||||
// File `postgresql.conf` is no longer included into `basebackup`, so just
|
||||
// always write all config into it creating new file.
|
||||
config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), spec)?;
|
||||
config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), spec, None)?;
|
||||
|
||||
update_pg_hba(pgdata_path)?;
|
||||
|
||||
@@ -269,17 +269,13 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
||||
xact.execute(query.as_str(), &[])?;
|
||||
}
|
||||
RoleAction::Create => {
|
||||
let mut query: String = format!("CREATE ROLE {} ", name.pg_quote());
|
||||
let mut query: String = format!(
|
||||
"CREATE ROLE {} CREATEROLE CREATEDB IN ROLE neon_superuser",
|
||||
name.pg_quote()
|
||||
);
|
||||
info!("role create query: '{}'", &query);
|
||||
query.push_str(&role.to_pg_options());
|
||||
xact.execute(query.as_str(), &[])?;
|
||||
|
||||
let grant_query = format!(
|
||||
"GRANT pg_read_all_data, pg_write_all_data TO {}",
|
||||
name.pg_quote()
|
||||
);
|
||||
xact.execute(grant_query.as_str(), &[])?;
|
||||
info!("role grant query: '{}'", &grant_query);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -476,6 +472,11 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
||||
query.push_str(&db.to_pg_options());
|
||||
let _guard = info_span!("executing", query).entered();
|
||||
client.execute(query.as_str(), &[])?;
|
||||
let grant_query: String = format!(
|
||||
"GRANT ALL PRIVILEGES ON DATABASE {} TO neon_superuser",
|
||||
name.pg_quote()
|
||||
);
|
||||
client.execute(grant_query.as_str(), &[])?;
|
||||
}
|
||||
};
|
||||
|
||||
@@ -495,35 +496,9 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
||||
/// Grant CREATE ON DATABASE to the database owner and do some other alters and grants
|
||||
/// to allow users creating trusted extensions and re-creating `public` schema, for example.
|
||||
#[instrument(skip_all)]
|
||||
pub fn handle_grants(spec: &ComputeSpec, connstr: &str, client: &mut Client) -> Result<()> {
|
||||
pub fn handle_grants(spec: &ComputeSpec, connstr: &str) -> Result<()> {
|
||||
info!("cluster spec grants:");
|
||||
|
||||
// We now have a separate `web_access` role to connect to the database
|
||||
// via the web interface and proxy link auth. And also we grant a
|
||||
// read / write all data privilege to every role. So also grant
|
||||
// create to everyone.
|
||||
// XXX: later we should stop messing with Postgres ACL in such horrible
|
||||
// ways.
|
||||
let roles = spec
|
||||
.cluster
|
||||
.roles
|
||||
.iter()
|
||||
.map(|r| r.name.pg_quote())
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
for db in &spec.cluster.databases {
|
||||
let dbname = &db.name;
|
||||
|
||||
let query: String = format!(
|
||||
"GRANT CREATE ON DATABASE {} TO {}",
|
||||
dbname.pg_quote(),
|
||||
roles.join(", ")
|
||||
);
|
||||
info!("grant query {}", &query);
|
||||
|
||||
client.execute(query.as_str(), &[])?;
|
||||
}
|
||||
|
||||
// Do some per-database access adjustments. We'd better do this at db creation time,
|
||||
// but CREATE DATABASE isn't transactional. So we cannot create db + do some grants
|
||||
// atomically.
|
||||
|
||||
@@ -32,3 +32,4 @@ utils.workspace = true
|
||||
|
||||
compute_api.workspace = true
|
||||
workspace_hack.workspace = true
|
||||
tracing.workspace = true
|
||||
|
||||
@@ -657,6 +657,8 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
|
||||
.get_one::<String>("endpoint_id")
|
||||
.ok_or_else(|| anyhow!("No endpoint ID was provided to start"))?;
|
||||
|
||||
let remote_ext_config = sub_args.get_one::<String>("remote-ext-config");
|
||||
|
||||
// If --safekeepers argument is given, use only the listed safekeeper nodes.
|
||||
let safekeepers =
|
||||
if let Some(safekeepers_str) = sub_args.get_one::<String>("safekeepers") {
|
||||
@@ -698,7 +700,7 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
|
||||
_ => {}
|
||||
}
|
||||
println!("Starting existing endpoint {endpoint_id}...");
|
||||
endpoint.start(&auth_token, safekeepers)?;
|
||||
endpoint.start(&auth_token, safekeepers, remote_ext_config)?;
|
||||
} else {
|
||||
let branch_name = sub_args
|
||||
.get_one::<String>("branch-name")
|
||||
@@ -742,7 +744,7 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
|
||||
pg_version,
|
||||
mode,
|
||||
)?;
|
||||
ep.start(&auth_token, safekeepers)?;
|
||||
ep.start(&auth_token, safekeepers, remote_ext_config)?;
|
||||
}
|
||||
}
|
||||
"stop" => {
|
||||
@@ -1002,6 +1004,12 @@ fn cli() -> Command {
|
||||
.help("Additional pageserver's configuration options or overrides, refer to pageserver's 'config-override' CLI parameter docs for more")
|
||||
.required(false);
|
||||
|
||||
let remote_ext_config_args = Arg::new("remote-ext-config")
|
||||
.long("remote-ext-config")
|
||||
.num_args(1)
|
||||
.help("Configure the S3 bucket that we search for extensions in.")
|
||||
.required(false);
|
||||
|
||||
let lsn_arg = Arg::new("lsn")
|
||||
.long("lsn")
|
||||
.help("Specify Lsn on the timeline to start from. By default, end of the timeline would be used.")
|
||||
@@ -1152,6 +1160,7 @@ fn cli() -> Command {
|
||||
.arg(pg_version_arg)
|
||||
.arg(hot_standby_arg)
|
||||
.arg(safekeepers_arg)
|
||||
.arg(remote_ext_config_args)
|
||||
)
|
||||
.subcommand(
|
||||
Command::new("stop")
|
||||
|
||||
@@ -67,6 +67,7 @@ pub struct EndpointConf {
|
||||
pg_port: u16,
|
||||
http_port: u16,
|
||||
pg_version: u32,
|
||||
skip_pg_catalog_updates: bool,
|
||||
}
|
||||
|
||||
//
|
||||
@@ -135,6 +136,7 @@ impl ComputeControlPlane {
|
||||
mode,
|
||||
tenant_id,
|
||||
pg_version,
|
||||
skip_pg_catalog_updates: false,
|
||||
});
|
||||
|
||||
ep.create_endpoint_dir()?;
|
||||
@@ -148,6 +150,7 @@ impl ComputeControlPlane {
|
||||
http_port,
|
||||
pg_port,
|
||||
pg_version,
|
||||
skip_pg_catalog_updates: false,
|
||||
})?,
|
||||
)?;
|
||||
std::fs::write(
|
||||
@@ -183,6 +186,9 @@ pub struct Endpoint {
|
||||
// the endpoint runs in.
|
||||
pub env: LocalEnv,
|
||||
pageserver: Arc<PageServerNode>,
|
||||
|
||||
// Optimizations
|
||||
skip_pg_catalog_updates: bool,
|
||||
}
|
||||
|
||||
impl Endpoint {
|
||||
@@ -216,6 +222,7 @@ impl Endpoint {
|
||||
mode: conf.mode,
|
||||
tenant_id: conf.tenant_id,
|
||||
pg_version: conf.pg_version,
|
||||
skip_pg_catalog_updates: conf.skip_pg_catalog_updates,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -304,7 +311,7 @@ impl Endpoint {
|
||||
|
||||
// TODO: use future host field from safekeeper spec
|
||||
// Pass the list of safekeepers to the replica so that it can connect to any of them,
|
||||
// whichever is availiable.
|
||||
// whichever is available.
|
||||
let sk_ports = self
|
||||
.env
|
||||
.safekeepers
|
||||
@@ -401,7 +408,12 @@ impl Endpoint {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn start(&self, auth_token: &Option<String>, safekeepers: Vec<NodeId>) -> Result<()> {
|
||||
pub fn start(
|
||||
&self,
|
||||
auth_token: &Option<String>,
|
||||
safekeepers: Vec<NodeId>,
|
||||
remote_ext_config: Option<&String>,
|
||||
) -> Result<()> {
|
||||
if self.status() == "running" {
|
||||
anyhow::bail!("The endpoint is already running");
|
||||
}
|
||||
@@ -450,6 +462,7 @@ impl Endpoint {
|
||||
|
||||
// Create spec file
|
||||
let spec = ComputeSpec {
|
||||
skip_pg_catalog_updates: self.skip_pg_catalog_updates,
|
||||
format_version: 1.0,
|
||||
operation_uuid: None,
|
||||
cluster: Cluster {
|
||||
@@ -468,6 +481,7 @@ impl Endpoint {
|
||||
pageserver_connstring: Some(pageserver_connstring),
|
||||
safekeeper_connstrings,
|
||||
storage_auth_token: auth_token.clone(),
|
||||
private_extensions: Some(vec![self.tenant_id.to_string()]), //DEBUG ONLY
|
||||
};
|
||||
let spec_path = self.endpoint_path().join("spec.json");
|
||||
std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?;
|
||||
@@ -499,6 +513,9 @@ impl Endpoint {
|
||||
.stdin(std::process::Stdio::null())
|
||||
.stderr(logfile.try_clone()?)
|
||||
.stdout(logfile);
|
||||
if let Some(remote_ext_config) = remote_ext_config {
|
||||
cmd.args(["--remote-ext-config", remote_ext_config]);
|
||||
}
|
||||
let _child = cmd.spawn()?;
|
||||
|
||||
// Wait for it to start
|
||||
|
||||
301
docs/rfcs/024-extension-loading.md
Normal file
301
docs/rfcs/024-extension-loading.md
Normal file
@@ -0,0 +1,301 @@
|
||||
# Supporting custom user Extensions
|
||||
|
||||
Created 2023-05-03
|
||||
|
||||
## Motivation
|
||||
|
||||
There are many extensions in the PostgreSQL ecosystem, and not all extensions
|
||||
are of a quality that we can confidently support them. Additionally, our
|
||||
current extension inclusion mechanism has several problems because we build all
|
||||
extensions into the primary Compute image: We build the extensions every time
|
||||
we build the compute image regardless of whether we actually need to rebuild
|
||||
the image, and the inclusion of these extensions in the image adds a hard
|
||||
dependency on all supported extensions - thus increasing the image size, and
|
||||
with it the time it takes to download that image - increasing first start
|
||||
latency.
|
||||
|
||||
This RFC proposes a dynamic loading mechanism that solves most of these
|
||||
problems.
|
||||
|
||||
## Summary
|
||||
|
||||
`compute_ctl` is made responsible for loading extensions on-demand into
|
||||
the container's file system for dynamically loaded extensions, and will also
|
||||
make sure that the extensions in `shared_preload_libraries` are downloaded
|
||||
before the compute node starts.
|
||||
|
||||
## Components
|
||||
|
||||
compute_ctl, PostgreSQL, neon (extension), Compute Host Node, Extension Store
|
||||
|
||||
## Requirements
|
||||
|
||||
Compute nodes with no extra extensions should not be negatively impacted by
|
||||
the existence of support for many extensions.
|
||||
|
||||
Installing an extension into PostgreSQL should be easy.
|
||||
|
||||
Non-preloaded extensions shouldn't impact startup latency.
|
||||
|
||||
Uninstalled extensions shouldn't impact query latency.
|
||||
|
||||
A small latency penalty for dynamically loaded extensions is acceptable in
|
||||
the first seconds of compute startup, but not in steady-state operations.
|
||||
|
||||
## Proposed implementation
|
||||
|
||||
### On-demand, JIT-loading of extensions
|
||||
|
||||
TLDR; we download extensions as soon as we need them, or when we have spare
|
||||
time.
|
||||
|
||||
That means, we first download the extensions required to start the PostMaster
|
||||
(`shared_preload_libraries` and their dependencies), then the libraries required
|
||||
before a backend can start processing user input (`preload_libraries` and
|
||||
dependencies), and then (with network limits applied) the remainder of the
|
||||
configured extensions, with prioritization for installed extensions.
|
||||
|
||||
If PostgreSQL tries to load a library that is not yet fully on disk, it will
|
||||
ask `compute_ctl` first if the extension has been downloaded yet, and will wait
|
||||
for `compute_ctl` to finish downloading that extension. `compute_ctl` will
|
||||
prioritize downloading that extension over other extensions that were not yet
|
||||
requested.
|
||||
|
||||
#### Workflow
|
||||
|
||||
```mermaid
|
||||
sequenceDiagram
|
||||
autonumber
|
||||
participant EX as External (control plane, ...)
|
||||
participant CTL as compute_ctl
|
||||
participant ST as extension store
|
||||
actor PG as PostgreSQL
|
||||
|
||||
EX ->>+ CTL: Start compute with config X
|
||||
|
||||
note over CTL: The configuration contains a list of all <br/>extensions available to that compute node, etc.
|
||||
|
||||
par Optionally parallel or concurrent
|
||||
loop Available extensions
|
||||
CTL ->>+ ST: Download control file of extension
|
||||
activate CTL
|
||||
ST ->>- CTL: Finish downloading control file
|
||||
CTL ->>- CTL: Put control file in extensions directory
|
||||
end
|
||||
|
||||
loop For each extension in shared_preload_libraries
|
||||
CTL ->>+ ST: Download extension's data
|
||||
activate CTL
|
||||
ST ->>- CTL: Finish downloading
|
||||
CTL ->>- CTL: Put extension's files in the right place
|
||||
end
|
||||
end
|
||||
|
||||
CTL ->>+ PG: Start PostgreSQL
|
||||
|
||||
note over CTL: PostgreSQL can now start accepting <br/>connections. However, users may still need to wait <br/>for preload_libraries extensions to get downloaded.
|
||||
|
||||
par Load preload_libraries
|
||||
loop For each extension in preload_libraries
|
||||
CTL ->>+ ST: Download extension's data
|
||||
activate CTL
|
||||
ST ->>- CTL: Finish downloading
|
||||
CTL ->>- CTL: Put extension's files in the right place
|
||||
end
|
||||
end
|
||||
|
||||
note over CTL: After this, connections don't have any hard <br/>waits for extension files left, except for those <br/>connections that override preload_libraries <br/>in their startup packet
|
||||
|
||||
par PG's internal_load_library(library)
|
||||
alt Library is not yet loaded
|
||||
PG ->>+ CTL: Load library X
|
||||
CTL ->>+ ST: Download the extension that provides X
|
||||
ST ->>- CTL: Finish downloading
|
||||
CTL ->> CTL: Put extension's files in the right place
|
||||
CTL ->>- PG: Ready
|
||||
else Library is already loaded
|
||||
note over PG: No-op
|
||||
end
|
||||
and Download all remaining extensions
|
||||
loop Extension X
|
||||
CTL ->>+ ST: Download not-yet-downloaded extension X
|
||||
activate CTL
|
||||
ST ->>- CTL: Finish downloading
|
||||
CTL ->>- CTL: Put extension's files in the right place
|
||||
end
|
||||
end
|
||||
|
||||
deactivate PG
|
||||
deactivate CTL
|
||||
```
|
||||
|
||||
#### Summary
|
||||
|
||||
Pros:
|
||||
- Startup is only as slow as it takes to load all (shared_)preload_libraries
|
||||
- Supports BYO Extension
|
||||
|
||||
Cons:
|
||||
- O(sizeof(extensions)) IO requirement for loading all extensions.
|
||||
|
||||
### Alternative solutions
|
||||
|
||||
1. Allow users to add their extensions to the base image
|
||||
|
||||
Pros:
|
||||
- Easy to deploy
|
||||
|
||||
Cons:
|
||||
- Doesn't scale - first start size is dependent on image size;
|
||||
- All extensions are shared across all users: It doesn't allow users to
|
||||
bring their own restrictive-licensed extensions
|
||||
|
||||
2. Bring Your Own compute image
|
||||
|
||||
Pros:
|
||||
- Still easy to deploy
|
||||
- User can bring own patched version of PostgreSQL
|
||||
|
||||
Cons:
|
||||
- First start latency is O(sizeof(extensions image))
|
||||
- Warm instance pool for skipping pod schedule latency is not feasible with
|
||||
O(n) custom images
|
||||
- Support channels are difficult to manage
|
||||
|
||||
3. Download all user extensions in bulk on compute start
|
||||
|
||||
Pros:
|
||||
- Easy to deploy
|
||||
- No startup latency issues for "clean" users.
|
||||
- Warm instance pool for skipping pod schedule latency is possible
|
||||
|
||||
Cons:
|
||||
- Downloading all extensions in advance takes a lot of time, thus startup
|
||||
latency issues
|
||||
|
||||
4. Store user's extensions in persistent storage
|
||||
|
||||
Pros:
|
||||
- Easy to deploy
|
||||
- No startup latency issues
|
||||
- Warm instance pool for skipping pod schedule latency is possible
|
||||
|
||||
Cons:
|
||||
- EC2 instances have only limited number of attachments shared between EBS
|
||||
volumes, direct-attached NVMe drives, and ENIs.
|
||||
- Compute instance migration isn't trivially solved for EBS mounts (e.g.
|
||||
the device is unavailable whilst moving the mount between instances).
|
||||
- EBS can only mount on one instance at a time (except the expensive IO2
|
||||
device type).
|
||||
|
||||
5. Store user's extensions in network drive
|
||||
|
||||
Pros:
|
||||
- Easy to deploy
|
||||
- Few startup latency issues
|
||||
- Warm instance pool for skipping pod schedule latency is possible
|
||||
|
||||
Cons:
|
||||
- We'd need networked drives, and a lot of them, which would store many
|
||||
duplicate extensions.
|
||||
- **UNCHECKED:** Compute instance migration may not work nicely with
|
||||
networked IOs
|
||||
|
||||
|
||||
### Idea extensions
|
||||
|
||||
The extension store does not have to be S3 directly, but could be a Node-local
|
||||
caching service on top of S3. This would reduce the load on the network for
|
||||
popular extensions.
|
||||
|
||||
## Extension Store implementation
|
||||
|
||||
Extension Store in our case is a private S3 bucket.
|
||||
Extensions are stored as tarballs in the bucket. The tarball contains the extension's control file and all the files that the extension needs to run.
|
||||
|
||||
We may also store the control file separately from the tarball to speed up the extension loading.
|
||||
|
||||
`s3://<the-bucket>/extensions/ext-name/sha-256+1234abcd1234abcd1234abcd1234abcd/bundle.tar`
|
||||
|
||||
where `ext-name` is an extension name and `sha-256+1234abcd1234abcd1234abcd1234abcd` is a hash of a specific extension version tarball.
|
||||
|
||||
To ensure security, there is no direct access to the S3 bucket from compute node.
|
||||
|
||||
Control plane forms a list of extensions available to the compute node
|
||||
and forms a short-lived [pre-signed URL](https://docs.aws.amazon.com/AmazonS3/latest/userguide/ShareObjectPreSignedURL.html)
|
||||
for each extension that is available to the compute node.
|
||||
|
||||
so, `compute_ctl` receives spec in the following format
|
||||
|
||||
```
|
||||
"extensions": [{
|
||||
"meta_format": 1,
|
||||
"extension_name": "postgis",
|
||||
"link": "https://<the-bucket>/extensions/sha-256+1234abcd1234abcd1234abcd1234abcd/bundle.tar?AWSAccessKeyId=1234abcd1234abcd1234abcd1234abcd&Expires=1234567890&Signature=1234abcd1234abcd1234abcd1234abcd",
|
||||
...
|
||||
}]
|
||||
```
|
||||
|
||||
`compute_ctl` then downloads the extension from the link and unpacks it to the right place.
|
||||
|
||||
### How do we handle private extensions?
|
||||
|
||||
Private and public extensions are treated equally from the Extension Store perspective.
|
||||
The only difference is that the private extensions are not listed in the user UI (managed by control plane).
|
||||
|
||||
### How to add new extension to the Extension Store?
|
||||
|
||||
Since we need to verify that the extension is compatible with the compute node and doesn't contain any malicious code,
|
||||
we need to review the extension before adding it to the Extension Store.
|
||||
|
||||
I do not expect that we will have a lot of extensions to review, so we can do it manually for now.
|
||||
|
||||
Some admin UI may be added later to automate this process.
|
||||
|
||||
The list of extensions available to a compute node is stored in the console database.
|
||||
|
||||
### How is the list of available extensions managed?
|
||||
|
||||
We need to add new tables to the console database to store the list of available extensions, their versions and access rights.
|
||||
|
||||
something like this:
|
||||
|
||||
```
|
||||
CREATE TABLE extensions (
|
||||
id SERIAL PRIMARY KEY,
|
||||
name VARCHAR(255) NOT NULL,
|
||||
version VARCHAR(255) NOT NULL,
|
||||
hash VARCHAR(255) NOT NULL, // this is the path to the extension in the Extension Store
|
||||
supported_postgres_versions integer[] NOT NULL,
|
||||
is_public BOOLEAN NOT NULL, // public extensions are available to all users
|
||||
is_shared_preload BOOLEAN NOT NULL, // these extensions require postgres restart
|
||||
is_preload BOOLEAN NOT NULL,
|
||||
license VARCHAR(255) NOT NULL,
|
||||
);
|
||||
|
||||
CREATE TABLE user_extensions (
|
||||
user_id INTEGER NOT NULL,
|
||||
extension_id INTEGER NOT NULL,
|
||||
FOREIGN KEY (user_id) REFERENCES users (id),
|
||||
FOREIGN KEY (extension_id) REFERENCES extensions (id)
|
||||
);
|
||||
```
|
||||
|
||||
When new extension is added to the Extension Store, we add a new record to the table and set permissions.
|
||||
|
||||
In UI, user may select the extensions that they want to use with their compute node.
|
||||
|
||||
NOTE: Extensions that require postgres restart will not be available until the next compute restart.
|
||||
Also, currently user cannot force postgres restart. We should add this feature later.
|
||||
|
||||
For other extensions, we must communicate updates to `compute_ctl` and they will be downloaded in the background.
|
||||
|
||||
### How can user update the extension?
|
||||
|
||||
User can update the extension by selecting the new version of the extension in the UI.
|
||||
|
||||
### Alternatives
|
||||
|
||||
For extensions written on trusted languages we can also adopt
|
||||
`dbdev` PostgreSQL Package Manager based on `pg_tle` by Supabase.
|
||||
This will increase the amount supported extensions and decrease the amount of work required to support them.
|
||||
@@ -27,6 +27,12 @@ pub struct ComputeSpec {
|
||||
pub cluster: Cluster,
|
||||
pub delta_operations: Option<Vec<DeltaOp>>,
|
||||
|
||||
/// An optinal hint that can be passed to speed up startup time if we know
|
||||
/// that no pg catalog mutations (like role creation, database creation,
|
||||
/// extension creation) need to be done on the actual database to start.
|
||||
#[serde(default)] // Default false
|
||||
pub skip_pg_catalog_updates: bool,
|
||||
|
||||
// Information needed to connect to the storage layer.
|
||||
//
|
||||
// `tenant_id`, `timeline_id` and `pageserver_connstring` are always needed.
|
||||
@@ -54,6 +60,8 @@ pub struct ComputeSpec {
|
||||
/// If set, 'storage_auth_token' is used as the password to authenticate to
|
||||
/// the pageserver and safekeepers.
|
||||
pub storage_auth_token: Option<String>,
|
||||
|
||||
pub private_extensions: Option<Vec<String>>,
|
||||
}
|
||||
|
||||
#[serde_as]
|
||||
@@ -142,4 +150,14 @@ mod tests {
|
||||
let file = File::open("tests/cluster_spec.json").unwrap();
|
||||
let _spec: ComputeSpec = serde_json::from_reader(file).unwrap();
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parse_unknown_fields() {
|
||||
// Forward compatibility test
|
||||
let file = File::open("tests/cluster_spec.json").unwrap();
|
||||
let mut json: serde_json::Value = serde_json::from_reader(file).unwrap();
|
||||
let ob = json.as_object_mut().unwrap();
|
||||
ob.insert("unknown_field_123123123".into(), "hello".into());
|
||||
let _spec: ComputeSpec = serde_json::from_value(json).unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -23,6 +23,7 @@ use prometheus::{Registry, Result};
|
||||
pub mod launch_timestamp;
|
||||
mod wrappers;
|
||||
pub use wrappers::{CountedReader, CountedWriter};
|
||||
pub mod metric_vec_duration;
|
||||
|
||||
pub type UIntGauge = GenericGauge<AtomicU64>;
|
||||
pub type UIntGaugeVec = GenericGaugeVec<AtomicU64>;
|
||||
|
||||
23
libs/metrics/src/metric_vec_duration.rs
Normal file
23
libs/metrics/src/metric_vec_duration.rs
Normal file
@@ -0,0 +1,23 @@
|
||||
//! Helpers for observing duration on HistogramVec / CounterVec / GaugeVec / MetricVec<T>.
|
||||
|
||||
use std::{future::Future, time::Instant};
|
||||
|
||||
pub trait DurationResultObserver {
|
||||
fn observe_result<T, E>(&self, res: &Result<T, E>, duration: std::time::Duration);
|
||||
}
|
||||
|
||||
pub async fn observe_async_block_duration_by_result<
|
||||
T,
|
||||
E,
|
||||
F: Future<Output = Result<T, E>>,
|
||||
O: DurationResultObserver,
|
||||
>(
|
||||
observer: &O,
|
||||
block: F,
|
||||
) -> Result<T, E> {
|
||||
let start = Instant::now();
|
||||
let result = block.await;
|
||||
let duration = start.elapsed();
|
||||
observer.observe_result(&result, duration);
|
||||
result
|
||||
}
|
||||
@@ -70,6 +70,14 @@ impl RemotePath {
|
||||
pub fn join(&self, segment: &Path) -> Self {
|
||||
Self(self.0.join(segment))
|
||||
}
|
||||
|
||||
pub fn get_path(&self) -> &PathBuf {
|
||||
&self.0
|
||||
}
|
||||
|
||||
pub fn extension(&self) -> Option<&str> {
|
||||
self.0.extension()?.to_str()
|
||||
}
|
||||
}
|
||||
|
||||
/// Storage (potentially remote) API to manage its state.
|
||||
@@ -86,6 +94,19 @@ pub trait RemoteStorage: Send + Sync + 'static {
|
||||
prefix: Option<&RemotePath>,
|
||||
) -> Result<Vec<RemotePath>, DownloadError>;
|
||||
|
||||
/// Lists all files in directory "recursively"
|
||||
/// (not really recursively, because AWS has a flat namespace)
|
||||
/// Note: This is subtely different than list_prefixes,
|
||||
/// because it is for listing files instead of listing
|
||||
/// names sharing common prefixes.
|
||||
/// For example,
|
||||
/// list_files("foo/bar") = ["foo/bar/cat123.txt",
|
||||
/// "foo/bar/cat567.txt", "foo/bar/dog123.txt", "foo/bar/dog456.txt"]
|
||||
/// whereas,
|
||||
/// list_prefixes("foo/bar/") = ["cat", "dog"]
|
||||
/// See `test_real_s3.rs` for more details.
|
||||
async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>>;
|
||||
|
||||
/// Streams the local file contents into remote into the remote storage entry.
|
||||
async fn upload(
|
||||
&self,
|
||||
@@ -111,6 +132,8 @@ pub trait RemoteStorage: Send + Sync + 'static {
|
||||
) -> Result<Download, DownloadError>;
|
||||
|
||||
async fn delete(&self, path: &RemotePath) -> anyhow::Result<()>;
|
||||
|
||||
async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()>;
|
||||
}
|
||||
|
||||
pub struct Download {
|
||||
@@ -161,6 +184,20 @@ pub enum GenericRemoteStorage {
|
||||
}
|
||||
|
||||
impl GenericRemoteStorage {
|
||||
// A function for listing all the files in a "directory"
|
||||
// Example:
|
||||
// list_files("foo/bar") = ["foo/bar/a.txt", "foo/bar/b.txt"]
|
||||
pub async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
|
||||
match self {
|
||||
Self::LocalFs(s) => s.list_files(folder).await,
|
||||
Self::AwsS3(s) => s.list_files(folder).await,
|
||||
Self::Unreliable(s) => s.list_files(folder).await,
|
||||
}
|
||||
}
|
||||
|
||||
// lists common *prefixes*, if any of files
|
||||
// Example:
|
||||
// list_prefixes("foo123","foo567","bar123","bar432") = ["foo", "bar"]
|
||||
pub async fn list_prefixes(
|
||||
&self,
|
||||
prefix: Option<&RemotePath>,
|
||||
@@ -223,6 +260,14 @@ impl GenericRemoteStorage {
|
||||
Self::Unreliable(s) => s.delete(path).await,
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> {
|
||||
match self {
|
||||
Self::LocalFs(s) => s.delete_objects(paths).await,
|
||||
Self::AwsS3(s) => s.delete_objects(paths).await,
|
||||
Self::Unreliable(s) => s.delete_objects(paths).await,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl GenericRemoteStorage {
|
||||
|
||||
@@ -48,6 +48,14 @@ impl LocalFs {
|
||||
Ok(Self { storage_root })
|
||||
}
|
||||
|
||||
// mirrors S3Bucket::s3_object_to_relative_path
|
||||
fn local_file_to_relative_path(&self, key: PathBuf) -> RemotePath {
|
||||
let relative_path = key
|
||||
.strip_prefix(&self.storage_root)
|
||||
.expect("relative path must contain storage_root as prefix");
|
||||
RemotePath(relative_path.into())
|
||||
}
|
||||
|
||||
async fn read_storage_metadata(
|
||||
&self,
|
||||
file_path: &Path,
|
||||
@@ -132,6 +140,34 @@ impl RemoteStorage for LocalFs {
|
||||
Ok(prefixes)
|
||||
}
|
||||
|
||||
// recursively lists all files in a directory,
|
||||
// mirroring the `list_files` for `s3_bucket`
|
||||
async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
|
||||
let full_path = match folder {
|
||||
Some(folder) => folder.with_base(&self.storage_root),
|
||||
None => self.storage_root.clone(),
|
||||
};
|
||||
let mut files = vec![];
|
||||
let mut directory_queue = vec![full_path.clone()];
|
||||
|
||||
while !directory_queue.is_empty() {
|
||||
let cur_folder = directory_queue
|
||||
.pop()
|
||||
.expect("queue cannot be empty: we just checked");
|
||||
let mut entries = fs::read_dir(cur_folder.clone()).await?;
|
||||
while let Some(entry) = entries.next_entry().await? {
|
||||
let file_name: PathBuf = entry.file_name().into();
|
||||
let full_file_name = cur_folder.clone().join(&file_name);
|
||||
let file_remote_path = self.local_file_to_relative_path(full_file_name.clone());
|
||||
files.push(file_remote_path.clone());
|
||||
if full_file_name.is_dir() {
|
||||
directory_queue.push(full_file_name);
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(files)
|
||||
}
|
||||
|
||||
async fn upload(
|
||||
&self,
|
||||
data: impl io::AsyncRead + Unpin + Send + Sync + 'static,
|
||||
@@ -320,6 +356,13 @@ impl RemoteStorage for LocalFs {
|
||||
.await
|
||||
.map_err(|e| anyhow::anyhow!(e))?)
|
||||
}
|
||||
|
||||
async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> {
|
||||
for path in paths {
|
||||
self.delete(path).await?
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
fn storage_metadata_path(original_path: &Path) -> PathBuf {
|
||||
|
||||
@@ -17,6 +17,7 @@ use aws_sdk_s3::{
|
||||
error::SdkError,
|
||||
operation::get_object::GetObjectError,
|
||||
primitives::ByteStream,
|
||||
types::{Delete, ObjectIdentifier},
|
||||
Client,
|
||||
};
|
||||
use aws_smithy_http::body::SdkBody;
|
||||
@@ -33,6 +34,8 @@ use crate::{
|
||||
Download, DownloadError, RemotePath, RemoteStorage, S3Config, REMOTE_STORAGE_PREFIX_SEPARATOR,
|
||||
};
|
||||
|
||||
const MAX_DELETE_OBJECTS_REQUEST_SIZE: usize = 1000;
|
||||
|
||||
pub(super) mod metrics {
|
||||
use metrics::{register_int_counter_vec, IntCounterVec};
|
||||
use once_cell::sync::Lazy;
|
||||
@@ -81,12 +84,24 @@ pub(super) mod metrics {
|
||||
.inc();
|
||||
}
|
||||
|
||||
pub fn inc_delete_objects(count: u64) {
|
||||
S3_REQUESTS_COUNT
|
||||
.with_label_values(&["delete_object"])
|
||||
.inc_by(count);
|
||||
}
|
||||
|
||||
pub fn inc_delete_object_fail() {
|
||||
S3_REQUESTS_FAIL_COUNT
|
||||
.with_label_values(&["delete_object"])
|
||||
.inc();
|
||||
}
|
||||
|
||||
pub fn inc_delete_objects_fail(count: u64) {
|
||||
S3_REQUESTS_FAIL_COUNT
|
||||
.with_label_values(&["delete_object"])
|
||||
.inc_by(count);
|
||||
}
|
||||
|
||||
pub fn inc_list_objects() {
|
||||
S3_REQUESTS_COUNT.with_label_values(&["list_objects"]).inc();
|
||||
}
|
||||
@@ -332,6 +347,58 @@ impl RemoteStorage for S3Bucket {
|
||||
Ok(document_keys)
|
||||
}
|
||||
|
||||
/// See the doc for `RemoteStorage::list_files`
|
||||
async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
|
||||
let mut folder_name = folder
|
||||
.map(|p| self.relative_path_to_s3_object(p))
|
||||
.or_else(|| self.prefix_in_bucket.clone());
|
||||
|
||||
// remove leading "/" if one exists
|
||||
if let Some(folder_name_slash) = folder_name.clone() {
|
||||
if folder_name_slash.starts_with(REMOTE_STORAGE_PREFIX_SEPARATOR) {
|
||||
folder_name = Some(folder_name_slash[1..].to_string());
|
||||
}
|
||||
}
|
||||
|
||||
// AWS may need to break the response into several parts
|
||||
let mut continuation_token = None;
|
||||
let mut all_files = vec![];
|
||||
loop {
|
||||
let _guard = self
|
||||
.concurrency_limiter
|
||||
.acquire()
|
||||
.await
|
||||
.context("Concurrency limiter semaphore got closed during S3 list_files")?;
|
||||
metrics::inc_list_objects();
|
||||
|
||||
let response = self
|
||||
.client
|
||||
.list_objects_v2()
|
||||
.bucket(self.bucket_name.clone())
|
||||
.set_prefix(folder_name.clone())
|
||||
.set_continuation_token(continuation_token)
|
||||
.set_max_keys(self.max_keys_per_list_response)
|
||||
.send()
|
||||
.await
|
||||
.map_err(|e| {
|
||||
metrics::inc_list_objects_fail();
|
||||
e
|
||||
})
|
||||
.context("Failed to list files in S3 bucket")?;
|
||||
|
||||
for object in response.contents().unwrap_or_default() {
|
||||
let object_path = object.key().expect("response does not contain a key");
|
||||
let remote_path = self.s3_object_to_relative_path(object_path);
|
||||
all_files.push(remote_path);
|
||||
}
|
||||
match response.next_continuation_token {
|
||||
Some(new_token) => continuation_token = Some(new_token),
|
||||
None => break,
|
||||
}
|
||||
}
|
||||
Ok(all_files)
|
||||
}
|
||||
|
||||
async fn upload(
|
||||
&self,
|
||||
from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
|
||||
@@ -396,6 +463,50 @@ impl RemoteStorage for S3Bucket {
|
||||
})
|
||||
.await
|
||||
}
|
||||
async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> {
|
||||
let _guard = self
|
||||
.concurrency_limiter
|
||||
.acquire()
|
||||
.await
|
||||
.context("Concurrency limiter semaphore got closed during S3 delete")?;
|
||||
|
||||
let mut delete_objects = Vec::with_capacity(paths.len());
|
||||
for path in paths {
|
||||
let obj_id = ObjectIdentifier::builder()
|
||||
.set_key(Some(self.relative_path_to_s3_object(path)))
|
||||
.build();
|
||||
delete_objects.push(obj_id);
|
||||
}
|
||||
|
||||
for chunk in delete_objects.chunks(MAX_DELETE_OBJECTS_REQUEST_SIZE) {
|
||||
metrics::inc_delete_objects(chunk.len() as u64);
|
||||
|
||||
let resp = self
|
||||
.client
|
||||
.delete_objects()
|
||||
.bucket(self.bucket_name.clone())
|
||||
.delete(Delete::builder().set_objects(Some(chunk.to_vec())).build())
|
||||
.send()
|
||||
.await;
|
||||
|
||||
match resp {
|
||||
Ok(resp) => {
|
||||
if let Some(errors) = resp.errors {
|
||||
metrics::inc_delete_objects_fail(errors.len() as u64);
|
||||
return Err(anyhow::format_err!(
|
||||
"Failed to delete {} objects",
|
||||
errors.len()
|
||||
));
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
metrics::inc_delete_objects_fail(chunk.len() as u64);
|
||||
return Err(e.into());
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
|
||||
let _guard = self
|
||||
|
||||
@@ -24,6 +24,7 @@ enum RemoteOp {
|
||||
Upload(RemotePath),
|
||||
Download(RemotePath),
|
||||
Delete(RemotePath),
|
||||
DeleteObjects(Vec<RemotePath>),
|
||||
}
|
||||
|
||||
impl UnreliableWrapper {
|
||||
@@ -82,6 +83,11 @@ impl RemoteStorage for UnreliableWrapper {
|
||||
self.inner.list_prefixes(prefix).await
|
||||
}
|
||||
|
||||
async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result<Vec<RemotePath>> {
|
||||
self.attempt(RemoteOp::ListPrefixes(folder.cloned()))?;
|
||||
self.inner.list_files(folder).await
|
||||
}
|
||||
|
||||
async fn upload(
|
||||
&self,
|
||||
data: impl tokio::io::AsyncRead + Unpin + Send + Sync + 'static,
|
||||
@@ -119,4 +125,21 @@ impl RemoteStorage for UnreliableWrapper {
|
||||
self.attempt(RemoteOp::Delete(path.clone()))?;
|
||||
self.inner.delete(path).await
|
||||
}
|
||||
|
||||
async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> {
|
||||
self.attempt(RemoteOp::DeleteObjects(paths.to_vec()))?;
|
||||
let mut error_counter = 0;
|
||||
for path in paths {
|
||||
if (self.delete(path).await).is_err() {
|
||||
error_counter += 1;
|
||||
}
|
||||
}
|
||||
if error_counter > 0 {
|
||||
return Err(anyhow::anyhow!(
|
||||
"failed to delete {} objects",
|
||||
error_counter
|
||||
));
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -88,6 +88,58 @@ async fn s3_pagination_should_work(ctx: &mut MaybeEnabledS3WithTestBlobs) -> any
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Tests that S3 client can list all files in a folder, even if the response comes paginated and requirees multiple S3 queries.
|
||||
/// Uses real S3 and requires [`ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME`] and related S3 cred env vars specified. Test will skip real code and pass if env vars not set.
|
||||
/// See `s3_pagination_should_work` for more information.
|
||||
///
|
||||
/// First, create a set of S3 objects with keys `random_prefix/folder{j}/blob_{i}.txt` in [`upload_s3_data`]
|
||||
/// Then performs the following queries:
|
||||
/// 1. `list_files(None)`. This should return all files `random_prefix/folder{j}/blob_{i}.txt`
|
||||
/// 2. `list_files("folder1")`. This should return all files `random_prefix/folder1/blob_{i}.txt`
|
||||
#[test_context(MaybeEnabledS3WithSimpleTestBlobs)]
|
||||
#[tokio::test]
|
||||
async fn s3_list_files_works(ctx: &mut MaybeEnabledS3WithSimpleTestBlobs) -> anyhow::Result<()> {
|
||||
let ctx = match ctx {
|
||||
MaybeEnabledS3WithSimpleTestBlobs::Enabled(ctx) => ctx,
|
||||
MaybeEnabledS3WithSimpleTestBlobs::Disabled => return Ok(()),
|
||||
MaybeEnabledS3WithSimpleTestBlobs::UploadsFailed(e, _) => {
|
||||
anyhow::bail!("S3 init failed: {e:?}")
|
||||
}
|
||||
};
|
||||
let test_client = Arc::clone(&ctx.enabled.client);
|
||||
let base_prefix =
|
||||
RemotePath::new(Path::new("folder1")).context("common_prefix construction")?;
|
||||
let root_files = test_client
|
||||
.list_files(None)
|
||||
.await
|
||||
.context("client list root files failure")?
|
||||
.into_iter()
|
||||
.collect::<HashSet<_>>();
|
||||
assert_eq!(
|
||||
root_files,
|
||||
ctx.remote_blobs.clone(),
|
||||
"remote storage list_files on root mismatches with the uploads."
|
||||
);
|
||||
let nested_remote_files = test_client
|
||||
.list_files(Some(&base_prefix))
|
||||
.await
|
||||
.context("client list nested files failure")?
|
||||
.into_iter()
|
||||
.collect::<HashSet<_>>();
|
||||
let trim_remote_blobs: HashSet<_> = ctx
|
||||
.remote_blobs
|
||||
.iter()
|
||||
.map(|x| x.get_path().to_str().expect("must be valid name"))
|
||||
.filter(|x| x.starts_with("folder1"))
|
||||
.map(|x| RemotePath::new(Path::new(x)).expect("must be valid name"))
|
||||
.collect();
|
||||
assert_eq!(
|
||||
nested_remote_files, trim_remote_blobs,
|
||||
"remote storage list_files on subdirrectory mismatches with the uploads."
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test_context(MaybeEnabledS3)]
|
||||
#[tokio::test]
|
||||
async fn s3_delete_non_exising_works(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()> {
|
||||
@@ -107,6 +159,37 @@ async fn s3_delete_non_exising_works(ctx: &mut MaybeEnabledS3) -> anyhow::Result
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test_context(MaybeEnabledS3)]
|
||||
#[tokio::test]
|
||||
async fn s3_delete_objects_works(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()> {
|
||||
let ctx = match ctx {
|
||||
MaybeEnabledS3::Enabled(ctx) => ctx,
|
||||
MaybeEnabledS3::Disabled => return Ok(()),
|
||||
};
|
||||
|
||||
let path1 = RemotePath::new(&PathBuf::from(format!("{}/path1", ctx.base_prefix,)))
|
||||
.with_context(|| "RemotePath conversion")?;
|
||||
|
||||
let path2 = RemotePath::new(&PathBuf::from(format!("{}/path2", ctx.base_prefix,)))
|
||||
.with_context(|| "RemotePath conversion")?;
|
||||
|
||||
let data1 = "remote blob data1".as_bytes();
|
||||
let data1_len = data1.len();
|
||||
let data2 = "remote blob data2".as_bytes();
|
||||
let data2_len = data2.len();
|
||||
ctx.client
|
||||
.upload(std::io::Cursor::new(data1), data1_len, &path1, None)
|
||||
.await?;
|
||||
|
||||
ctx.client
|
||||
.upload(std::io::Cursor::new(data2), data2_len, &path2, None)
|
||||
.await?;
|
||||
|
||||
ctx.client.delete_objects(&[path1, path2]).await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn ensure_logging_ready() {
|
||||
LOGGING_DONE.get_or_init(|| {
|
||||
utils::logging::init(
|
||||
@@ -217,6 +300,66 @@ impl AsyncTestContext for MaybeEnabledS3WithTestBlobs {
|
||||
}
|
||||
}
|
||||
|
||||
// NOTE: the setups for the list_prefixes test and the list_files test are very similar
|
||||
// However, they are not idential. The list_prefixes function is concerned with listing prefixes,
|
||||
// whereas the list_files function is concerned with listing files.
|
||||
// See `RemoteStorage::list_files` documentation for more details
|
||||
enum MaybeEnabledS3WithSimpleTestBlobs {
|
||||
Enabled(S3WithSimpleTestBlobs),
|
||||
Disabled,
|
||||
UploadsFailed(anyhow::Error, S3WithSimpleTestBlobs),
|
||||
}
|
||||
struct S3WithSimpleTestBlobs {
|
||||
enabled: EnabledS3,
|
||||
remote_blobs: HashSet<RemotePath>,
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl AsyncTestContext for MaybeEnabledS3WithSimpleTestBlobs {
|
||||
async fn setup() -> Self {
|
||||
ensure_logging_ready();
|
||||
if env::var(ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME).is_err() {
|
||||
info!(
|
||||
"`{}` env variable is not set, skipping the test",
|
||||
ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME
|
||||
);
|
||||
return Self::Disabled;
|
||||
}
|
||||
|
||||
let max_keys_in_list_response = 10;
|
||||
let upload_tasks_count = 1 + (2 * usize::try_from(max_keys_in_list_response).unwrap());
|
||||
|
||||
let enabled = EnabledS3::setup(Some(max_keys_in_list_response)).await;
|
||||
|
||||
match upload_simple_s3_data(&enabled.client, upload_tasks_count).await {
|
||||
ControlFlow::Continue(uploads) => {
|
||||
info!("Remote objects created successfully");
|
||||
|
||||
Self::Enabled(S3WithSimpleTestBlobs {
|
||||
enabled,
|
||||
remote_blobs: uploads,
|
||||
})
|
||||
}
|
||||
ControlFlow::Break(uploads) => Self::UploadsFailed(
|
||||
anyhow::anyhow!("One or multiple blobs failed to upload to S3"),
|
||||
S3WithSimpleTestBlobs {
|
||||
enabled,
|
||||
remote_blobs: uploads,
|
||||
},
|
||||
),
|
||||
}
|
||||
}
|
||||
|
||||
async fn teardown(self) {
|
||||
match self {
|
||||
Self::Disabled => {}
|
||||
Self::Enabled(ctx) | Self::UploadsFailed(_, ctx) => {
|
||||
cleanup(&ctx.enabled.client, ctx.remote_blobs).await;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn create_s3_client(
|
||||
max_keys_per_list_response: Option<i32>,
|
||||
) -> anyhow::Result<Arc<GenericRemoteStorage>> {
|
||||
@@ -227,7 +370,7 @@ fn create_s3_client(
|
||||
let random_prefix_part = std::time::SystemTime::now()
|
||||
.duration_since(UNIX_EPOCH)
|
||||
.context("random s3 test prefix part calculation")?
|
||||
.as_millis();
|
||||
.as_nanos();
|
||||
let remote_storage_config = RemoteStorageConfig {
|
||||
max_concurrent_syncs: NonZeroUsize::new(100).unwrap(),
|
||||
max_sync_errors: NonZeroU32::new(5).unwrap(),
|
||||
@@ -333,3 +476,52 @@ async fn cleanup(client: &Arc<GenericRemoteStorage>, objects_to_delete: HashSet<
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Uploads files `folder{j}/blob{i}.txt`. See test description for more details.
|
||||
async fn upload_simple_s3_data(
|
||||
client: &Arc<GenericRemoteStorage>,
|
||||
upload_tasks_count: usize,
|
||||
) -> ControlFlow<HashSet<RemotePath>, HashSet<RemotePath>> {
|
||||
info!("Creating {upload_tasks_count} S3 files");
|
||||
let mut upload_tasks = JoinSet::new();
|
||||
for i in 1..upload_tasks_count + 1 {
|
||||
let task_client = Arc::clone(client);
|
||||
upload_tasks.spawn(async move {
|
||||
let blob_path = PathBuf::from(format!("folder{}/blob_{}.txt", i / 7, i));
|
||||
let blob_path = RemotePath::new(&blob_path)
|
||||
.with_context(|| format!("{blob_path:?} to RemotePath conversion"))?;
|
||||
debug!("Creating remote item {i} at path {blob_path:?}");
|
||||
|
||||
let data = format!("remote blob data {i}").into_bytes();
|
||||
let data_len = data.len();
|
||||
task_client
|
||||
.upload(std::io::Cursor::new(data), data_len, &blob_path, None)
|
||||
.await?;
|
||||
|
||||
Ok::<_, anyhow::Error>(blob_path)
|
||||
});
|
||||
}
|
||||
|
||||
let mut upload_tasks_failed = false;
|
||||
let mut uploaded_blobs = HashSet::with_capacity(upload_tasks_count);
|
||||
while let Some(task_run_result) = upload_tasks.join_next().await {
|
||||
match task_run_result
|
||||
.context("task join failed")
|
||||
.and_then(|task_result| task_result.context("upload task failed"))
|
||||
{
|
||||
Ok(upload_path) => {
|
||||
uploaded_blobs.insert(upload_path);
|
||||
}
|
||||
Err(e) => {
|
||||
error!("Upload task failed: {e:?}");
|
||||
upload_tasks_failed = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if upload_tasks_failed {
|
||||
ControlFlow::Break(uploaded_blobs)
|
||||
} else {
|
||||
ControlFlow::Continue(uploaded_blobs)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,19 +1,18 @@
|
||||
use crate::auth::{Claims, JwtAuth};
|
||||
use crate::http::error::{api_error_handler, route_error_handler, ApiError};
|
||||
use anyhow::{anyhow, Context};
|
||||
use anyhow::Context;
|
||||
use hyper::header::{HeaderName, AUTHORIZATION};
|
||||
use hyper::http::HeaderValue;
|
||||
use hyper::Method;
|
||||
use hyper::{header::CONTENT_TYPE, Body, Request, Response, Server};
|
||||
use hyper::{header::CONTENT_TYPE, Body, Request, Response};
|
||||
use metrics::{register_int_counter, Encoder, IntCounter, TextEncoder};
|
||||
use once_cell::sync::Lazy;
|
||||
use routerify::ext::RequestExt;
|
||||
use routerify::{Middleware, RequestInfo, Router, RouterBuilder, RouterService};
|
||||
use routerify::{Middleware, RequestInfo, Router, RouterBuilder};
|
||||
use tokio::task::JoinError;
|
||||
use tracing::{self, debug, info, info_span, warn, Instrument};
|
||||
|
||||
use std::future::Future;
|
||||
use std::net::TcpListener;
|
||||
use std::str::FromStr;
|
||||
|
||||
static SERVE_METRICS_COUNT: Lazy<IntCounter> = Lazy::new(|| {
|
||||
@@ -348,40 +347,6 @@ pub fn check_permission_with(
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Start listening for HTTP requests on given socket.
|
||||
///
|
||||
/// 'shutdown_future' can be used to stop. If the Future becomes
|
||||
/// ready, we stop listening for new requests, and the function returns.
|
||||
///
|
||||
pub fn serve_thread_main<S>(
|
||||
router_builder: RouterBuilder<hyper::Body, ApiError>,
|
||||
listener: TcpListener,
|
||||
shutdown_future: S,
|
||||
) -> anyhow::Result<()>
|
||||
where
|
||||
S: Future<Output = ()> + Send + Sync,
|
||||
{
|
||||
info!("Starting an HTTP endpoint at {}", listener.local_addr()?);
|
||||
|
||||
// Create a Service from the router above to handle incoming requests.
|
||||
let service = RouterService::new(router_builder.build().map_err(|err| anyhow!(err))?).unwrap();
|
||||
|
||||
// Enter a single-threaded tokio runtime bound to the current thread
|
||||
let runtime = tokio::runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
.build()?;
|
||||
|
||||
let _guard = runtime.enter();
|
||||
|
||||
let server = Server::from_tcp(listener)?
|
||||
.serve(service)
|
||||
.with_graceful_shutdown(shutdown_future);
|
||||
|
||||
runtime.block_on(server)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
use hyper::{header, Body, Response, StatusCode};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::error::Error as StdError;
|
||||
use thiserror::Error;
|
||||
use tracing::error;
|
||||
|
||||
@@ -15,7 +16,7 @@ pub enum ApiError {
|
||||
Unauthorized(String),
|
||||
|
||||
#[error("NotFound: {0}")]
|
||||
NotFound(anyhow::Error),
|
||||
NotFound(Box<dyn StdError + Send + Sync + 'static>),
|
||||
|
||||
#[error("Conflict: {0}")]
|
||||
Conflict(String),
|
||||
|
||||
@@ -495,50 +495,50 @@ fn start_pageserver(
|
||||
Ok(())
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
if let Some(metric_collection_endpoint) = &conf.metric_collection_endpoint {
|
||||
let background_jobs_barrier = background_jobs_barrier;
|
||||
let metrics_ctx = RequestContext::todo_child(
|
||||
TaskKind::MetricsCollection,
|
||||
// This task itself shouldn't download anything.
|
||||
// The actual size calculation does need downloads, and
|
||||
// creates a child context with the right DownloadBehavior.
|
||||
DownloadBehavior::Error,
|
||||
);
|
||||
task_mgr::spawn(
|
||||
MGMT_REQUEST_RUNTIME.handle(),
|
||||
TaskKind::MetricsCollection,
|
||||
None,
|
||||
None,
|
||||
"consumption metrics collection",
|
||||
true,
|
||||
async move {
|
||||
// first wait until background jobs are cleared to launch.
|
||||
//
|
||||
// this is because we only process active tenants and timelines, and the
|
||||
// Timeline::get_current_logical_size will spawn the logical size calculation,
|
||||
// which will not be rate-limited.
|
||||
let cancel = task_mgr::shutdown_token();
|
||||
if let Some(metric_collection_endpoint) = &conf.metric_collection_endpoint {
|
||||
let background_jobs_barrier = background_jobs_barrier;
|
||||
let metrics_ctx = RequestContext::todo_child(
|
||||
TaskKind::MetricsCollection,
|
||||
// This task itself shouldn't download anything.
|
||||
// The actual size calculation does need downloads, and
|
||||
// creates a child context with the right DownloadBehavior.
|
||||
DownloadBehavior::Error,
|
||||
);
|
||||
task_mgr::spawn(
|
||||
crate::BACKGROUND_RUNTIME.handle(),
|
||||
TaskKind::MetricsCollection,
|
||||
None,
|
||||
None,
|
||||
"consumption metrics collection",
|
||||
true,
|
||||
async move {
|
||||
// first wait until background jobs are cleared to launch.
|
||||
//
|
||||
// this is because we only process active tenants and timelines, and the
|
||||
// Timeline::get_current_logical_size will spawn the logical size calculation,
|
||||
// which will not be rate-limited.
|
||||
let cancel = task_mgr::shutdown_token();
|
||||
|
||||
tokio::select! {
|
||||
_ = cancel.cancelled() => { return Ok(()); },
|
||||
_ = background_jobs_barrier.wait() => {}
|
||||
};
|
||||
tokio::select! {
|
||||
_ = cancel.cancelled() => { return Ok(()); },
|
||||
_ = background_jobs_barrier.wait() => {}
|
||||
};
|
||||
|
||||
pageserver::consumption_metrics::collect_metrics(
|
||||
metric_collection_endpoint,
|
||||
conf.metric_collection_interval,
|
||||
conf.cached_metric_collection_interval,
|
||||
conf.synthetic_size_calculation_interval,
|
||||
conf.id,
|
||||
metrics_ctx,
|
||||
)
|
||||
.instrument(info_span!("metrics_collection"))
|
||||
.await?;
|
||||
Ok(())
|
||||
},
|
||||
);
|
||||
}
|
||||
pageserver::consumption_metrics::collect_metrics(
|
||||
metric_collection_endpoint,
|
||||
conf.metric_collection_interval,
|
||||
conf.cached_metric_collection_interval,
|
||||
conf.synthetic_size_calculation_interval,
|
||||
conf.id,
|
||||
metrics_ctx,
|
||||
)
|
||||
.instrument(info_span!("metrics_collection"))
|
||||
.await?;
|
||||
Ok(())
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
// Spawn a task to listen for libpq connections. It will spawn further tasks
|
||||
|
||||
@@ -96,12 +96,12 @@ pub mod defaults {
|
||||
|
||||
#background_task_maximum_delay = '{DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY}'
|
||||
|
||||
# [tenant_config]
|
||||
[tenant_config]
|
||||
#checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
|
||||
#checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
|
||||
#compaction_target_size = {DEFAULT_COMPACTION_TARGET_SIZE} # in bytes
|
||||
#compaction_period = '{DEFAULT_COMPACTION_PERIOD}'
|
||||
#compaction_threshold = '{DEFAULT_COMPACTION_THRESHOLD}'
|
||||
#compaction_threshold = {DEFAULT_COMPACTION_THRESHOLD}
|
||||
|
||||
#gc_period = '{DEFAULT_GC_PERIOD}'
|
||||
#gc_horizon = {DEFAULT_GC_HORIZON}
|
||||
@@ -111,7 +111,8 @@ pub mod defaults {
|
||||
#min_resident_size_override = .. # in bytes
|
||||
#evictions_low_residence_duration_metric_threshold = '{DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD}'
|
||||
#gc_feedback = false
|
||||
# [remote_storage]
|
||||
|
||||
[remote_storage]
|
||||
|
||||
"###
|
||||
);
|
||||
|
||||
@@ -516,7 +516,7 @@ async fn collect_eviction_candidates(
|
||||
if !tl.is_active() {
|
||||
continue;
|
||||
}
|
||||
let info = tl.get_local_layers_for_disk_usage_eviction();
|
||||
let info = tl.get_local_layers_for_disk_usage_eviction().await;
|
||||
debug!(tenant_id=%tl.tenant_id, timeline_id=%tl.timeline_id, "timeline resident layers count: {}", info.resident_layers.len());
|
||||
tenant_candidates.extend(
|
||||
info.resident_layers
|
||||
|
||||
@@ -142,7 +142,7 @@ impl From<TenantMapInsertError> for ApiError {
|
||||
impl From<TenantStateError> for ApiError {
|
||||
fn from(tse: TenantStateError) -> ApiError {
|
||||
match tse {
|
||||
TenantStateError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid)),
|
||||
TenantStateError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid).into()),
|
||||
_ => ApiError::InternalServerError(anyhow::Error::new(tse)),
|
||||
}
|
||||
}
|
||||
@@ -151,7 +151,7 @@ impl From<TenantStateError> for ApiError {
|
||||
impl From<GetTenantError> for ApiError {
|
||||
fn from(tse: GetTenantError) -> ApiError {
|
||||
match tse {
|
||||
GetTenantError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid)),
|
||||
GetTenantError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid).into()),
|
||||
e @ GetTenantError::NotActive(_) => {
|
||||
// Why is this not `ApiError::NotFound`?
|
||||
// Because we must be careful to never return 404 for a tenant if it does
|
||||
@@ -169,7 +169,7 @@ impl From<SetNewTenantConfigError> for ApiError {
|
||||
fn from(e: SetNewTenantConfigError) -> ApiError {
|
||||
match e {
|
||||
SetNewTenantConfigError::GetTenant(tid) => {
|
||||
ApiError::NotFound(anyhow!("tenant {}", tid))
|
||||
ApiError::NotFound(anyhow!("tenant {}", tid).into())
|
||||
}
|
||||
e @ SetNewTenantConfigError::Persist(_) => {
|
||||
ApiError::InternalServerError(anyhow::Error::new(e))
|
||||
@@ -182,7 +182,7 @@ impl From<crate::tenant::DeleteTimelineError> for ApiError {
|
||||
fn from(value: crate::tenant::DeleteTimelineError) -> Self {
|
||||
use crate::tenant::DeleteTimelineError::*;
|
||||
match value {
|
||||
NotFound => ApiError::NotFound(anyhow::anyhow!("timeline not found")),
|
||||
NotFound => ApiError::NotFound(anyhow::anyhow!("timeline not found").into()),
|
||||
HasChildren(children) => ApiError::PreconditionFailed(
|
||||
format!("Cannot delete timeline which has child timelines: {children:?}")
|
||||
.into_boxed_str(),
|
||||
@@ -215,7 +215,7 @@ async fn build_timeline_info(
|
||||
) -> anyhow::Result<TimelineInfo> {
|
||||
crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
|
||||
let mut info = build_timeline_info_common(timeline, ctx)?;
|
||||
let mut info = build_timeline_info_common(timeline, ctx).await?;
|
||||
if include_non_incremental_logical_size {
|
||||
// XXX we should be using spawn_ondemand_logical_size_calculation here.
|
||||
// Otherwise, if someone deletes the timeline / detaches the tenant while
|
||||
@@ -233,7 +233,7 @@ async fn build_timeline_info(
|
||||
Ok(info)
|
||||
}
|
||||
|
||||
fn build_timeline_info_common(
|
||||
async fn build_timeline_info_common(
|
||||
timeline: &Arc<Timeline>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<TimelineInfo> {
|
||||
@@ -264,7 +264,7 @@ fn build_timeline_info_common(
|
||||
None
|
||||
}
|
||||
};
|
||||
let current_physical_size = Some(timeline.layer_size_sum());
|
||||
let current_physical_size = Some(timeline.layer_size_sum().await);
|
||||
let state = timeline.current_state();
|
||||
let remote_consistent_lsn = timeline.get_remote_consistent_lsn().unwrap_or(Lsn(0));
|
||||
|
||||
@@ -330,6 +330,7 @@ async fn timeline_create_handler(
|
||||
Ok(Some(new_timeline)) => {
|
||||
// Created. Construct a TimelineInfo for it.
|
||||
let timeline_info = build_timeline_info_common(&new_timeline, &ctx)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
json_response(StatusCode::CREATED, timeline_info)
|
||||
}
|
||||
@@ -396,7 +397,7 @@ async fn timeline_detail_handler(
|
||||
|
||||
let timeline = tenant
|
||||
.get_timeline(timeline_id, false)
|
||||
.map_err(ApiError::NotFound)?;
|
||||
.map_err(|e| ApiError::NotFound(e.into()))?;
|
||||
|
||||
let timeline_info = build_timeline_info(
|
||||
&timeline,
|
||||
@@ -591,7 +592,7 @@ async fn tenant_status(
|
||||
// Calculate total physical size of all timelines
|
||||
let mut current_physical_size = 0;
|
||||
for timeline in tenant.list_timelines().iter() {
|
||||
current_physical_size += timeline.layer_size_sum();
|
||||
current_physical_size += timeline.layer_size_sum().await;
|
||||
}
|
||||
|
||||
let state = tenant.current_state();
|
||||
@@ -701,7 +702,7 @@ async fn layer_map_info_handler(
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
|
||||
let layer_map_info = timeline.layer_map_info(reset);
|
||||
let layer_map_info = timeline.layer_map_info(reset).await;
|
||||
|
||||
json_response(StatusCode::OK, layer_map_info)
|
||||
}
|
||||
@@ -1060,7 +1061,7 @@ async fn timeline_download_remote_layers_handler_get(
|
||||
let info = timeline
|
||||
.get_download_all_remote_layers_task_info()
|
||||
.context("task never started since last pageserver process start")
|
||||
.map_err(ApiError::NotFound)?;
|
||||
.map_err(|e| ApiError::NotFound(e.into()))?;
|
||||
json_response(StatusCode::OK, info)
|
||||
}
|
||||
|
||||
@@ -1071,7 +1072,7 @@ async fn active_timeline_of_active_tenant(
|
||||
let tenant = mgr::get_tenant(tenant_id, true).await?;
|
||||
tenant
|
||||
.get_timeline(timeline_id, true)
|
||||
.map_err(ApiError::NotFound)
|
||||
.map_err(|e| ApiError::NotFound(e.into()))
|
||||
}
|
||||
|
||||
async fn always_panic_handler(
|
||||
@@ -1127,8 +1128,6 @@ async fn disk_usage_eviction_run(
|
||||
freed_bytes: 0,
|
||||
};
|
||||
|
||||
use crate::task_mgr::MGMT_REQUEST_RUNTIME;
|
||||
|
||||
let (tx, rx) = tokio::sync::oneshot::channel();
|
||||
|
||||
let state = get_state(&r);
|
||||
@@ -1146,7 +1145,7 @@ async fn disk_usage_eviction_run(
|
||||
let _g = cancel.drop_guard();
|
||||
|
||||
crate::task_mgr::spawn(
|
||||
MGMT_REQUEST_RUNTIME.handle(),
|
||||
crate::task_mgr::BACKGROUND_RUNTIME.handle(),
|
||||
TaskKind::DiskUsageEviction,
|
||||
None,
|
||||
None,
|
||||
|
||||
@@ -75,12 +75,12 @@ pub async fn import_timeline_from_postgres_datadir(
|
||||
{
|
||||
pg_control = Some(control_file);
|
||||
}
|
||||
modification.flush()?;
|
||||
modification.flush().await?;
|
||||
}
|
||||
}
|
||||
|
||||
// We're done importing all the data files.
|
||||
modification.commit()?;
|
||||
modification.commit().await?;
|
||||
|
||||
// We expect the Postgres server to be shut down cleanly.
|
||||
let pg_control = pg_control.context("pg_control file not found")?;
|
||||
@@ -148,17 +148,17 @@ async fn import_rel(
|
||||
// because there is no guarantee about the order in which we are processing segments.
|
||||
// ignore "relation already exists" error
|
||||
//
|
||||
// FIXME: use proper error type for this, instead of parsing the error message.
|
||||
// Or better yet, keep track of which relations we've already created
|
||||
// FIXME: Keep track of which relations we've already created?
|
||||
// https://github.com/neondatabase/neon/issues/3309
|
||||
if let Err(e) = modification
|
||||
.put_rel_creation(rel, nblocks as u32, ctx)
|
||||
.await
|
||||
{
|
||||
if e.to_string().contains("already exists") {
|
||||
debug!("relation {} already exists. we must be extending it", rel);
|
||||
} else {
|
||||
return Err(e);
|
||||
match e {
|
||||
RelationError::AlreadyExists => {
|
||||
debug!("Relation {} already exist. We must be extending it.", rel)
|
||||
}
|
||||
_ => return Err(e.into()),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -359,7 +359,7 @@ pub async fn import_basebackup_from_tar(
|
||||
// We found the pg_control file.
|
||||
pg_control = Some(res);
|
||||
}
|
||||
modification.flush()?;
|
||||
modification.flush().await?;
|
||||
}
|
||||
tokio_tar::EntryType::Directory => {
|
||||
debug!("directory {:?}", file_path);
|
||||
@@ -377,7 +377,7 @@ pub async fn import_basebackup_from_tar(
|
||||
// sanity check: ensure that pg_control is loaded
|
||||
let _pg_control = pg_control.context("pg_control file not found")?;
|
||||
|
||||
modification.commit()?;
|
||||
modification.commit().await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -594,7 +594,7 @@ async fn import_file(
|
||||
// zenith.signal is not necessarily the last file, that we handle
|
||||
// but it is ok to call `finish_write()`, because final `modification.commit()`
|
||||
// will update lsn once more to the final one.
|
||||
let writer = modification.tline.writer();
|
||||
let writer = modification.tline.writer().await;
|
||||
writer.finish_write(prev_lsn);
|
||||
|
||||
debug!("imported zenith signal {}", prev_lsn);
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use metrics::core::{AtomicU64, GenericCounter};
|
||||
use metrics::metric_vec_duration::DurationResultObserver;
|
||||
use metrics::{
|
||||
register_counter_vec, register_histogram, register_histogram_vec, register_int_counter,
|
||||
register_int_counter_vec, register_int_gauge, register_int_gauge_vec, register_uint_gauge_vec,
|
||||
@@ -95,21 +95,19 @@ static READ_NUM_FS_LAYERS: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
});
|
||||
|
||||
// Metrics collected on operations on the storage repository.
|
||||
static RECONSTRUCT_TIME: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
register_histogram_vec!(
|
||||
pub static RECONSTRUCT_TIME: Lazy<Histogram> = Lazy::new(|| {
|
||||
register_histogram!(
|
||||
"pageserver_getpage_reconstruct_seconds",
|
||||
"Time spent in reconstruct_value",
|
||||
&["tenant_id", "timeline_id"],
|
||||
"Time spent in reconstruct_value (reconstruct a page from deltas)",
|
||||
CRITICAL_OP_BUCKETS.into(),
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
static MATERIALIZED_PAGE_CACHE_HIT_DIRECT: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
pub static MATERIALIZED_PAGE_CACHE_HIT_DIRECT: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"pageserver_materialized_cache_hits_direct_total",
|
||||
"Number of cache hits from materialized page cache without redo",
|
||||
&["tenant_id", "timeline_id"]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
@@ -124,11 +122,10 @@ static GET_RECONSTRUCT_DATA_TIME: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
static MATERIALIZED_PAGE_CACHE_HIT: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
pub static MATERIALIZED_PAGE_CACHE_HIT: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"pageserver_materialized_cache_hits_total",
|
||||
"Number of cache hits from materialized page cache",
|
||||
&["tenant_id", "timeline_id"]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
@@ -428,6 +425,27 @@ pub static SMGR_QUERY_TIME: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub struct BasebackupQueryTime(HistogramVec);
|
||||
pub static BASEBACKUP_QUERY_TIME: Lazy<BasebackupQueryTime> = Lazy::new(|| {
|
||||
BasebackupQueryTime({
|
||||
register_histogram_vec!(
|
||||
"pageserver_basebackup_query_seconds",
|
||||
"Histogram of basebackup queries durations, by result type",
|
||||
&["result"],
|
||||
CRITICAL_OP_BUCKETS.into(),
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
})
|
||||
});
|
||||
|
||||
impl DurationResultObserver for BasebackupQueryTime {
|
||||
fn observe_result<T, E>(&self, res: &Result<T, E>, duration: std::time::Duration) {
|
||||
let label_value = if res.is_ok() { "ok" } else { "error" };
|
||||
let metric = self.0.get_metric_with_label_values(&[label_value]).unwrap();
|
||||
metric.observe(duration.as_secs_f64());
|
||||
}
|
||||
}
|
||||
|
||||
pub static LIVE_CONNECTIONS_COUNT: Lazy<IntGaugeVec> = Lazy::new(|| {
|
||||
register_int_gauge_vec!(
|
||||
"pageserver_live_connections",
|
||||
@@ -752,10 +770,7 @@ impl StorageTimeMetrics {
|
||||
pub struct TimelineMetrics {
|
||||
tenant_id: String,
|
||||
timeline_id: String,
|
||||
pub reconstruct_time_histo: Histogram,
|
||||
pub get_reconstruct_data_time_histo: Histogram,
|
||||
pub materialized_page_cache_hit_counter: GenericCounter<AtomicU64>,
|
||||
pub materialized_page_cache_hit_upon_request_counter: GenericCounter<AtomicU64>,
|
||||
pub flush_time_histo: StorageTimeMetrics,
|
||||
pub compact_time_histo: StorageTimeMetrics,
|
||||
pub create_images_time_histo: StorageTimeMetrics,
|
||||
@@ -783,15 +798,9 @@ impl TimelineMetrics {
|
||||
) -> Self {
|
||||
let tenant_id = tenant_id.to_string();
|
||||
let timeline_id = timeline_id.to_string();
|
||||
let reconstruct_time_histo = RECONSTRUCT_TIME
|
||||
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
|
||||
.unwrap();
|
||||
let get_reconstruct_data_time_histo = GET_RECONSTRUCT_DATA_TIME
|
||||
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
|
||||
.unwrap();
|
||||
let materialized_page_cache_hit_counter = MATERIALIZED_PAGE_CACHE_HIT
|
||||
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
|
||||
.unwrap();
|
||||
let flush_time_histo =
|
||||
StorageTimeMetrics::new(StorageTimeOperation::LayerFlush, &tenant_id, &timeline_id);
|
||||
let compact_time_histo =
|
||||
@@ -833,19 +842,13 @@ impl TimelineMetrics {
|
||||
let read_num_fs_layers = READ_NUM_FS_LAYERS
|
||||
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
|
||||
.unwrap();
|
||||
let materialized_page_cache_hit_upon_request_counter = MATERIALIZED_PAGE_CACHE_HIT_DIRECT
|
||||
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
|
||||
.unwrap();
|
||||
let evictions_with_low_residence_duration =
|
||||
evictions_with_low_residence_duration_builder.build(&tenant_id, &timeline_id);
|
||||
|
||||
TimelineMetrics {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
reconstruct_time_histo,
|
||||
get_reconstruct_data_time_histo,
|
||||
materialized_page_cache_hit_counter,
|
||||
materialized_page_cache_hit_upon_request_counter,
|
||||
flush_time_histo,
|
||||
compact_time_histo,
|
||||
create_images_time_histo,
|
||||
@@ -872,10 +875,7 @@ impl Drop for TimelineMetrics {
|
||||
fn drop(&mut self) {
|
||||
let tenant_id = &self.tenant_id;
|
||||
let timeline_id = &self.timeline_id;
|
||||
let _ = RECONSTRUCT_TIME.remove_label_values(&[tenant_id, timeline_id]);
|
||||
let _ = GET_RECONSTRUCT_DATA_TIME.remove_label_values(&[tenant_id, timeline_id]);
|
||||
let _ = MATERIALIZED_PAGE_CACHE_HIT.remove_label_values(&[tenant_id, timeline_id]);
|
||||
let _ = MATERIALIZED_PAGE_CACHE_HIT_DIRECT.remove_label_values(&[tenant_id, timeline_id]);
|
||||
let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, timeline_id]);
|
||||
let _ = WAIT_LSN_TIME.remove_label_values(&[tenant_id, timeline_id]);
|
||||
let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
|
||||
@@ -1319,4 +1319,8 @@ pub fn preinitialize_metrics() {
|
||||
|
||||
// Same as above for this metric, but, it's a Vec-type metric for which we don't know all the labels.
|
||||
BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT.reset();
|
||||
|
||||
// Python tests need these.
|
||||
MATERIALIZED_PAGE_CACHE_HIT_DIRECT.get();
|
||||
MATERIALIZED_PAGE_CACHE_HIT.get();
|
||||
}
|
||||
|
||||
@@ -390,7 +390,9 @@ impl PageServerHandler {
|
||||
};
|
||||
|
||||
// Check that the timeline exists
|
||||
let timeline = tenant.get_timeline(timeline_id, true)?;
|
||||
let timeline = tenant
|
||||
.get_timeline(timeline_id, true)
|
||||
.map_err(|e| anyhow::anyhow!(e))?;
|
||||
|
||||
// switch client to COPYBOTH
|
||||
pgb.write_message_noflush(&BeMessage::CopyBothResponse)?;
|
||||
@@ -911,10 +913,24 @@ where
|
||||
None
|
||||
};
|
||||
|
||||
// Check that the timeline exists
|
||||
self.handle_basebackup_request(pgb, tenant_id, timeline_id, lsn, None, false, ctx)
|
||||
.await?;
|
||||
pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
metrics::metric_vec_duration::observe_async_block_duration_by_result(
|
||||
&*crate::metrics::BASEBACKUP_QUERY_TIME,
|
||||
async move {
|
||||
self.handle_basebackup_request(
|
||||
pgb,
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
lsn,
|
||||
None,
|
||||
false,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
anyhow::Ok(())
|
||||
},
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
// return pair of prev_lsn and last_lsn
|
||||
else if query_string.starts_with("get_last_record_rlsn ") {
|
||||
@@ -1230,6 +1246,6 @@ async fn get_active_tenant_timeline(
|
||||
.map_err(GetActiveTimelineError::Tenant)?;
|
||||
let timeline = tenant
|
||||
.get_timeline(timeline_id, true)
|
||||
.map_err(GetActiveTimelineError::Timeline)?;
|
||||
.map_err(|e| GetActiveTimelineError::Timeline(anyhow::anyhow!(e)))?;
|
||||
Ok(timeline)
|
||||
}
|
||||
|
||||
@@ -43,6 +43,16 @@ pub enum CalculateLogicalSizeError {
|
||||
Other(#[from] anyhow::Error),
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum RelationError {
|
||||
#[error("Relation Already Exists")]
|
||||
AlreadyExists,
|
||||
#[error("invalid relnode")]
|
||||
InvalidRelnode,
|
||||
#[error(transparent)]
|
||||
Other(#[from] anyhow::Error),
|
||||
}
|
||||
|
||||
///
|
||||
/// This impl provides all the functionality to store PostgreSQL relations, SLRUs,
|
||||
/// and other special kinds of files, in a versioned key-value store. The
|
||||
@@ -101,9 +111,9 @@ impl Timeline {
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Bytes, PageReconstructError> {
|
||||
if tag.relnode == 0 {
|
||||
return Err(PageReconstructError::Other(anyhow::anyhow!(
|
||||
"invalid relnode"
|
||||
)));
|
||||
return Err(PageReconstructError::Other(
|
||||
RelationError::InvalidRelnode.into(),
|
||||
));
|
||||
}
|
||||
|
||||
let nblocks = self.get_rel_size(tag, lsn, latest, ctx).await?;
|
||||
@@ -148,9 +158,9 @@ impl Timeline {
|
||||
ctx: &RequestContext,
|
||||
) -> Result<BlockNumber, PageReconstructError> {
|
||||
if tag.relnode == 0 {
|
||||
return Err(PageReconstructError::Other(anyhow::anyhow!(
|
||||
"invalid relnode"
|
||||
)));
|
||||
return Err(PageReconstructError::Other(
|
||||
RelationError::InvalidRelnode.into(),
|
||||
));
|
||||
}
|
||||
|
||||
if let Some(nblocks) = self.get_cached_rel_size(&tag, lsn) {
|
||||
@@ -193,9 +203,9 @@ impl Timeline {
|
||||
ctx: &RequestContext,
|
||||
) -> Result<bool, PageReconstructError> {
|
||||
if tag.relnode == 0 {
|
||||
return Err(PageReconstructError::Other(anyhow::anyhow!(
|
||||
"invalid relnode"
|
||||
)));
|
||||
return Err(PageReconstructError::Other(
|
||||
RelationError::InvalidRelnode.into(),
|
||||
));
|
||||
}
|
||||
|
||||
// first try to lookup relation in cache
|
||||
@@ -699,6 +709,20 @@ impl<'a> DatadirModification<'a> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub fn init_empty_test_timeline(&mut self) -> anyhow::Result<()> {
|
||||
self.init_empty()?;
|
||||
self.put_control_file(bytes::Bytes::from_static(
|
||||
b"control_file contents do not matter",
|
||||
))
|
||||
.context("put_control_file")?;
|
||||
self.put_checkpoint(bytes::Bytes::from_static(
|
||||
b"checkpoint_file contents do not matter",
|
||||
))
|
||||
.context("put_checkpoint_file")?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Put a new page version that can be constructed from a WAL record
|
||||
///
|
||||
/// NOTE: this will *not* implicitly extend the relation, if the page is beyond the
|
||||
@@ -710,7 +734,7 @@ impl<'a> DatadirModification<'a> {
|
||||
blknum: BlockNumber,
|
||||
rec: NeonWalRecord,
|
||||
) -> anyhow::Result<()> {
|
||||
anyhow::ensure!(rel.relnode != 0, "invalid relnode");
|
||||
anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
|
||||
self.put(rel_block_to_key(rel, blknum), Value::WalRecord(rec));
|
||||
Ok(())
|
||||
}
|
||||
@@ -737,7 +761,7 @@ impl<'a> DatadirModification<'a> {
|
||||
blknum: BlockNumber,
|
||||
img: Bytes,
|
||||
) -> anyhow::Result<()> {
|
||||
anyhow::ensure!(rel.relnode != 0, "invalid relnode");
|
||||
anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
|
||||
self.put(rel_block_to_key(rel, blknum), Value::Image(img));
|
||||
Ok(())
|
||||
}
|
||||
@@ -861,32 +885,38 @@ impl<'a> DatadirModification<'a> {
|
||||
rel: RelTag,
|
||||
nblocks: BlockNumber,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
anyhow::ensure!(rel.relnode != 0, "invalid relnode");
|
||||
) -> Result<(), RelationError> {
|
||||
if rel.relnode == 0 {
|
||||
return Err(RelationError::AlreadyExists);
|
||||
}
|
||||
// It's possible that this is the first rel for this db in this
|
||||
// tablespace. Create the reldir entry for it if so.
|
||||
let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY, ctx).await?)?;
|
||||
let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY, ctx).await.context("read db")?)
|
||||
.context("deserialize db")?;
|
||||
let rel_dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode);
|
||||
let mut rel_dir = if dbdir.dbdirs.get(&(rel.spcnode, rel.dbnode)).is_none() {
|
||||
// Didn't exist. Update dbdir
|
||||
dbdir.dbdirs.insert((rel.spcnode, rel.dbnode), false);
|
||||
let buf = DbDirectory::ser(&dbdir)?;
|
||||
let buf = DbDirectory::ser(&dbdir).context("serialize db")?;
|
||||
self.put(DBDIR_KEY, Value::Image(buf.into()));
|
||||
|
||||
// and create the RelDirectory
|
||||
RelDirectory::default()
|
||||
} else {
|
||||
// reldir already exists, fetch it
|
||||
RelDirectory::des(&self.get(rel_dir_key, ctx).await?)?
|
||||
RelDirectory::des(&self.get(rel_dir_key, ctx).await.context("read db")?)
|
||||
.context("deserialize db")?
|
||||
};
|
||||
|
||||
// Add the new relation to the rel directory entry, and write it back
|
||||
if !rel_dir.rels.insert((rel.relnode, rel.forknum)) {
|
||||
anyhow::bail!("rel {rel} already exists");
|
||||
return Err(RelationError::AlreadyExists);
|
||||
}
|
||||
self.put(
|
||||
rel_dir_key,
|
||||
Value::Image(Bytes::from(RelDirectory::ser(&rel_dir)?)),
|
||||
Value::Image(Bytes::from(
|
||||
RelDirectory::ser(&rel_dir).context("serialize")?,
|
||||
)),
|
||||
);
|
||||
|
||||
// Put size
|
||||
@@ -911,7 +941,7 @@ impl<'a> DatadirModification<'a> {
|
||||
nblocks: BlockNumber,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
anyhow::ensure!(rel.relnode != 0, "invalid relnode");
|
||||
anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
|
||||
let last_lsn = self.tline.get_last_record_lsn();
|
||||
if self.tline.get_rel_exists(rel, last_lsn, true, ctx).await? {
|
||||
let size_key = rel_size_to_key(rel);
|
||||
@@ -942,7 +972,7 @@ impl<'a> DatadirModification<'a> {
|
||||
nblocks: BlockNumber,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
anyhow::ensure!(rel.relnode != 0, "invalid relnode");
|
||||
anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
|
||||
|
||||
// Put size
|
||||
let size_key = rel_size_to_key(rel);
|
||||
@@ -963,7 +993,7 @@ impl<'a> DatadirModification<'a> {
|
||||
|
||||
/// Drop a relation.
|
||||
pub async fn put_rel_drop(&mut self, rel: RelTag, ctx: &RequestContext) -> anyhow::Result<()> {
|
||||
anyhow::ensure!(rel.relnode != 0, "invalid relnode");
|
||||
anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
|
||||
|
||||
// Remove it from the directory entry
|
||||
let dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode);
|
||||
@@ -1108,7 +1138,7 @@ impl<'a> DatadirModification<'a> {
|
||||
/// retains all the metadata, but data pages are flushed. That's again OK
|
||||
/// for bulk import, where you are just loading data pages and won't try to
|
||||
/// modify the same pages twice.
|
||||
pub fn flush(&mut self) -> anyhow::Result<()> {
|
||||
pub async fn flush(&mut self) -> anyhow::Result<()> {
|
||||
// Unless we have accumulated a decent amount of changes, it's not worth it
|
||||
// to scan through the pending_updates list.
|
||||
let pending_nblocks = self.pending_nblocks;
|
||||
@@ -1116,19 +1146,20 @@ impl<'a> DatadirModification<'a> {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let writer = self.tline.writer();
|
||||
let writer = self.tline.writer().await;
|
||||
|
||||
// Flush relation and SLRU data blocks, keep metadata.
|
||||
let mut result: anyhow::Result<()> = Ok(());
|
||||
self.pending_updates.retain(|&key, value| {
|
||||
if result.is_ok() && (is_rel_block_key(key) || is_slru_block_key(key)) {
|
||||
result = writer.put(key, self.lsn, value);
|
||||
false
|
||||
let mut retained_pending_updates = HashMap::new();
|
||||
for (key, value) in self.pending_updates.drain() {
|
||||
if is_rel_block_key(key) || is_slru_block_key(key) {
|
||||
// This bails out on first error without modifying pending_updates.
|
||||
// That's Ok, cf this function's doc comment.
|
||||
writer.put(key, self.lsn, &value).await?;
|
||||
} else {
|
||||
true
|
||||
retained_pending_updates.insert(key, value);
|
||||
}
|
||||
});
|
||||
result?;
|
||||
}
|
||||
self.pending_updates.extend(retained_pending_updates);
|
||||
|
||||
if pending_nblocks != 0 {
|
||||
writer.update_current_logical_size(pending_nblocks * i64::from(BLCKSZ));
|
||||
@@ -1143,17 +1174,17 @@ impl<'a> DatadirModification<'a> {
|
||||
/// underlying timeline.
|
||||
/// All the modifications in this atomic update are stamped by the specified LSN.
|
||||
///
|
||||
pub fn commit(&mut self) -> anyhow::Result<()> {
|
||||
let writer = self.tline.writer();
|
||||
pub async fn commit(&mut self) -> anyhow::Result<()> {
|
||||
let writer = self.tline.writer().await;
|
||||
let lsn = self.lsn;
|
||||
let pending_nblocks = self.pending_nblocks;
|
||||
self.pending_nblocks = 0;
|
||||
|
||||
for (key, value) in self.pending_updates.drain() {
|
||||
writer.put(key, lsn, &value)?;
|
||||
writer.put(key, lsn, &value).await?;
|
||||
}
|
||||
for key_range in self.pending_deletions.drain(..) {
|
||||
writer.delete(key_range, lsn)?;
|
||||
writer.delete(key_range, lsn).await?;
|
||||
}
|
||||
|
||||
writer.finish_write(lsn);
|
||||
@@ -1593,20 +1624,6 @@ fn is_slru_block_key(key: Key) -> bool {
|
||||
&& key.field6 != 0xffffffff // and not SlruSegSize
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub fn create_test_timeline(
|
||||
tenant: &crate::tenant::Tenant,
|
||||
timeline_id: utils::id::TimelineId,
|
||||
pg_version: u32,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<std::sync::Arc<Timeline>> {
|
||||
let tline = tenant.create_test_timeline(timeline_id, Lsn(8), pg_version, ctx)?;
|
||||
let mut m = tline.begin_modification(Lsn(8));
|
||||
m.init_empty()?;
|
||||
m.commit()?;
|
||||
Ok(tline)
|
||||
}
|
||||
|
||||
#[allow(clippy::bool_assert_comparison)]
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -38,8 +38,8 @@ pub mod defaults {
|
||||
pub const DEFAULT_GC_PERIOD: &str = "1 hr";
|
||||
pub const DEFAULT_IMAGE_CREATION_THRESHOLD: usize = 3;
|
||||
pub const DEFAULT_PITR_INTERVAL: &str = "7 days";
|
||||
pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "2 seconds";
|
||||
pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "3 seconds";
|
||||
pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "10 seconds";
|
||||
pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "10 seconds";
|
||||
pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 10 * 1024 * 1024;
|
||||
pub const DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD: &str = "24 hour";
|
||||
}
|
||||
|
||||
325
pageserver/src/tenant/manifest.rs
Normal file
325
pageserver/src/tenant/manifest.rs
Normal file
@@ -0,0 +1,325 @@
|
||||
//! This module contains the encoding and decoding of the local manifest file.
|
||||
//!
|
||||
//! MANIFEST is a write-ahead log which is stored locally to each timeline. It
|
||||
//! records the state of the storage engine. It contains a snapshot of the
|
||||
//! state and all operations proceeding that snapshot. The file begins with a
|
||||
//! header recording MANIFEST version number. After that, it contains a snapshot.
|
||||
//! The snapshot is followed by a list of operations. Each operation is a list
|
||||
//! of records. Each record is either an addition or a removal of a layer.
|
||||
//!
|
||||
//! With MANIFEST, we can:
|
||||
//!
|
||||
//! 1. recover state quickly by reading the file, potentially boosting the
|
||||
//! startup speed.
|
||||
//! 2. ensure all operations are atomic and avoid corruption, solving issues
|
||||
//! like redundant image layer and preparing us for future compaction
|
||||
//! strategies.
|
||||
//!
|
||||
//! There is also a format for storing all layer files on S3, called
|
||||
//! `index_part.json`. Compared with index_part, MANIFEST is an WAL which
|
||||
//! records all operations as logs, and therefore we can easily replay the
|
||||
//! operations when recovering from crash, while ensuring those operations
|
||||
//! are atomic upon restart.
|
||||
//!
|
||||
//! Currently, this is not used in the system. Future refactors will ensure
|
||||
//! the storage state will be recorded in this file, and the system can be
|
||||
//! recovered from this file. This is tracked in
|
||||
//! https://github.com/neondatabase/neon/issues/4418
|
||||
|
||||
use std::io::{self, Read, Write};
|
||||
|
||||
use crate::virtual_file::VirtualFile;
|
||||
use anyhow::Result;
|
||||
use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||
use crc32c::crc32c;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tracing::log::warn;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use super::storage_layer::PersistentLayerDesc;
|
||||
|
||||
pub struct Manifest {
|
||||
file: VirtualFile,
|
||||
}
|
||||
|
||||
#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Debug)]
|
||||
pub struct Snapshot {
|
||||
pub layers: Vec<PersistentLayerDesc>,
|
||||
}
|
||||
|
||||
/// serde by default encode this in tagged enum, and therefore it will be something
|
||||
/// like `{ "AddLayer": { ... } }`.
|
||||
#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Debug)]
|
||||
pub enum Record {
|
||||
AddLayer(PersistentLayerDesc),
|
||||
RemoveLayer(PersistentLayerDesc),
|
||||
}
|
||||
|
||||
/// `echo neon.manifest | sha1sum` and take the leading 8 bytes.
|
||||
const MANIFEST_MAGIC_NUMBER: u64 = 0xf5c44592b806109c;
|
||||
const MANIFEST_VERSION: u64 = 1;
|
||||
|
||||
#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Debug)]
|
||||
pub struct ManifestHeader {
|
||||
magic_number: u64,
|
||||
version: u64,
|
||||
}
|
||||
|
||||
const MANIFEST_HEADER_LEN: usize = 16;
|
||||
|
||||
impl ManifestHeader {
|
||||
fn encode(&self) -> BytesMut {
|
||||
let mut buf = BytesMut::with_capacity(MANIFEST_HEADER_LEN);
|
||||
buf.put_u64(self.magic_number);
|
||||
buf.put_u64(self.version);
|
||||
buf
|
||||
}
|
||||
|
||||
fn decode(mut buf: &[u8]) -> Self {
|
||||
assert!(buf.len() == MANIFEST_HEADER_LEN, "invalid header");
|
||||
Self {
|
||||
magic_number: buf.get_u64(),
|
||||
version: buf.get_u64(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Debug)]
|
||||
pub enum Operation {
|
||||
/// A snapshot of the current state.
|
||||
///
|
||||
/// Lsn field represents the LSN that is persisted to disk for this snapshot.
|
||||
Snapshot(Snapshot, Lsn),
|
||||
/// An atomic operation that changes the state.
|
||||
///
|
||||
/// Lsn field represents the LSN that is persisted to disk after the operation is done.
|
||||
/// This will only change when new L0 is flushed to the disk.
|
||||
Operation(Vec<Record>, Lsn),
|
||||
}
|
||||
|
||||
struct RecordHeader {
|
||||
size: u32,
|
||||
checksum: u32,
|
||||
}
|
||||
|
||||
const RECORD_HEADER_LEN: usize = 8;
|
||||
|
||||
impl RecordHeader {
|
||||
fn encode(&self) -> BytesMut {
|
||||
let mut buf = BytesMut::with_capacity(RECORD_HEADER_LEN);
|
||||
buf.put_u32(self.size);
|
||||
buf.put_u32(self.checksum);
|
||||
buf
|
||||
}
|
||||
|
||||
fn decode(mut buf: &[u8]) -> Self {
|
||||
assert!(buf.len() == RECORD_HEADER_LEN, "invalid header");
|
||||
Self {
|
||||
size: buf.get_u32(),
|
||||
checksum: buf.get_u32(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum ManifestLoadError {
|
||||
#[error("manifest header is corrupted")]
|
||||
CorruptedManifestHeader,
|
||||
#[error("unsupported manifest version: got {0}, expected {1}")]
|
||||
UnsupportedVersion(u64, u64),
|
||||
#[error("error when decoding record: {0}")]
|
||||
DecodeRecord(serde_json::Error),
|
||||
#[error("I/O error: {0}")]
|
||||
Io(io::Error),
|
||||
}
|
||||
|
||||
#[must_use = "Should check if the manifest is partially corrupted"]
|
||||
pub struct ManifestPartiallyCorrupted(bool);
|
||||
|
||||
impl Manifest {
|
||||
/// Create a new manifest by writing the manifest header and a snapshot record to the given file.
|
||||
pub fn init(file: VirtualFile, snapshot: Snapshot, lsn: Lsn) -> Result<Self> {
|
||||
let mut manifest = Self { file };
|
||||
manifest.append_manifest_header(ManifestHeader {
|
||||
magic_number: MANIFEST_MAGIC_NUMBER,
|
||||
version: MANIFEST_VERSION,
|
||||
})?;
|
||||
manifest.append_operation(Operation::Snapshot(snapshot, lsn))?;
|
||||
Ok(manifest)
|
||||
}
|
||||
|
||||
/// Load a manifest. Returns the manifest and a list of operations. If the manifest is corrupted,
|
||||
/// the bool flag will be set to true and the user is responsible to reconstruct a new manifest and
|
||||
/// backup the current one.
|
||||
pub fn load(
|
||||
mut file: VirtualFile,
|
||||
) -> Result<(Self, Vec<Operation>, ManifestPartiallyCorrupted), ManifestLoadError> {
|
||||
let mut buf = vec![];
|
||||
file.read_to_end(&mut buf).map_err(ManifestLoadError::Io)?;
|
||||
|
||||
// Read manifest header
|
||||
let mut buf = Bytes::from(buf);
|
||||
if buf.remaining() < MANIFEST_HEADER_LEN {
|
||||
return Err(ManifestLoadError::CorruptedManifestHeader);
|
||||
}
|
||||
let header = ManifestHeader::decode(&buf[..MANIFEST_HEADER_LEN]);
|
||||
buf.advance(MANIFEST_HEADER_LEN);
|
||||
if header.version != MANIFEST_VERSION {
|
||||
return Err(ManifestLoadError::UnsupportedVersion(
|
||||
header.version,
|
||||
MANIFEST_VERSION,
|
||||
));
|
||||
}
|
||||
|
||||
// Read operations
|
||||
let mut operations = Vec::new();
|
||||
let corrupted = loop {
|
||||
if buf.remaining() == 0 {
|
||||
break false;
|
||||
}
|
||||
if buf.remaining() < RECORD_HEADER_LEN {
|
||||
warn!("incomplete header when decoding manifest, could be corrupted");
|
||||
break true;
|
||||
}
|
||||
let RecordHeader { size, checksum } = RecordHeader::decode(&buf[..RECORD_HEADER_LEN]);
|
||||
let size = size as usize;
|
||||
buf.advance(RECORD_HEADER_LEN);
|
||||
if buf.remaining() < size {
|
||||
warn!("incomplete data when decoding manifest, could be corrupted");
|
||||
break true;
|
||||
}
|
||||
let data = &buf[..size];
|
||||
if crc32c(data) != checksum {
|
||||
warn!("checksum mismatch when decoding manifest, could be corrupted");
|
||||
break true;
|
||||
}
|
||||
// if the following decode fails, we cannot use the manifest or safely ignore any record.
|
||||
operations.push(serde_json::from_slice(data).map_err(ManifestLoadError::DecodeRecord)?);
|
||||
buf.advance(size);
|
||||
};
|
||||
Ok((
|
||||
Self { file },
|
||||
operations,
|
||||
ManifestPartiallyCorrupted(corrupted),
|
||||
))
|
||||
}
|
||||
|
||||
fn append_data(&mut self, data: &[u8]) -> Result<()> {
|
||||
if data.len() >= u32::MAX as usize {
|
||||
panic!("data too large");
|
||||
}
|
||||
let header = RecordHeader {
|
||||
size: data.len() as u32,
|
||||
checksum: crc32c(data),
|
||||
};
|
||||
let header = header.encode();
|
||||
self.file.write_all(&header)?;
|
||||
self.file.write_all(data)?;
|
||||
self.file.sync_all()?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn append_manifest_header(&mut self, header: ManifestHeader) -> Result<()> {
|
||||
let encoded = header.encode();
|
||||
self.file.write_all(&encoded)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Add an operation to the manifest. The operation will be appended to the end of the file,
|
||||
/// and the file will fsync.
|
||||
pub fn append_operation(&mut self, operation: Operation) -> Result<()> {
|
||||
let encoded = Vec::from(serde_json::to_string(&operation)?);
|
||||
self.append_data(&encoded)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::fs::OpenOptions;
|
||||
|
||||
use crate::repository::Key;
|
||||
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_read_manifest() {
|
||||
let testdir = crate::config::PageServerConf::test_repo_dir("test_read_manifest");
|
||||
std::fs::create_dir_all(&testdir).unwrap();
|
||||
let file = VirtualFile::create(&testdir.join("MANIFEST")).unwrap();
|
||||
let layer1 = PersistentLayerDesc::new_test(Key::from_i128(0)..Key::from_i128(233));
|
||||
let layer2 = PersistentLayerDesc::new_test(Key::from_i128(233)..Key::from_i128(2333));
|
||||
let layer3 = PersistentLayerDesc::new_test(Key::from_i128(2333)..Key::from_i128(23333));
|
||||
let layer4 = PersistentLayerDesc::new_test(Key::from_i128(23333)..Key::from_i128(233333));
|
||||
|
||||
// Write a manifest with a snapshot and some operations
|
||||
let snapshot = Snapshot {
|
||||
layers: vec![layer1, layer2],
|
||||
};
|
||||
let mut manifest = Manifest::init(file, snapshot.clone(), Lsn::from(0)).unwrap();
|
||||
manifest
|
||||
.append_operation(Operation::Operation(
|
||||
vec![Record::AddLayer(layer3.clone())],
|
||||
Lsn::from(1),
|
||||
))
|
||||
.unwrap();
|
||||
drop(manifest);
|
||||
|
||||
// Open the second time and write
|
||||
let file = VirtualFile::open_with_options(
|
||||
&testdir.join("MANIFEST"),
|
||||
OpenOptions::new()
|
||||
.read(true)
|
||||
.write(true)
|
||||
.create_new(false)
|
||||
.truncate(false),
|
||||
)
|
||||
.unwrap();
|
||||
let (mut manifest, operations, corrupted) = Manifest::load(file).unwrap();
|
||||
assert!(!corrupted.0);
|
||||
assert_eq!(operations.len(), 2);
|
||||
assert_eq!(
|
||||
&operations[0],
|
||||
&Operation::Snapshot(snapshot.clone(), Lsn::from(0))
|
||||
);
|
||||
assert_eq!(
|
||||
&operations[1],
|
||||
&Operation::Operation(vec![Record::AddLayer(layer3.clone())], Lsn::from(1))
|
||||
);
|
||||
manifest
|
||||
.append_operation(Operation::Operation(
|
||||
vec![
|
||||
Record::RemoveLayer(layer3.clone()),
|
||||
Record::AddLayer(layer4.clone()),
|
||||
],
|
||||
Lsn::from(2),
|
||||
))
|
||||
.unwrap();
|
||||
drop(manifest);
|
||||
|
||||
// Open the third time and verify
|
||||
let file = VirtualFile::open_with_options(
|
||||
&testdir.join("MANIFEST"),
|
||||
OpenOptions::new()
|
||||
.read(true)
|
||||
.write(true)
|
||||
.create_new(false)
|
||||
.truncate(false),
|
||||
)
|
||||
.unwrap();
|
||||
let (_manifest, operations, corrupted) = Manifest::load(file).unwrap();
|
||||
assert!(!corrupted.0);
|
||||
assert_eq!(operations.len(), 3);
|
||||
assert_eq!(&operations[0], &Operation::Snapshot(snapshot, Lsn::from(0)));
|
||||
assert_eq!(
|
||||
&operations[1],
|
||||
&Operation::Operation(vec![Record::AddLayer(layer3.clone())], Lsn::from(1))
|
||||
);
|
||||
assert_eq!(
|
||||
&operations[2],
|
||||
&Operation::Operation(
|
||||
vec![Record::RemoveLayer(layer3), Record::AddLayer(layer4)],
|
||||
Lsn::from(2)
|
||||
)
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -675,7 +675,7 @@ pub async fn immediate_gc(
|
||||
.get(&tenant_id)
|
||||
.map(Arc::clone)
|
||||
.with_context(|| format!("tenant {tenant_id}"))
|
||||
.map_err(ApiError::NotFound)?;
|
||||
.map_err(|e| ApiError::NotFound(e.into()))?;
|
||||
|
||||
let gc_horizon = gc_req.gc_horizon.unwrap_or_else(|| tenant.get_gc_horizon());
|
||||
// Use tenant's pitr setting
|
||||
@@ -724,11 +724,11 @@ pub async fn immediate_compact(
|
||||
.get(&tenant_id)
|
||||
.map(Arc::clone)
|
||||
.with_context(|| format!("tenant {tenant_id}"))
|
||||
.map_err(ApiError::NotFound)?;
|
||||
.map_err(|e| ApiError::NotFound(e.into()))?;
|
||||
|
||||
let timeline = tenant
|
||||
.get_timeline(timeline_id, true)
|
||||
.map_err(ApiError::NotFound)?;
|
||||
.map_err(|e| ApiError::NotFound(e.into()))?;
|
||||
|
||||
// Run in task_mgr to avoid race with tenant_detach operation
|
||||
let ctx = ctx.detached_child(TaskKind::Compaction, DownloadBehavior::Download);
|
||||
|
||||
@@ -753,22 +753,18 @@ impl RemoteTimelineClient {
|
||||
|
||||
// Have a failpoint that can use the `pause` failpoint action.
|
||||
// We don't want to block the executor thread, hence, spawn_blocking + await.
|
||||
#[cfg(feature = "testing")]
|
||||
tokio::task::spawn_blocking({
|
||||
let current = tracing::Span::current();
|
||||
move || {
|
||||
let _entered = current.entered();
|
||||
tracing::info!(
|
||||
"at failpoint persist_index_part_with_deleted_flag_after_set_before_upload_pause"
|
||||
);
|
||||
fail::fail_point!(
|
||||
"persist_index_part_with_deleted_flag_after_set_before_upload_pause"
|
||||
);
|
||||
}
|
||||
})
|
||||
.await
|
||||
.expect("spawn_blocking");
|
||||
|
||||
if cfg!(feature = "testing") {
|
||||
tokio::task::spawn_blocking({
|
||||
let current = tracing::Span::current();
|
||||
move || {
|
||||
let _entered = current.entered();
|
||||
tracing::info!("at failpoint persist_deleted_index_part");
|
||||
fail::fail_point!("persist_deleted_index_part");
|
||||
}
|
||||
})
|
||||
.await
|
||||
.expect("spawn_blocking");
|
||||
}
|
||||
upload::upload_index_part(
|
||||
self.conf,
|
||||
&self.storage_impl,
|
||||
@@ -1371,7 +1367,7 @@ mod tests {
|
||||
struct TestSetup {
|
||||
runtime: &'static tokio::runtime::Runtime,
|
||||
entered_runtime: EnterGuard<'static>,
|
||||
harness: TenantHarness<'static>,
|
||||
harness: TenantHarness,
|
||||
tenant: Arc<Tenant>,
|
||||
tenant_ctx: RequestContext,
|
||||
remote_fs_dir: PathBuf,
|
||||
@@ -1392,7 +1388,12 @@ mod tests {
|
||||
let harness = TenantHarness::create(test_name)?;
|
||||
let (tenant, ctx) = runtime.block_on(harness.load());
|
||||
// create an empty timeline directory
|
||||
let _ = tenant.create_test_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?;
|
||||
let _ = runtime.block_on(tenant.create_test_timeline(
|
||||
TIMELINE_ID,
|
||||
Lsn(8),
|
||||
DEFAULT_PG_VERSION,
|
||||
&ctx,
|
||||
))?;
|
||||
|
||||
let remote_fs_dir = harness.conf.workdir.join("remote_fs");
|
||||
std::fs::create_dir_all(remote_fs_dir)?;
|
||||
|
||||
@@ -12,7 +12,7 @@ use crate::context::RequestContext;
|
||||
use crate::repository::{Key, Value};
|
||||
use crate::task_mgr::TaskKind;
|
||||
use crate::walrecord::NeonWalRecord;
|
||||
use anyhow::Result;
|
||||
use anyhow::{Context, Result};
|
||||
use bytes::Bytes;
|
||||
use enum_map::EnumMap;
|
||||
use enumset::EnumSet;
|
||||
@@ -343,7 +343,8 @@ impl LayerAccessStats {
|
||||
/// All layers should implement a minimal `std::fmt::Debug` without tenant or
|
||||
/// timeline names, because those are known in the context of which the layers
|
||||
/// are used in (timeline).
|
||||
pub trait Layer: std::fmt::Debug + Send + Sync {
|
||||
#[async_trait::async_trait]
|
||||
pub trait Layer: std::fmt::Debug + Send + Sync + 'static {
|
||||
/// Range of keys that this layer covers
|
||||
fn get_key_range(&self) -> Range<Key>;
|
||||
|
||||
@@ -373,13 +374,42 @@ pub trait Layer: std::fmt::Debug + Send + Sync {
|
||||
/// is available. If this returns ValueReconstructResult::Continue, look up
|
||||
/// the predecessor layer and call again with the same 'reconstruct_data' to
|
||||
/// collect more data.
|
||||
fn get_value_reconstruct_data(
|
||||
fn get_value_reconstruct_data_blocking(
|
||||
&self,
|
||||
key: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
reconstruct_data: &mut ValueReconstructState,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<ValueReconstructResult>;
|
||||
reconstruct_data: ValueReconstructState,
|
||||
ctx: RequestContext,
|
||||
) -> Result<(ValueReconstructState, ValueReconstructResult)>;
|
||||
|
||||
/// CANCEL SAFETY: if the returned future is dropped,
|
||||
/// the wrapped closure still run to completion and the return value discarded.
|
||||
/// For the case of get_value_reconstruct_data, we expect the closure to not
|
||||
/// have any side effects, as it only attempts to read a layer (and stuff like
|
||||
/// page cache isn't considered a real side effect).
|
||||
/// But, ...
|
||||
/// TRACING:
|
||||
/// If the returned future is cancelled, the spawn_blocking span can outlive
|
||||
/// the caller's span.
|
||||
/// So, technically, we should be using `parent: None` and `follows_from: current`
|
||||
/// instead. However, in practice, the advantage of maintaining the span stack
|
||||
/// in logs outweighs the disadvantage of having a dangling span in a case that
|
||||
/// is not expected to happen because in pageserver we generally don't drop pending futures.
|
||||
async fn get_value_reconstruct_data(
|
||||
self: Arc<Self>,
|
||||
key: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
reconstruct_data: ValueReconstructState,
|
||||
ctx: RequestContext,
|
||||
) -> Result<(ValueReconstructState, ValueReconstructResult)> {
|
||||
let span = tracing::info_span!("get_value_reconstruct_data_spawn_blocking");
|
||||
tokio::task::spawn_blocking(move || {
|
||||
let _enter = span.enter();
|
||||
self.get_value_reconstruct_data_blocking(key, lsn_range, reconstruct_data, ctx)
|
||||
})
|
||||
.await
|
||||
.context("spawn_blocking")?
|
||||
}
|
||||
|
||||
/// A short ID string that uniquely identifies the given layer within a [`LayerMap`].
|
||||
fn short_id(&self) -> String;
|
||||
@@ -389,10 +419,10 @@ pub trait Layer: std::fmt::Debug + Send + Sync {
|
||||
}
|
||||
|
||||
/// Returned by [`Layer::iter`]
|
||||
pub type LayerIter<'i> = Box<dyn Iterator<Item = Result<(Key, Lsn, Value)>> + 'i>;
|
||||
pub type LayerIter<'i> = Box<dyn Iterator<Item = Result<(Key, Lsn, Value)>> + 'i + Send>;
|
||||
|
||||
/// Returned by [`Layer::key_iter`]
|
||||
pub type LayerKeyIter<'i> = Box<dyn Iterator<Item = (Key, Lsn, u64)> + 'i>;
|
||||
pub type LayerKeyIter<'i> = Box<dyn Iterator<Item = (Key, Lsn, u64)> + 'i + Send>;
|
||||
|
||||
/// A Layer contains all data in a "rectangle" consisting of a range of keys and
|
||||
/// range of LSNs.
|
||||
@@ -499,6 +529,7 @@ impl LayerDescriptor {
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl Layer for LayerDescriptor {
|
||||
fn get_key_range(&self) -> Range<Key> {
|
||||
self.key.clone()
|
||||
@@ -512,13 +543,13 @@ impl Layer for LayerDescriptor {
|
||||
self.is_incremental
|
||||
}
|
||||
|
||||
fn get_value_reconstruct_data(
|
||||
fn get_value_reconstruct_data_blocking(
|
||||
&self,
|
||||
_key: Key,
|
||||
_lsn_range: Range<Lsn>,
|
||||
_reconstruct_data: &mut ValueReconstructState,
|
||||
_ctx: &RequestContext,
|
||||
) -> Result<ValueReconstructResult> {
|
||||
_reconstruct_data: ValueReconstructState,
|
||||
_ctx: RequestContext,
|
||||
) -> Result<(ValueReconstructState, ValueReconstructResult)> {
|
||||
todo!("This method shouldn't be part of the Layer trait")
|
||||
}
|
||||
|
||||
|
||||
@@ -37,6 +37,7 @@ use crate::virtual_file::VirtualFile;
|
||||
use crate::{walrecord, TEMP_FILE_SUFFIX};
|
||||
use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
|
||||
use anyhow::{bail, ensure, Context, Result};
|
||||
use once_cell::sync::OnceCell;
|
||||
use pageserver_api::models::{HistoricLayerInfo, LayerAccessKind};
|
||||
use rand::{distributions::Alphanumeric, Rng};
|
||||
use serde::{Deserialize, Serialize};
|
||||
@@ -46,7 +47,6 @@ use std::io::{Seek, SeekFrom};
|
||||
use std::ops::Range;
|
||||
use std::os::unix::fs::FileExt;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard};
|
||||
use tracing::*;
|
||||
|
||||
use utils::{
|
||||
@@ -184,7 +184,7 @@ pub struct DeltaLayer {
|
||||
|
||||
access_stats: LayerAccessStats,
|
||||
|
||||
inner: RwLock<DeltaLayerInner>,
|
||||
inner: OnceCell<DeltaLayerInner>,
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for DeltaLayer {
|
||||
@@ -201,27 +201,24 @@ impl std::fmt::Debug for DeltaLayer {
|
||||
}
|
||||
|
||||
pub struct DeltaLayerInner {
|
||||
/// If false, the fields below have not been loaded into memory yet.
|
||||
loaded: bool,
|
||||
|
||||
// values copied from summary
|
||||
index_start_blk: u32,
|
||||
index_root_blk: u32,
|
||||
|
||||
/// Reader object for reading blocks from the file. (None if not loaded yet)
|
||||
file: Option<FileBlockReader<VirtualFile>>,
|
||||
/// Reader object for reading blocks from the file.
|
||||
file: FileBlockReader<VirtualFile>,
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for DeltaLayerInner {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.debug_struct("DeltaLayerInner")
|
||||
.field("loaded", &self.loaded)
|
||||
.field("index_start_blk", &self.index_start_blk)
|
||||
.field("index_root_blk", &self.index_root_blk)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl Layer for DeltaLayer {
|
||||
/// debugging function to print out the contents of the layer
|
||||
fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
|
||||
@@ -246,7 +243,7 @@ impl Layer for DeltaLayer {
|
||||
inner.index_start_blk, inner.index_root_blk
|
||||
);
|
||||
|
||||
let file = inner.file.as_ref().unwrap();
|
||||
let file = &inner.file;
|
||||
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
|
||||
inner.index_start_blk,
|
||||
inner.index_root_blk,
|
||||
@@ -298,13 +295,13 @@ impl Layer for DeltaLayer {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn get_value_reconstruct_data(
|
||||
fn get_value_reconstruct_data_blocking(
|
||||
&self,
|
||||
key: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
reconstruct_state: &mut ValueReconstructState,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<ValueReconstructResult> {
|
||||
mut reconstruct_state: ValueReconstructState,
|
||||
ctx: RequestContext,
|
||||
) -> anyhow::Result<(ValueReconstructState, ValueReconstructResult)> {
|
||||
ensure!(lsn_range.start >= self.desc.lsn_range.start);
|
||||
let mut need_image = true;
|
||||
|
||||
@@ -312,10 +309,10 @@ impl Layer for DeltaLayer {
|
||||
|
||||
{
|
||||
// Open the file and lock the metadata in memory
|
||||
let inner = self.load(LayerAccessKind::GetValueReconstructData, ctx)?;
|
||||
let inner = self.load(LayerAccessKind::GetValueReconstructData, &ctx)?;
|
||||
|
||||
// Scan the page versions backwards, starting from `lsn`.
|
||||
let file = inner.file.as_ref().unwrap();
|
||||
let file = &inner.file;
|
||||
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
|
||||
inner.index_start_blk,
|
||||
inner.index_root_blk,
|
||||
@@ -378,9 +375,9 @@ impl Layer for DeltaLayer {
|
||||
// If an older page image is needed to reconstruct the page, let the
|
||||
// caller know.
|
||||
if need_image {
|
||||
Ok(ValueReconstructResult::Continue)
|
||||
Ok((reconstruct_state, ValueReconstructResult::Continue))
|
||||
} else {
|
||||
Ok(ValueReconstructResult::Complete)
|
||||
Ok((reconstruct_state, ValueReconstructResult::Complete))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -500,51 +497,22 @@ impl DeltaLayer {
|
||||
/// Open the underlying file and read the metadata into memory, if it's
|
||||
/// not loaded already.
|
||||
///
|
||||
fn load(
|
||||
&self,
|
||||
access_kind: LayerAccessKind,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<RwLockReadGuard<DeltaLayerInner>> {
|
||||
fn load(&self, access_kind: LayerAccessKind, ctx: &RequestContext) -> Result<&DeltaLayerInner> {
|
||||
self.access_stats
|
||||
.record_access(access_kind, ctx.task_kind());
|
||||
loop {
|
||||
// Quick exit if already loaded
|
||||
let inner = self.inner.read().unwrap();
|
||||
if inner.loaded {
|
||||
return Ok(inner);
|
||||
}
|
||||
|
||||
// Need to open the file and load the metadata. Upgrade our lock to
|
||||
// a write lock. (Or rather, release and re-lock in write mode.)
|
||||
drop(inner);
|
||||
let inner = self.inner.write().unwrap();
|
||||
if !inner.loaded {
|
||||
self.load_inner(inner).with_context(|| {
|
||||
format!("Failed to load delta layer {}", self.path().display())
|
||||
})?;
|
||||
} else {
|
||||
// Another thread loaded it while we were not holding the lock.
|
||||
}
|
||||
|
||||
// We now have the file open and loaded. There's no function to do
|
||||
// that in the std library RwLock, so we have to release and re-lock
|
||||
// in read mode. (To be precise, the lock guard was moved in the
|
||||
// above call to `load_inner`, so it's already been released). And
|
||||
// while we do that, another thread could unload again, so we have
|
||||
// to re-check and retry if that happens.
|
||||
}
|
||||
// Quick exit if already loaded
|
||||
self.inner
|
||||
.get_or_try_init(|| self.load_inner())
|
||||
.with_context(|| format!("Failed to load delta layer {}", self.path().display()))
|
||||
}
|
||||
|
||||
fn load_inner(&self, mut inner: RwLockWriteGuard<DeltaLayerInner>) -> Result<()> {
|
||||
fn load_inner(&self) -> Result<DeltaLayerInner> {
|
||||
let path = self.path();
|
||||
|
||||
// Open the file if it's not open already.
|
||||
if inner.file.is_none() {
|
||||
let file = VirtualFile::open(&path)
|
||||
.with_context(|| format!("Failed to open file '{}'", path.display()))?;
|
||||
inner.file = Some(FileBlockReader::new(file));
|
||||
}
|
||||
let file = inner.file.as_mut().unwrap();
|
||||
let file = VirtualFile::open(&path)
|
||||
.with_context(|| format!("Failed to open file '{}'", path.display()))?;
|
||||
let file = FileBlockReader::new(file);
|
||||
|
||||
let summary_blk = file.read_blk(0)?;
|
||||
let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
|
||||
|
||||
@@ -571,13 +539,13 @@ impl DeltaLayer {
|
||||
}
|
||||
}
|
||||
|
||||
inner.index_start_blk = actual_summary.index_start_blk;
|
||||
inner.index_root_blk = actual_summary.index_root_blk;
|
||||
|
||||
debug!("loaded from {}", &path.display());
|
||||
|
||||
inner.loaded = true;
|
||||
Ok(())
|
||||
Ok(DeltaLayerInner {
|
||||
file,
|
||||
index_start_blk: actual_summary.index_start_blk,
|
||||
index_root_blk: actual_summary.index_root_blk,
|
||||
})
|
||||
}
|
||||
|
||||
/// Create a DeltaLayer struct representing an existing file on disk.
|
||||
@@ -599,12 +567,7 @@ impl DeltaLayer {
|
||||
file_size,
|
||||
),
|
||||
access_stats,
|
||||
inner: RwLock::new(DeltaLayerInner {
|
||||
loaded: false,
|
||||
file: None,
|
||||
index_start_blk: 0,
|
||||
index_root_blk: 0,
|
||||
}),
|
||||
inner: once_cell::sync::OnceCell::new(),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -631,12 +594,7 @@ impl DeltaLayer {
|
||||
metadata.len(),
|
||||
),
|
||||
access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
|
||||
inner: RwLock::new(DeltaLayerInner {
|
||||
loaded: false,
|
||||
file: None,
|
||||
index_start_blk: 0,
|
||||
index_root_blk: 0,
|
||||
}),
|
||||
inner: once_cell::sync::OnceCell::new(),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -800,12 +758,7 @@ impl DeltaLayerWriterInner {
|
||||
metadata.len(),
|
||||
),
|
||||
access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
|
||||
inner: RwLock::new(DeltaLayerInner {
|
||||
loaded: false,
|
||||
file: None,
|
||||
index_start_blk,
|
||||
index_root_blk,
|
||||
}),
|
||||
inner: once_cell::sync::OnceCell::new(),
|
||||
};
|
||||
|
||||
// fsync the file
|
||||
@@ -940,13 +893,13 @@ struct DeltaValueIter<'a> {
|
||||
reader: BlockCursor<Adapter<'a>>,
|
||||
}
|
||||
|
||||
struct Adapter<'a>(RwLockReadGuard<'a, DeltaLayerInner>);
|
||||
struct Adapter<'a>(&'a DeltaLayerInner);
|
||||
|
||||
impl<'a> BlockReader for Adapter<'a> {
|
||||
type BlockLease = PageReadGuard<'static>;
|
||||
|
||||
fn read_blk(&self, blknum: u32) -> Result<Self::BlockLease, std::io::Error> {
|
||||
self.0.file.as_ref().unwrap().read_blk(blknum)
|
||||
self.0.file.read_blk(blknum)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -959,8 +912,8 @@ impl<'a> Iterator for DeltaValueIter<'a> {
|
||||
}
|
||||
|
||||
impl<'a> DeltaValueIter<'a> {
|
||||
fn new(inner: RwLockReadGuard<'a, DeltaLayerInner>) -> Result<Self> {
|
||||
let file = inner.file.as_ref().unwrap();
|
||||
fn new(inner: &'a DeltaLayerInner) -> Result<Self> {
|
||||
let file = &inner.file;
|
||||
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
|
||||
inner.index_start_blk,
|
||||
inner.index_root_blk,
|
||||
@@ -1033,8 +986,8 @@ impl Iterator for DeltaKeyIter {
|
||||
}
|
||||
|
||||
impl<'a> DeltaKeyIter {
|
||||
fn new(inner: RwLockReadGuard<'a, DeltaLayerInner>) -> Result<Self> {
|
||||
let file = inner.file.as_ref().unwrap();
|
||||
fn new(inner: &'a DeltaLayerInner) -> Result<Self> {
|
||||
let file = &inner.file;
|
||||
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
|
||||
inner.index_start_blk,
|
||||
inner.index_root_blk,
|
||||
@@ -1074,3 +1027,21 @@ impl<'a> DeltaKeyIter {
|
||||
Ok(iter)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use super::DeltaKeyIter;
|
||||
use super::DeltaLayer;
|
||||
use super::DeltaValueIter;
|
||||
|
||||
// We will soon need the iters to be send in the compaction code.
|
||||
// Cf https://github.com/neondatabase/neon/pull/4462#issuecomment-1587398883
|
||||
// Cf https://github.com/neondatabase/neon/issues/4471
|
||||
#[test]
|
||||
fn is_send() {
|
||||
fn assert_send<T: Send>() {}
|
||||
assert_send::<DeltaLayer>();
|
||||
assert_send::<DeltaValueIter>();
|
||||
assert_send::<DeltaKeyIter>();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -149,6 +149,7 @@ impl std::fmt::Debug for ImageLayerInner {
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl Layer for ImageLayer {
|
||||
/// debugging function to print out the contents of the layer
|
||||
fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
|
||||
@@ -181,18 +182,18 @@ impl Layer for ImageLayer {
|
||||
}
|
||||
|
||||
/// Look up given page in the file
|
||||
fn get_value_reconstruct_data(
|
||||
fn get_value_reconstruct_data_blocking(
|
||||
&self,
|
||||
key: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
reconstruct_state: &mut ValueReconstructState,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<ValueReconstructResult> {
|
||||
mut reconstruct_state: ValueReconstructState,
|
||||
ctx: RequestContext,
|
||||
) -> anyhow::Result<(ValueReconstructState, ValueReconstructResult)> {
|
||||
assert!(self.desc.key_range.contains(&key));
|
||||
assert!(lsn_range.start >= self.lsn);
|
||||
assert!(lsn_range.end >= self.lsn);
|
||||
|
||||
let inner = self.load(LayerAccessKind::GetValueReconstructData, ctx)?;
|
||||
let inner = self.load(LayerAccessKind::GetValueReconstructData, &ctx)?;
|
||||
|
||||
let file = inner.file.as_ref().unwrap();
|
||||
let tree_reader = DiskBtreeReader::new(inner.index_start_blk, inner.index_root_blk, file);
|
||||
@@ -210,9 +211,9 @@ impl Layer for ImageLayer {
|
||||
let value = Bytes::from(blob);
|
||||
|
||||
reconstruct_state.img = Some((self.lsn, value));
|
||||
Ok(ValueReconstructResult::Complete)
|
||||
Ok((reconstruct_state, ValueReconstructResult::Complete))
|
||||
} else {
|
||||
Ok(ValueReconstructResult::Missing)
|
||||
Ok((reconstruct_state, ValueReconstructResult::Missing))
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -110,6 +110,7 @@ impl InMemoryLayer {
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl Layer for InMemoryLayer {
|
||||
fn get_key_range(&self) -> Range<Key> {
|
||||
Key::MIN..Key::MAX
|
||||
@@ -190,13 +191,13 @@ impl Layer for InMemoryLayer {
|
||||
}
|
||||
|
||||
/// Look up given value in the layer.
|
||||
fn get_value_reconstruct_data(
|
||||
fn get_value_reconstruct_data_blocking(
|
||||
&self,
|
||||
key: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
reconstruct_state: &mut ValueReconstructState,
|
||||
_ctx: &RequestContext,
|
||||
) -> anyhow::Result<ValueReconstructResult> {
|
||||
mut reconstruct_state: ValueReconstructState,
|
||||
_ctx: RequestContext,
|
||||
) -> anyhow::Result<(ValueReconstructState, ValueReconstructResult)> {
|
||||
ensure!(lsn_range.start >= self.start_lsn);
|
||||
let mut need_image = true;
|
||||
|
||||
@@ -213,7 +214,7 @@ impl Layer for InMemoryLayer {
|
||||
match value {
|
||||
Value::Image(img) => {
|
||||
reconstruct_state.img = Some((*entry_lsn, img));
|
||||
return Ok(ValueReconstructResult::Complete);
|
||||
return Ok((reconstruct_state, ValueReconstructResult::Complete));
|
||||
}
|
||||
Value::WalRecord(rec) => {
|
||||
let will_init = rec.will_init();
|
||||
@@ -233,9 +234,9 @@ impl Layer for InMemoryLayer {
|
||||
// If an older page image is needed to reconstruct the page, let the
|
||||
// caller know.
|
||||
if need_image {
|
||||
Ok(ValueReconstructResult::Continue)
|
||||
Ok((reconstruct_state, ValueReconstructResult::Continue))
|
||||
} else {
|
||||
Ok(ValueReconstructResult::Complete)
|
||||
Ok((reconstruct_state, ValueReconstructResult::Complete))
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -304,7 +305,7 @@ impl InMemoryLayer {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn put_tombstone(&self, _key_range: Range<Key>, _lsn: Lsn) -> Result<()> {
|
||||
pub async fn put_tombstone(&self, _key_range: Range<Key>, _lsn: Lsn) -> Result<()> {
|
||||
// TODO: Currently, we just leak the storage for any deleted keys
|
||||
|
||||
Ok(())
|
||||
|
||||
@@ -9,10 +9,12 @@ use crate::{context::RequestContext, repository::Key};
|
||||
|
||||
use super::{DeltaFileName, ImageFileName, LayerFileName};
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
/// A unique identifier of a persistent layer. This is different from `LayerDescriptor`, which is only used in the
|
||||
/// benchmarks. This struct contains all necessary information to find the image / delta layer. It also provides
|
||||
/// a unified way to generate layer information like file name.
|
||||
#[derive(Debug, PartialEq, Eq, Clone)]
|
||||
#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)]
|
||||
pub struct PersistentLayerDesc {
|
||||
pub tenant_id: TenantId,
|
||||
pub timeline_id: TimelineId,
|
||||
@@ -50,6 +52,19 @@ impl PersistentLayerDesc {
|
||||
self.filename().file_name()
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub fn new_test(key_range: Range<Key>) -> Self {
|
||||
Self {
|
||||
tenant_id: TenantId::generate(),
|
||||
timeline_id: TimelineId::generate(),
|
||||
key_range,
|
||||
lsn_range: Lsn(0)..Lsn(1),
|
||||
is_delta: false,
|
||||
is_incremental: false,
|
||||
file_size: 0,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn new_img(
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
|
||||
@@ -6,7 +6,7 @@ use crate::context::RequestContext;
|
||||
use crate::repository::Key;
|
||||
use crate::tenant::layer_map::BatchedUpdates;
|
||||
use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
|
||||
use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};
|
||||
use crate::tenant::storage_layer::{Layer, ValueReconstructState};
|
||||
use anyhow::{bail, Result};
|
||||
use pageserver_api::models::HistoricLayerInfo;
|
||||
use std::ops::Range;
|
||||
@@ -21,7 +21,7 @@ use utils::{
|
||||
use super::filename::{DeltaFileName, ImageFileName};
|
||||
use super::{
|
||||
DeltaLayer, ImageLayer, LayerAccessStats, LayerAccessStatsReset, LayerIter, LayerKeyIter,
|
||||
LayerResidenceStatus, PersistentLayer, PersistentLayerDesc,
|
||||
LayerResidenceStatus, PersistentLayer, PersistentLayerDesc, ValueReconstructResult,
|
||||
};
|
||||
|
||||
/// RemoteLayer is a not yet downloaded [`ImageLayer`] or
|
||||
@@ -63,14 +63,15 @@ impl std::fmt::Debug for RemoteLayer {
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl Layer for RemoteLayer {
|
||||
fn get_value_reconstruct_data(
|
||||
fn get_value_reconstruct_data_blocking(
|
||||
&self,
|
||||
_key: Key,
|
||||
_lsn_range: Range<Lsn>,
|
||||
_reconstruct_state: &mut ValueReconstructState,
|
||||
_ctx: &RequestContext,
|
||||
) -> Result<ValueReconstructResult> {
|
||||
_reconstruct_state: ValueReconstructState,
|
||||
_ctx: RequestContext,
|
||||
) -> Result<(ValueReconstructState, ValueReconstructResult)> {
|
||||
bail!(
|
||||
"layer {} needs to be downloaded",
|
||||
self.filename().file_name()
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -197,7 +197,7 @@ impl Timeline {
|
||||
// We don't want to hold the layer map lock during eviction.
|
||||
// So, we just need to deal with this.
|
||||
let candidates: Vec<Arc<dyn PersistentLayer>> = {
|
||||
let layers = self.layers.read().unwrap();
|
||||
let layers = self.layers.read().await;
|
||||
let mut candidates = Vec::new();
|
||||
for hist_layer in layers.iter_historic_layers() {
|
||||
if hist_layer.is_remote_layer() {
|
||||
|
||||
@@ -1321,10 +1321,11 @@ mod tests {
|
||||
|
||||
const DUMMY_SAFEKEEPER_HOST: &str = "safekeeper_connstr";
|
||||
|
||||
async fn dummy_state(harness: &TenantHarness<'_>) -> ConnectionManagerState {
|
||||
async fn dummy_state(harness: &TenantHarness) -> ConnectionManagerState {
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
let timeline = tenant
|
||||
.create_test_timeline(TIMELINE_ID, Lsn(0), crate::DEFAULT_PG_VERSION, &ctx)
|
||||
.create_test_timeline(TIMELINE_ID, Lsn(0x8), crate::DEFAULT_PG_VERSION, &ctx)
|
||||
.await
|
||||
.expect("Failed to create an empty timeline for dummy wal connection manager");
|
||||
|
||||
ConnectionManagerState {
|
||||
|
||||
@@ -304,12 +304,15 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
}
|
||||
}
|
||||
|
||||
timeline.check_checkpoint_distance().with_context(|| {
|
||||
format!(
|
||||
"Failed to check checkpoint distance for timeline {}",
|
||||
timeline.timeline_id
|
||||
)
|
||||
})?;
|
||||
timeline
|
||||
.check_checkpoint_distance()
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to check checkpoint distance for timeline {}",
|
||||
timeline.timeline_id
|
||||
)
|
||||
})?;
|
||||
|
||||
if let Some(last_lsn) = status_update {
|
||||
let timeline_remote_consistent_lsn =
|
||||
|
||||
@@ -25,7 +25,7 @@ use postgres_ffi::v14::nonrelfile_utils::clogpage_precedes;
|
||||
use postgres_ffi::v14::nonrelfile_utils::slru_may_delete_clogsegment;
|
||||
use postgres_ffi::{fsm_logical_to_physical, page_is_new, page_set_lsn};
|
||||
|
||||
use anyhow::Result;
|
||||
use anyhow::{Context, Result};
|
||||
use bytes::{Buf, Bytes, BytesMut};
|
||||
use tracing::*;
|
||||
|
||||
@@ -333,7 +333,7 @@ impl<'a> WalIngest<'a> {
|
||||
|
||||
// Now that this record has been fully handled, including updating the
|
||||
// checkpoint data, let the repository know that it is up-to-date to this LSN
|
||||
modification.commit()?;
|
||||
modification.commit().await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -1082,7 +1082,10 @@ impl<'a> WalIngest<'a> {
|
||||
.await?
|
||||
{
|
||||
// create it with 0 size initially, the logic below will extend it
|
||||
modification.put_rel_creation(rel, 0, ctx).await?;
|
||||
modification
|
||||
.put_rel_creation(rel, 0, ctx)
|
||||
.await
|
||||
.context("Relation Error")?;
|
||||
0
|
||||
} else {
|
||||
self.timeline.get_rel_size(rel, last_lsn, true, ctx).await?
|
||||
@@ -1171,7 +1174,6 @@ impl<'a> WalIngest<'a> {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::pgdatadir_mapping::create_test_timeline;
|
||||
use crate::tenant::harness::*;
|
||||
use crate::tenant::Timeline;
|
||||
use postgres_ffi::v14::xlog_utils::SIZEOF_CHECKPOINT;
|
||||
@@ -1200,7 +1202,7 @@ mod tests {
|
||||
let mut m = tline.begin_modification(Lsn(0x10));
|
||||
m.put_checkpoint(ZERO_CHECKPOINT.clone())?;
|
||||
m.put_relmap_file(0, 111, Bytes::from(""), ctx).await?; // dummy relmapper file
|
||||
m.commit()?;
|
||||
m.commit().await?;
|
||||
let walingest = WalIngest::new(tline, Lsn(0x10), ctx).await?;
|
||||
|
||||
Ok(walingest)
|
||||
@@ -1209,7 +1211,9 @@ mod tests {
|
||||
#[tokio::test]
|
||||
async fn test_relsize() -> Result<()> {
|
||||
let (tenant, ctx) = TenantHarness::create("test_relsize")?.load().await;
|
||||
let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION, &ctx)?;
|
||||
let tline = tenant
|
||||
.create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx)
|
||||
.await?;
|
||||
let mut walingest = init_walingest_test(&tline, &ctx).await?;
|
||||
|
||||
let mut m = tline.begin_modification(Lsn(0x20));
|
||||
@@ -1217,22 +1221,22 @@ mod tests {
|
||||
walingest
|
||||
.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"), &ctx)
|
||||
.await?;
|
||||
m.commit()?;
|
||||
m.commit().await?;
|
||||
let mut m = tline.begin_modification(Lsn(0x30));
|
||||
walingest
|
||||
.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 3"), &ctx)
|
||||
.await?;
|
||||
m.commit()?;
|
||||
m.commit().await?;
|
||||
let mut m = tline.begin_modification(Lsn(0x40));
|
||||
walingest
|
||||
.put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1 at 4"), &ctx)
|
||||
.await?;
|
||||
m.commit()?;
|
||||
m.commit().await?;
|
||||
let mut m = tline.begin_modification(Lsn(0x50));
|
||||
walingest
|
||||
.put_rel_page_image(&mut m, TESTREL_A, 2, TEST_IMG("foo blk 2 at 5"), &ctx)
|
||||
.await?;
|
||||
m.commit()?;
|
||||
m.commit().await?;
|
||||
|
||||
assert_current_logical_size(&tline, Lsn(0x50));
|
||||
|
||||
@@ -1318,7 +1322,7 @@ mod tests {
|
||||
walingest
|
||||
.put_rel_truncation(&mut m, TESTREL_A, 2, &ctx)
|
||||
.await?;
|
||||
m.commit()?;
|
||||
m.commit().await?;
|
||||
assert_current_logical_size(&tline, Lsn(0x60));
|
||||
|
||||
// Check reported size and contents after truncation
|
||||
@@ -1360,7 +1364,7 @@ mod tests {
|
||||
walingest
|
||||
.put_rel_truncation(&mut m, TESTREL_A, 0, &ctx)
|
||||
.await?;
|
||||
m.commit()?;
|
||||
m.commit().await?;
|
||||
assert_eq!(
|
||||
tline
|
||||
.get_rel_size(TESTREL_A, Lsn(0x68), false, &ctx)
|
||||
@@ -1373,7 +1377,7 @@ mod tests {
|
||||
walingest
|
||||
.put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1"), &ctx)
|
||||
.await?;
|
||||
m.commit()?;
|
||||
m.commit().await?;
|
||||
assert_eq!(
|
||||
tline
|
||||
.get_rel_size(TESTREL_A, Lsn(0x70), false, &ctx)
|
||||
@@ -1398,7 +1402,7 @@ mod tests {
|
||||
walingest
|
||||
.put_rel_page_image(&mut m, TESTREL_A, 1500, TEST_IMG("foo blk 1500"), &ctx)
|
||||
.await?;
|
||||
m.commit()?;
|
||||
m.commit().await?;
|
||||
assert_eq!(
|
||||
tline
|
||||
.get_rel_size(TESTREL_A, Lsn(0x80), false, &ctx)
|
||||
@@ -1428,14 +1432,16 @@ mod tests {
|
||||
#[tokio::test]
|
||||
async fn test_drop_extend() -> Result<()> {
|
||||
let (tenant, ctx) = TenantHarness::create("test_drop_extend")?.load().await;
|
||||
let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION, &ctx)?;
|
||||
let tline = tenant
|
||||
.create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx)
|
||||
.await?;
|
||||
let mut walingest = init_walingest_test(&tline, &ctx).await?;
|
||||
|
||||
let mut m = tline.begin_modification(Lsn(0x20));
|
||||
walingest
|
||||
.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"), &ctx)
|
||||
.await?;
|
||||
m.commit()?;
|
||||
m.commit().await?;
|
||||
|
||||
// Check that rel exists and size is correct
|
||||
assert_eq!(
|
||||
@@ -1454,7 +1460,7 @@ mod tests {
|
||||
// Drop rel
|
||||
let mut m = tline.begin_modification(Lsn(0x30));
|
||||
walingest.put_rel_drop(&mut m, TESTREL_A, &ctx).await?;
|
||||
m.commit()?;
|
||||
m.commit().await?;
|
||||
|
||||
// Check that rel is not visible anymore
|
||||
assert_eq!(
|
||||
@@ -1472,7 +1478,7 @@ mod tests {
|
||||
walingest
|
||||
.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 4"), &ctx)
|
||||
.await?;
|
||||
m.commit()?;
|
||||
m.commit().await?;
|
||||
|
||||
// Check that rel exists and size is correct
|
||||
assert_eq!(
|
||||
@@ -1497,7 +1503,9 @@ mod tests {
|
||||
#[tokio::test]
|
||||
async fn test_truncate_extend() -> Result<()> {
|
||||
let (tenant, ctx) = TenantHarness::create("test_truncate_extend")?.load().await;
|
||||
let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION, &ctx)?;
|
||||
let tline = tenant
|
||||
.create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx)
|
||||
.await?;
|
||||
let mut walingest = init_walingest_test(&tline, &ctx).await?;
|
||||
|
||||
// Create a 20 MB relation (the size is arbitrary)
|
||||
@@ -1509,7 +1517,7 @@ mod tests {
|
||||
.put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data), &ctx)
|
||||
.await?;
|
||||
}
|
||||
m.commit()?;
|
||||
m.commit().await?;
|
||||
|
||||
// The relation was created at LSN 20, not visible at LSN 1 yet.
|
||||
assert_eq!(
|
||||
@@ -1554,7 +1562,7 @@ mod tests {
|
||||
walingest
|
||||
.put_rel_truncation(&mut m, TESTREL_A, 1, &ctx)
|
||||
.await?;
|
||||
m.commit()?;
|
||||
m.commit().await?;
|
||||
|
||||
// Check reported size and contents after truncation
|
||||
assert_eq!(
|
||||
@@ -1603,7 +1611,7 @@ mod tests {
|
||||
.put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data), &ctx)
|
||||
.await?;
|
||||
}
|
||||
m.commit()?;
|
||||
m.commit().await?;
|
||||
|
||||
assert_eq!(
|
||||
tline
|
||||
@@ -1637,7 +1645,9 @@ mod tests {
|
||||
#[tokio::test]
|
||||
async fn test_large_rel() -> Result<()> {
|
||||
let (tenant, ctx) = TenantHarness::create("test_large_rel")?.load().await;
|
||||
let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION, &ctx)?;
|
||||
let tline = tenant
|
||||
.create_test_timeline(TIMELINE_ID, Lsn(8), DEFAULT_PG_VERSION, &ctx)
|
||||
.await?;
|
||||
let mut walingest = init_walingest_test(&tline, &ctx).await?;
|
||||
|
||||
let mut lsn = 0x10;
|
||||
@@ -1648,7 +1658,7 @@ mod tests {
|
||||
walingest
|
||||
.put_rel_page_image(&mut m, TESTREL_A, blknum as BlockNumber, img, &ctx)
|
||||
.await?;
|
||||
m.commit()?;
|
||||
m.commit().await?;
|
||||
}
|
||||
|
||||
assert_current_logical_size(&tline, Lsn(lsn));
|
||||
@@ -1664,7 +1674,7 @@ mod tests {
|
||||
walingest
|
||||
.put_rel_truncation(&mut m, TESTREL_A, RELSEG_SIZE, &ctx)
|
||||
.await?;
|
||||
m.commit()?;
|
||||
m.commit().await?;
|
||||
assert_eq!(
|
||||
tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?,
|
||||
RELSEG_SIZE
|
||||
@@ -1677,7 +1687,7 @@ mod tests {
|
||||
walingest
|
||||
.put_rel_truncation(&mut m, TESTREL_A, RELSEG_SIZE - 1, &ctx)
|
||||
.await?;
|
||||
m.commit()?;
|
||||
m.commit().await?;
|
||||
assert_eq!(
|
||||
tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?,
|
||||
RELSEG_SIZE - 1
|
||||
@@ -1693,7 +1703,7 @@ mod tests {
|
||||
walingest
|
||||
.put_rel_truncation(&mut m, TESTREL_A, size as BlockNumber, &ctx)
|
||||
.await?;
|
||||
m.commit()?;
|
||||
m.commit().await?;
|
||||
assert_eq!(
|
||||
tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?,
|
||||
size as BlockNumber
|
||||
|
||||
@@ -4,6 +4,7 @@
|
||||
MODULE_big = neon
|
||||
OBJS = \
|
||||
$(WIN32RES) \
|
||||
extension_server.o \
|
||||
file_cache.o \
|
||||
libpagestore.o \
|
||||
libpqwalproposer.o \
|
||||
|
||||
103
pgxn/neon/extension_server.c
Normal file
103
pgxn/neon/extension_server.c
Normal file
@@ -0,0 +1,103 @@
|
||||
|
||||
/*-------------------------------------------------------------------------
|
||||
*
|
||||
* extension_server.c
|
||||
* Request compute_ctl to download extension files.
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* contrib/neon/extension_server.c
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
#include "postgres.h"
|
||||
#include "tcop/pquery.h"
|
||||
#include "tcop/utility.h"
|
||||
#include "access/xact.h"
|
||||
#include "utils/hsearch.h"
|
||||
#include "utils/memutils.h"
|
||||
#include "commands/defrem.h"
|
||||
#include "miscadmin.h"
|
||||
#include "utils/acl.h"
|
||||
#include "fmgr.h"
|
||||
#include "utils/guc.h"
|
||||
#include "port.h"
|
||||
#include "fmgr.h"
|
||||
|
||||
#include <curl/curl.h>
|
||||
|
||||
static int extension_server_port = 0;
|
||||
|
||||
static download_extension_file_hook_type prev_download_extension_file_hook = NULL;
|
||||
|
||||
// to download all SQL files for an extension:
|
||||
// curl -X POST http://localhost:8080/extension_server/postgis
|
||||
//
|
||||
// to download specific library file:
|
||||
// curl -X POST http://localhost:8080/extension_server/postgis-3.so?=true
|
||||
static bool
|
||||
neon_download_extension_file_http(const char *filename, bool is_library)
|
||||
{
|
||||
CURL *curl;
|
||||
CURLcode res;
|
||||
char *compute_ctl_url;
|
||||
char *postdata;
|
||||
bool ret = false;
|
||||
|
||||
if ((curl = curl_easy_init()) == NULL)
|
||||
{
|
||||
elog(ERROR, "Failed to initialize curl handle");
|
||||
}
|
||||
|
||||
|
||||
if (is_library)
|
||||
{
|
||||
elog(LOG, "request library");
|
||||
}
|
||||
|
||||
compute_ctl_url = psprintf("http://localhost:%d/extension_server/%s%s",
|
||||
extension_server_port, filename, is_library?"?is_library=true":"");
|
||||
|
||||
elog(LOG, "curl_easy_perform() url: %s", compute_ctl_url);
|
||||
|
||||
curl_easy_setopt(curl, CURLOPT_CUSTOMREQUEST, "POST");
|
||||
curl_easy_setopt(curl, CURLOPT_URL, compute_ctl_url);
|
||||
curl_easy_setopt(curl, CURLOPT_TIMEOUT, 3L /* seconds */);
|
||||
|
||||
|
||||
if (curl)
|
||||
{
|
||||
/* Perform the request, res will get the return code */
|
||||
res = curl_easy_perform(curl);
|
||||
/* Check for errors */
|
||||
if (res == CURLE_OK)
|
||||
{
|
||||
elog(LOG, "curl_easy_perform() succeeded");
|
||||
ret = true;
|
||||
}
|
||||
else
|
||||
{
|
||||
elog(WARNING, "curl_easy_perform() failed: %s\n", curl_easy_strerror(res));
|
||||
}
|
||||
|
||||
/* always cleanup */
|
||||
curl_easy_cleanup(curl);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
void pg_init_extension_server()
|
||||
{
|
||||
DefineCustomIntVariable("neon.extension_server_port",
|
||||
"connection string to the compute_ctl",
|
||||
NULL,
|
||||
&extension_server_port,
|
||||
0, 0, INT_MAX,
|
||||
PGC_POSTMASTER,
|
||||
0, /* no flags required */
|
||||
NULL, NULL, NULL);
|
||||
|
||||
// set download_extension_file_hook
|
||||
prev_download_extension_file_hook = download_extension_file_hook;
|
||||
download_extension_file_hook = neon_download_extension_file_http;
|
||||
}
|
||||
1
pgxn/neon/extension_server.h
Normal file
1
pgxn/neon/extension_server.h
Normal file
@@ -0,0 +1 @@
|
||||
|
||||
@@ -35,8 +35,11 @@ _PG_init(void)
|
||||
{
|
||||
pg_init_libpagestore();
|
||||
pg_init_walproposer();
|
||||
|
||||
InitControlPlaneConnector();
|
||||
|
||||
pg_init_extension_server();
|
||||
|
||||
// Important: This must happen after other parts of the extension
|
||||
// are loaded, otherwise any settings to GUCs that were set before
|
||||
// the extension was loaded will be removed.
|
||||
|
||||
@@ -21,6 +21,8 @@ extern char *neon_tenant;
|
||||
extern void pg_init_libpagestore(void);
|
||||
extern void pg_init_walproposer(void);
|
||||
|
||||
extern void pg_init_extension_server(void);
|
||||
|
||||
/*
|
||||
* Returns true if we shouldn't do REDO on that block in record indicated by
|
||||
* block_id; false otherwise.
|
||||
|
||||
@@ -257,7 +257,7 @@ nwp_register_gucs(void)
|
||||
"Walproposer reconnects to offline safekeepers once in this interval.",
|
||||
NULL,
|
||||
&wal_acceptor_reconnect_timeout,
|
||||
5000, 0, INT_MAX, /* default, min, max */
|
||||
1000, 0, INT_MAX, /* default, min, max */
|
||||
PGC_SIGHUP, /* context */
|
||||
GUC_UNIT_MS, /* flags */
|
||||
NULL, NULL, NULL);
|
||||
|
||||
56
poetry.lock
generated
56
poetry.lock
generated
@@ -1,4 +1,4 @@
|
||||
# This file is automatically @generated by Poetry 1.4.2 and should not be changed by hand.
|
||||
# This file is automatically @generated by Poetry and should not be changed by hand.
|
||||
|
||||
[[package]]
|
||||
name = "aiohttp"
|
||||
@@ -855,35 +855,31 @@ files = [
|
||||
|
||||
[[package]]
|
||||
name = "cryptography"
|
||||
version = "39.0.1"
|
||||
version = "41.0.0"
|
||||
description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers."
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=3.6"
|
||||
python-versions = ">=3.7"
|
||||
files = [
|
||||
{file = "cryptography-39.0.1-cp36-abi3-macosx_10_12_universal2.whl", hash = "sha256:6687ef6d0a6497e2b58e7c5b852b53f62142cfa7cd1555795758934da363a965"},
|
||||
{file = "cryptography-39.0.1-cp36-abi3-macosx_10_12_x86_64.whl", hash = "sha256:706843b48f9a3f9b9911979761c91541e3d90db1ca905fd63fee540a217698bc"},
|
||||
{file = "cryptography-39.0.1-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:5d2d8b87a490bfcd407ed9d49093793d0f75198a35e6eb1a923ce1ee86c62b41"},
|
||||
{file = "cryptography-39.0.1-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:83e17b26de248c33f3acffb922748151d71827d6021d98c70e6c1a25ddd78505"},
|
||||
{file = "cryptography-39.0.1-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e124352fd3db36a9d4a21c1aa27fd5d051e621845cb87fb851c08f4f75ce8be6"},
|
||||
{file = "cryptography-39.0.1-cp36-abi3-manylinux_2_24_x86_64.whl", hash = "sha256:5aa67414fcdfa22cf052e640cb5ddc461924a045cacf325cd164e65312d99502"},
|
||||
{file = "cryptography-39.0.1-cp36-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:35f7c7d015d474f4011e859e93e789c87d21f6f4880ebdc29896a60403328f1f"},
|
||||
{file = "cryptography-39.0.1-cp36-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:f24077a3b5298a5a06a8e0536e3ea9ec60e4c7ac486755e5fb6e6ea9b3500106"},
|
||||
{file = "cryptography-39.0.1-cp36-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:f0c64d1bd842ca2633e74a1a28033d139368ad959872533b1bab8c80e8240a0c"},
|
||||
{file = "cryptography-39.0.1-cp36-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:0f8da300b5c8af9f98111ffd512910bc792b4c77392a9523624680f7956a99d4"},
|
||||
{file = "cryptography-39.0.1-cp36-abi3-win32.whl", hash = "sha256:fe913f20024eb2cb2f323e42a64bdf2911bb9738a15dba7d3cce48151034e3a8"},
|
||||
{file = "cryptography-39.0.1-cp36-abi3-win_amd64.whl", hash = "sha256:ced4e447ae29ca194449a3f1ce132ded8fcab06971ef5f618605aacaa612beac"},
|
||||
{file = "cryptography-39.0.1-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:807ce09d4434881ca3a7594733669bd834f5b2c6d5c7e36f8c00f691887042ad"},
|
||||
{file = "cryptography-39.0.1-pp38-pypy38_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:c5caeb8188c24888c90b5108a441c106f7faa4c4c075a2bcae438c6e8ca73cef"},
|
||||
{file = "cryptography-39.0.1-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:4789d1e3e257965e960232345002262ede4d094d1a19f4d3b52e48d4d8f3b885"},
|
||||
{file = "cryptography-39.0.1-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:96f1157a7c08b5b189b16b47bc9db2332269d6680a196341bf30046330d15388"},
|
||||
{file = "cryptography-39.0.1-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:e422abdec8b5fa8462aa016786680720d78bdce7a30c652b7fadf83a4ba35336"},
|
||||
{file = "cryptography-39.0.1-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:b0afd054cd42f3d213bf82c629efb1ee5f22eba35bf0eec88ea9ea7304f511a2"},
|
||||
{file = "cryptography-39.0.1-pp39-pypy39_pp73-manylinux_2_24_x86_64.whl", hash = "sha256:6f8ba7f0328b79f08bdacc3e4e66fb4d7aab0c3584e0bd41328dce5262e26b2e"},
|
||||
{file = "cryptography-39.0.1-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:ef8b72fa70b348724ff1218267e7f7375b8de4e8194d1636ee60510aae104cd0"},
|
||||
{file = "cryptography-39.0.1-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:aec5a6c9864be7df2240c382740fcf3b96928c46604eaa7f3091f58b878c0bb6"},
|
||||
{file = "cryptography-39.0.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:fdd188c8a6ef8769f148f88f859884507b954cc64db6b52f66ef199bb9ad660a"},
|
||||
{file = "cryptography-39.0.1.tar.gz", hash = "sha256:d1f6198ee6d9148405e49887803907fe8962a23e6c6f83ea7d98f1c0de375695"},
|
||||
{file = "cryptography-41.0.0-cp37-abi3-macosx_10_12_universal2.whl", hash = "sha256:3c5ef25d060c80d6d9f7f9892e1d41bb1c79b78ce74805b8cb4aa373cb7d5ec8"},
|
||||
{file = "cryptography-41.0.0-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:8362565b3835ceacf4dc8f3b56471a2289cf51ac80946f9087e66dc283a810e0"},
|
||||
{file = "cryptography-41.0.0-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3680248309d340fda9611498a5319b0193a8dbdb73586a1acf8109d06f25b92d"},
|
||||
{file = "cryptography-41.0.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:84a165379cb9d411d58ed739e4af3396e544eac190805a54ba2e0322feb55c46"},
|
||||
{file = "cryptography-41.0.0-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:4ab14d567f7bbe7f1cdff1c53d5324ed4d3fc8bd17c481b395db224fb405c237"},
|
||||
{file = "cryptography-41.0.0-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:9f65e842cb02550fac96536edb1d17f24c0a338fd84eaf582be25926e993dde4"},
|
||||
{file = "cryptography-41.0.0-cp37-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:b7f2f5c525a642cecad24ee8670443ba27ac1fab81bba4cc24c7b6b41f2d0c75"},
|
||||
{file = "cryptography-41.0.0-cp37-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:7d92f0248d38faa411d17f4107fc0bce0c42cae0b0ba5415505df72d751bf62d"},
|
||||
{file = "cryptography-41.0.0-cp37-abi3-win32.whl", hash = "sha256:34d405ea69a8b34566ba3dfb0521379b210ea5d560fafedf9f800a9a94a41928"},
|
||||
{file = "cryptography-41.0.0-cp37-abi3-win_amd64.whl", hash = "sha256:344c6de9f8bda3c425b3a41b319522ba3208551b70c2ae00099c205f0d9fd3be"},
|
||||
{file = "cryptography-41.0.0-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:88ff107f211ea696455ea8d911389f6d2b276aabf3231bf72c8853d22db755c5"},
|
||||
{file = "cryptography-41.0.0-pp38-pypy38_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:b846d59a8d5a9ba87e2c3d757ca019fa576793e8758174d3868aecb88d6fc8eb"},
|
||||
{file = "cryptography-41.0.0-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:f5d0bf9b252f30a31664b6f64432b4730bb7038339bd18b1fafe129cfc2be9be"},
|
||||
{file = "cryptography-41.0.0-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:5c1f7293c31ebc72163a9a0df246f890d65f66b4a40d9ec80081969ba8c78cc9"},
|
||||
{file = "cryptography-41.0.0-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:bf8fc66012ca857d62f6a347007e166ed59c0bc150cefa49f28376ebe7d992a2"},
|
||||
{file = "cryptography-41.0.0-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:a4fc68d1c5b951cfb72dfd54702afdbbf0fb7acdc9b7dc4301bbf2225a27714d"},
|
||||
{file = "cryptography-41.0.0-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:14754bcdae909d66ff24b7b5f166d69340ccc6cb15731670435efd5719294895"},
|
||||
{file = "cryptography-41.0.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:0ddaee209d1cf1f180f1efa338a68c4621154de0afaef92b89486f5f96047c55"},
|
||||
{file = "cryptography-41.0.0.tar.gz", hash = "sha256:6b71f64beeea341c9b4f963b48ee3b62d62d57ba93eb120e1196b31dc1025e78"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
@@ -892,12 +888,12 @@ cffi = ">=1.12"
|
||||
[package.extras]
|
||||
docs = ["sphinx (>=5.3.0)", "sphinx-rtd-theme (>=1.1.1)"]
|
||||
docstest = ["pyenchant (>=1.6.11)", "sphinxcontrib-spelling (>=4.0.1)", "twine (>=1.12.0)"]
|
||||
pep8test = ["black", "check-manifest", "mypy", "ruff", "types-pytz", "types-requests"]
|
||||
sdist = ["setuptools-rust (>=0.11.4)"]
|
||||
nox = ["nox"]
|
||||
pep8test = ["black", "check-sdist", "mypy", "ruff"]
|
||||
sdist = ["build"]
|
||||
ssh = ["bcrypt (>=3.1.5)"]
|
||||
test = ["hypothesis (>=1.11.4,!=3.79.2)", "iso8601", "pretend", "pytest (>=6.2.0)", "pytest-benchmark", "pytest-cov", "pytest-shard (>=0.1.2)", "pytest-subtests", "pytest-xdist", "pytz"]
|
||||
test = ["pretend", "pytest (>=6.2.0)", "pytest-benchmark", "pytest-cov", "pytest-xdist"]
|
||||
test-randomorder = ["pytest-randomly"]
|
||||
tox = ["tox"]
|
||||
|
||||
[[package]]
|
||||
name = "docker"
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
use futures::pin_mut;
|
||||
use futures::StreamExt;
|
||||
use futures::TryFutureExt;
|
||||
use hyper::body::HttpBody;
|
||||
use hyper::http::HeaderName;
|
||||
use hyper::http::HeaderValue;
|
||||
@@ -11,8 +12,13 @@ use serde_json::Value;
|
||||
use tokio_postgres::types::Kind;
|
||||
use tokio_postgres::types::Type;
|
||||
use tokio_postgres::Row;
|
||||
use tracing::error;
|
||||
use tracing::info;
|
||||
use tracing::instrument;
|
||||
use url::Url;
|
||||
|
||||
use crate::proxy::invalidate_cache;
|
||||
use crate::proxy::NUM_RETRIES_WAKE_COMPUTE;
|
||||
use crate::{auth, config::ProxyConfig, console};
|
||||
|
||||
#[derive(serde::Deserialize)]
|
||||
@@ -90,10 +96,17 @@ fn json_array_to_pg_array(value: &Value) -> Result<Option<String>, serde_json::E
|
||||
}
|
||||
}
|
||||
|
||||
struct ConnInfo {
|
||||
username: String,
|
||||
dbname: String,
|
||||
hostname: String,
|
||||
password: String,
|
||||
}
|
||||
|
||||
fn get_conn_info(
|
||||
headers: &HeaderMap,
|
||||
sni_hostname: Option<String>,
|
||||
) -> Result<(String, String, String, String), anyhow::Error> {
|
||||
) -> Result<ConnInfo, anyhow::Error> {
|
||||
let connection_string = headers
|
||||
.get("Neon-Connection-String")
|
||||
.ok_or(anyhow::anyhow!("missing connection string"))?
|
||||
@@ -146,12 +159,12 @@ fn get_conn_info(
|
||||
}
|
||||
}
|
||||
|
||||
Ok((
|
||||
username.to_owned(),
|
||||
dbname.to_owned(),
|
||||
hostname.to_owned(),
|
||||
password.to_owned(),
|
||||
))
|
||||
Ok(ConnInfo {
|
||||
username: username.to_owned(),
|
||||
dbname: dbname.to_owned(),
|
||||
hostname: hostname.to_owned(),
|
||||
password: password.to_owned(),
|
||||
})
|
||||
}
|
||||
|
||||
// TODO: return different http error codes
|
||||
@@ -164,10 +177,10 @@ pub async fn handle(
|
||||
// Determine the destination and connection params
|
||||
//
|
||||
let headers = request.headers();
|
||||
let (username, dbname, hostname, password) = get_conn_info(headers, sni_hostname)?;
|
||||
let conn_info = get_conn_info(headers, sni_hostname)?;
|
||||
let credential_params = StartupMessageParams::new([
|
||||
("user", &username),
|
||||
("database", &dbname),
|
||||
("user", &conn_info.username),
|
||||
("database", &conn_info.dbname),
|
||||
("application_name", APP_NAME),
|
||||
]);
|
||||
|
||||
@@ -186,21 +199,20 @@ pub async fn handle(
|
||||
let creds = config
|
||||
.auth_backend
|
||||
.as_ref()
|
||||
.map(|_| auth::ClientCredentials::parse(&credential_params, Some(&hostname), common_names))
|
||||
.map(|_| {
|
||||
auth::ClientCredentials::parse(
|
||||
&credential_params,
|
||||
Some(&conn_info.hostname),
|
||||
common_names,
|
||||
)
|
||||
})
|
||||
.transpose()?;
|
||||
let extra = console::ConsoleReqExtra {
|
||||
session_id: uuid::Uuid::new_v4(),
|
||||
application_name: Some(APP_NAME),
|
||||
};
|
||||
let node = creds.wake_compute(&extra).await?.expect("msg");
|
||||
let conf = node.value.config;
|
||||
let port = *conf.get_ports().first().expect("no port");
|
||||
let host = match conf.get_hosts().first().expect("no host") {
|
||||
tokio_postgres::config::Host::Tcp(host) => host,
|
||||
tokio_postgres::config::Host::Unix(_) => {
|
||||
return Err(anyhow::anyhow!("unix socket is not supported"));
|
||||
}
|
||||
};
|
||||
|
||||
let mut node_info = creds.wake_compute(&extra).await?.expect("msg");
|
||||
|
||||
let request_content_length = match request.body().size_hint().upper() {
|
||||
Some(v) => v,
|
||||
@@ -220,28 +232,10 @@ pub async fn handle(
|
||||
let QueryData { query, params } = serde_json::from_slice(&body)?;
|
||||
let query_params = json_to_pg_text(params)?;
|
||||
|
||||
//
|
||||
// Connenct to the destination
|
||||
//
|
||||
let (client, connection) = tokio_postgres::Config::new()
|
||||
.host(host)
|
||||
.port(port)
|
||||
.user(&username)
|
||||
.password(&password)
|
||||
.dbname(&dbname)
|
||||
.max_backend_message_size(MAX_RESPONSE_SIZE)
|
||||
.connect(tokio_postgres::NoTls)
|
||||
.await?;
|
||||
|
||||
tokio::spawn(async move {
|
||||
if let Err(e) = connection.await {
|
||||
eprintln!("connection error: {}", e);
|
||||
}
|
||||
});
|
||||
|
||||
//
|
||||
// Now execute the query and return the result
|
||||
//
|
||||
let client = connect_to_compute(&mut node_info, &extra, &creds, &conn_info).await?;
|
||||
let row_stream = client.query_raw_txt(query, query_params).await?;
|
||||
|
||||
// Manually drain the stream into a vector to leave row_stream hanging
|
||||
@@ -280,6 +274,11 @@ pub async fn handle(
|
||||
json!({
|
||||
"name": Value::String(c.name().to_owned()),
|
||||
"dataTypeID": Value::Number(c.type_().oid().into()),
|
||||
"tableID": c.table_oid(),
|
||||
"columnID": c.column_id(),
|
||||
"dataTypeSize": c.type_size(),
|
||||
"dataTypeModifier": c.type_modifier(),
|
||||
"format": "text",
|
||||
})
|
||||
})
|
||||
.collect::<Vec<_>>()
|
||||
@@ -303,6 +302,70 @@ pub async fn handle(
|
||||
}))
|
||||
}
|
||||
|
||||
/// This function is a copy of `connect_to_compute` from `src/proxy.rs` with
|
||||
/// the difference that it uses `tokio_postgres` for the connection.
|
||||
#[instrument(skip_all)]
|
||||
async fn connect_to_compute(
|
||||
node_info: &mut console::CachedNodeInfo,
|
||||
extra: &console::ConsoleReqExtra<'_>,
|
||||
creds: &auth::BackendType<'_, auth::ClientCredentials<'_>>,
|
||||
conn_info: &ConnInfo,
|
||||
) -> anyhow::Result<tokio_postgres::Client> {
|
||||
let mut num_retries: usize = NUM_RETRIES_WAKE_COMPUTE;
|
||||
|
||||
loop {
|
||||
match connect_to_compute_once(node_info, conn_info).await {
|
||||
Err(e) if num_retries > 0 => {
|
||||
info!("compute node's state has changed; requesting a wake-up");
|
||||
match creds.wake_compute(extra).await? {
|
||||
// Update `node_info` and try one more time.
|
||||
Some(new) => {
|
||||
*node_info = new;
|
||||
}
|
||||
// Link auth doesn't work that way, so we just exit.
|
||||
None => return Err(e),
|
||||
}
|
||||
}
|
||||
other => return other,
|
||||
}
|
||||
|
||||
num_retries -= 1;
|
||||
info!("retrying after wake-up ({num_retries} attempts left)");
|
||||
}
|
||||
}
|
||||
|
||||
async fn connect_to_compute_once(
|
||||
node_info: &console::CachedNodeInfo,
|
||||
conn_info: &ConnInfo,
|
||||
) -> anyhow::Result<tokio_postgres::Client> {
|
||||
let mut config = (*node_info.config).clone();
|
||||
|
||||
let (client, connection) = config
|
||||
.user(&conn_info.username)
|
||||
.password(&conn_info.password)
|
||||
.dbname(&conn_info.dbname)
|
||||
.max_backend_message_size(MAX_RESPONSE_SIZE)
|
||||
.connect(tokio_postgres::NoTls)
|
||||
.inspect_err(|e: &tokio_postgres::Error| {
|
||||
error!(
|
||||
"failed to connect to compute node hosts={:?} ports={:?}: {}",
|
||||
node_info.config.get_hosts(),
|
||||
node_info.config.get_ports(),
|
||||
e
|
||||
);
|
||||
invalidate_cache(node_info)
|
||||
})
|
||||
.await?;
|
||||
|
||||
tokio::spawn(async move {
|
||||
if let Err(e) = connection.await {
|
||||
error!("connection error: {}", e);
|
||||
}
|
||||
});
|
||||
|
||||
Ok(client)
|
||||
}
|
||||
|
||||
//
|
||||
// Convert postgres row with text-encoded values to JSON object
|
||||
//
|
||||
|
||||
@@ -26,7 +26,6 @@ use tls_listener::TlsListener;
|
||||
use tokio::{
|
||||
io::{self, AsyncBufRead, AsyncRead, AsyncWrite, ReadBuf},
|
||||
net::TcpListener,
|
||||
select,
|
||||
};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{error, info, info_span, warn, Instrument};
|
||||
@@ -193,14 +192,9 @@ async fn ws_handler(
|
||||
// TODO: that deserves a refactor as now this function also handles http json client besides websockets.
|
||||
// Right now I don't want to blow up sql-over-http patch with file renames and do that as a follow up instead.
|
||||
} else if request.uri().path() == "/sql" && request.method() == Method::POST {
|
||||
let result = select! {
|
||||
_ = tokio::time::sleep(std::time::Duration::from_secs(10)) => {
|
||||
Err(anyhow::anyhow!("Query timed out"))
|
||||
}
|
||||
response = sql_over_http::handle(config, request, sni_hostname) => {
|
||||
response
|
||||
}
|
||||
};
|
||||
let result = sql_over_http::handle(config, request, sni_hostname)
|
||||
.instrument(info_span!("sql-over-http"))
|
||||
.await;
|
||||
let status_code = match result {
|
||||
Ok(_) => StatusCode::OK,
|
||||
Err(_) => StatusCode::BAD_REQUEST,
|
||||
|
||||
@@ -22,7 +22,7 @@ use tracing::{error, info, warn};
|
||||
use utils::measured_stream::MeasuredStream;
|
||||
|
||||
/// Number of times we should retry the `/proxy_wake_compute` http request.
|
||||
const NUM_RETRIES_WAKE_COMPUTE: usize = 1;
|
||||
pub const NUM_RETRIES_WAKE_COMPUTE: usize = 1;
|
||||
|
||||
const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)";
|
||||
const ERR_PROTO_VIOLATION: &str = "protocol violation";
|
||||
@@ -283,34 +283,35 @@ async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
|
||||
}
|
||||
}
|
||||
|
||||
/// If we couldn't connect, a cached connection info might be to blame
|
||||
/// (e.g. the compute node's address might've changed at the wrong time).
|
||||
/// Invalidate the cache entry (if any) to prevent subsequent errors.
|
||||
#[tracing::instrument(name = "invalidate_cache", skip_all)]
|
||||
pub fn invalidate_cache(node_info: &console::CachedNodeInfo) {
|
||||
let is_cached = node_info.cached();
|
||||
if is_cached {
|
||||
warn!("invalidating stalled compute node info cache entry");
|
||||
node_info.invalidate();
|
||||
}
|
||||
|
||||
let label = match is_cached {
|
||||
true => "compute_cached",
|
||||
false => "compute_uncached",
|
||||
};
|
||||
NUM_CONNECTION_FAILURES.with_label_values(&[label]).inc();
|
||||
}
|
||||
|
||||
/// Try to connect to the compute node once.
|
||||
#[tracing::instrument(name = "connect_once", skip_all)]
|
||||
async fn connect_to_compute_once(
|
||||
node_info: &console::CachedNodeInfo,
|
||||
) -> Result<PostgresConnection, compute::ConnectionError> {
|
||||
// If we couldn't connect, a cached connection info might be to blame
|
||||
// (e.g. the compute node's address might've changed at the wrong time).
|
||||
// Invalidate the cache entry (if any) to prevent subsequent errors.
|
||||
let invalidate_cache = |_: &compute::ConnectionError| {
|
||||
let is_cached = node_info.cached();
|
||||
if is_cached {
|
||||
warn!("invalidating stalled compute node info cache entry");
|
||||
node_info.invalidate();
|
||||
}
|
||||
|
||||
let label = match is_cached {
|
||||
true => "compute_cached",
|
||||
false => "compute_uncached",
|
||||
};
|
||||
NUM_CONNECTION_FAILURES.with_label_values(&[label]).inc();
|
||||
};
|
||||
|
||||
let allow_self_signed_compute = node_info.allow_self_signed_compute;
|
||||
|
||||
node_info
|
||||
.config
|
||||
.connect(allow_self_signed_compute)
|
||||
.inspect_err(invalidate_cache)
|
||||
.inspect_err(|_: &compute::ConnectionError| invalidate_cache(node_info))
|
||||
.await
|
||||
}
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
[toolchain]
|
||||
channel = "1.68.2"
|
||||
channel = "1.70.0"
|
||||
profile = "default"
|
||||
# The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy.
|
||||
# https://rust-lang.github.io/rustup/concepts/profiles.html
|
||||
|
||||
@@ -3,15 +3,19 @@
|
||||
//
|
||||
use anyhow::{bail, Context, Result};
|
||||
use clap::Parser;
|
||||
use futures::future::BoxFuture;
|
||||
use futures::stream::FuturesUnordered;
|
||||
use futures::{FutureExt, StreamExt};
|
||||
use remote_storage::RemoteStorageConfig;
|
||||
use tokio::runtime::Handle;
|
||||
use tokio::signal::unix::{signal, SignalKind};
|
||||
use tokio::task::JoinError;
|
||||
use toml_edit::Document;
|
||||
use utils::signals::ShutdownSignals;
|
||||
|
||||
use std::fs::{self, File};
|
||||
use std::io::{ErrorKind, Write};
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::Arc;
|
||||
use std::thread;
|
||||
use std::time::Duration;
|
||||
use storage_broker::Uri;
|
||||
use tokio::sync::mpsc;
|
||||
@@ -20,22 +24,21 @@ use tracing::*;
|
||||
use utils::pid_file;
|
||||
|
||||
use metrics::set_build_info_metric;
|
||||
use safekeeper::broker;
|
||||
use safekeeper::control_file;
|
||||
use safekeeper::defaults::{
|
||||
DEFAULT_HEARTBEAT_TIMEOUT, DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_MAX_OFFLOADER_LAG_BYTES,
|
||||
DEFAULT_PG_LISTEN_ADDR,
|
||||
};
|
||||
use safekeeper::http;
|
||||
use safekeeper::remove_wal;
|
||||
use safekeeper::wal_backup;
|
||||
use safekeeper::wal_service;
|
||||
use safekeeper::GlobalTimelines;
|
||||
use safekeeper::SafeKeeperConf;
|
||||
use safekeeper::{broker, WAL_SERVICE_RUNTIME};
|
||||
use safekeeper::{control_file, BROKER_RUNTIME};
|
||||
use safekeeper::{http, WAL_REMOVER_RUNTIME};
|
||||
use safekeeper::{remove_wal, WAL_BACKUP_RUNTIME};
|
||||
use safekeeper::{wal_backup, HTTP_RUNTIME};
|
||||
use storage_broker::DEFAULT_ENDPOINT;
|
||||
use utils::auth::JwtAuth;
|
||||
use utils::{
|
||||
http::endpoint,
|
||||
id::NodeId,
|
||||
logging::{self, LogFormat},
|
||||
project_git_version,
|
||||
@@ -104,10 +107,6 @@ struct Args {
|
||||
/// Safekeeper won't be elected for WAL offloading if it is lagging for more than this value in bytes
|
||||
#[arg(long, default_value_t = DEFAULT_MAX_OFFLOADER_LAG_BYTES)]
|
||||
max_offloader_lag: u64,
|
||||
/// Number of threads for wal backup runtime, by default number of cores
|
||||
/// available to the system.
|
||||
#[arg(long)]
|
||||
wal_backup_threads: Option<usize>,
|
||||
/// Number of max parallel WAL segments to be offloaded to remote storage.
|
||||
#[arg(long, default_value = "5")]
|
||||
wal_backup_parallel_jobs: usize,
|
||||
@@ -121,9 +120,14 @@ struct Args {
|
||||
/// Format for logging, either 'plain' or 'json'.
|
||||
#[arg(long, default_value = "plain")]
|
||||
log_format: String,
|
||||
/// Run everything in single threaded current thread runtime, might be
|
||||
/// useful for debugging.
|
||||
#[arg(long)]
|
||||
current_thread_runtime: bool,
|
||||
}
|
||||
|
||||
fn main() -> anyhow::Result<()> {
|
||||
#[tokio::main(flavor = "current_thread")]
|
||||
async fn main() -> anyhow::Result<()> {
|
||||
let args = Args::parse();
|
||||
|
||||
if let Some(addr) = args.dump_control_file {
|
||||
@@ -183,10 +187,10 @@ fn main() -> anyhow::Result<()> {
|
||||
heartbeat_timeout: args.heartbeat_timeout,
|
||||
remote_storage: args.remote_storage,
|
||||
max_offloader_lag_bytes: args.max_offloader_lag,
|
||||
backup_runtime_threads: args.wal_backup_threads,
|
||||
wal_backup_enabled: !args.disable_wal_backup,
|
||||
backup_parallel_jobs: args.wal_backup_parallel_jobs,
|
||||
auth,
|
||||
current_thread_runtime: args.current_thread_runtime,
|
||||
};
|
||||
|
||||
// initialize sentry if SENTRY_DSN is provided
|
||||
@@ -194,10 +198,14 @@ fn main() -> anyhow::Result<()> {
|
||||
Some(GIT_VERSION.into()),
|
||||
&[("node_id", &conf.my_id.to_string())],
|
||||
);
|
||||
start_safekeeper(conf)
|
||||
start_safekeeper(conf).await
|
||||
}
|
||||
|
||||
fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
|
||||
/// Result of joining any of main tasks: upper error means task failed to
|
||||
/// complete, e.g. panicked, inner is error produced by task itself.
|
||||
type JoinTaskRes = Result<anyhow::Result<()>, JoinError>;
|
||||
|
||||
async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
|
||||
// Prevent running multiple safekeepers on the same directory
|
||||
let lock_file_path = conf.workdir.join(PID_FILE_NAME);
|
||||
let lock_file =
|
||||
@@ -208,14 +216,18 @@ fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
|
||||
// we need to release the lock file only when the current process is gone
|
||||
std::mem::forget(lock_file);
|
||||
|
||||
let http_listener = tcp_listener::bind(conf.listen_http_addr.clone()).map_err(|e| {
|
||||
error!("failed to bind to address {}: {}", conf.listen_http_addr, e);
|
||||
info!("starting safekeeper WAL service on {}", conf.listen_pg_addr);
|
||||
let pg_listener = tcp_listener::bind(conf.listen_pg_addr.clone()).map_err(|e| {
|
||||
error!("failed to bind to address {}: {}", conf.listen_pg_addr, e);
|
||||
e
|
||||
})?;
|
||||
|
||||
info!("starting safekeeper on {}", conf.listen_pg_addr);
|
||||
let pg_listener = tcp_listener::bind(conf.listen_pg_addr.clone()).map_err(|e| {
|
||||
error!("failed to bind to address {}: {}", conf.listen_pg_addr, e);
|
||||
info!(
|
||||
"starting safekeeper HTTP service on {}",
|
||||
conf.listen_http_addr
|
||||
);
|
||||
let http_listener = tcp_listener::bind(conf.listen_http_addr.clone()).map_err(|e| {
|
||||
error!("failed to bind to address {}: {}", conf.listen_http_addr, e);
|
||||
e
|
||||
})?;
|
||||
|
||||
@@ -224,71 +236,88 @@ fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
|
||||
let timeline_collector = safekeeper::metrics::TimelineCollector::new();
|
||||
metrics::register_internal(Box::new(timeline_collector))?;
|
||||
|
||||
let mut threads = vec![];
|
||||
let (wal_backup_launcher_tx, wal_backup_launcher_rx) = mpsc::channel(100);
|
||||
|
||||
// Load all timelines from disk to memory.
|
||||
GlobalTimelines::init(conf.clone(), wal_backup_launcher_tx)?;
|
||||
|
||||
let conf_ = conf.clone();
|
||||
threads.push(
|
||||
thread::Builder::new()
|
||||
.name("http_endpoint_thread".into())
|
||||
.spawn(|| {
|
||||
let router = http::make_router(conf_);
|
||||
endpoint::serve_thread_main(
|
||||
router,
|
||||
http_listener,
|
||||
std::future::pending(), // never shut down
|
||||
)
|
||||
.unwrap();
|
||||
})?,
|
||||
);
|
||||
|
||||
let conf_cloned = conf.clone();
|
||||
let safekeeper_thread = thread::Builder::new()
|
||||
.name("WAL service thread".into())
|
||||
.spawn(|| wal_service::thread_main(conf_cloned, pg_listener))
|
||||
.unwrap();
|
||||
|
||||
threads.push(safekeeper_thread);
|
||||
// Keep handles to main tasks to die if any of them disappears.
|
||||
let mut tasks_handles: FuturesUnordered<BoxFuture<(String, JoinTaskRes)>> =
|
||||
FuturesUnordered::new();
|
||||
|
||||
let conf_ = conf.clone();
|
||||
threads.push(
|
||||
thread::Builder::new()
|
||||
.name("broker thread".into())
|
||||
.spawn(|| {
|
||||
broker::thread_main(conf_);
|
||||
})?,
|
||||
);
|
||||
// Run everything in current thread rt, if asked.
|
||||
if conf.current_thread_runtime {
|
||||
info!("running in current thread runtime");
|
||||
}
|
||||
let current_thread_rt = conf
|
||||
.current_thread_runtime
|
||||
.then(|| Handle::try_current().expect("no runtime in main"));
|
||||
let wal_service_handle = current_thread_rt
|
||||
.as_ref()
|
||||
.unwrap_or_else(|| WAL_SERVICE_RUNTIME.handle())
|
||||
.spawn(wal_service::task_main(conf_, pg_listener))
|
||||
// wrap with task name for error reporting
|
||||
.map(|res| ("WAL service main".to_owned(), res));
|
||||
tasks_handles.push(Box::pin(wal_service_handle));
|
||||
|
||||
let conf_ = conf.clone();
|
||||
threads.push(
|
||||
thread::Builder::new()
|
||||
.name("WAL removal thread".into())
|
||||
.spawn(|| {
|
||||
remove_wal::thread_main(conf_);
|
||||
})?,
|
||||
);
|
||||
let http_handle = current_thread_rt
|
||||
.as_ref()
|
||||
.unwrap_or_else(|| HTTP_RUNTIME.handle())
|
||||
.spawn(http::task_main(conf_, http_listener))
|
||||
.map(|res| ("HTTP service main".to_owned(), res));
|
||||
tasks_handles.push(Box::pin(http_handle));
|
||||
|
||||
threads.push(
|
||||
thread::Builder::new()
|
||||
.name("WAL backup launcher thread".into())
|
||||
.spawn(move || {
|
||||
wal_backup::wal_backup_launcher_thread_main(conf, wal_backup_launcher_rx);
|
||||
})?,
|
||||
);
|
||||
let conf_ = conf.clone();
|
||||
let broker_task_handle = current_thread_rt
|
||||
.as_ref()
|
||||
.unwrap_or_else(|| BROKER_RUNTIME.handle())
|
||||
.spawn(broker::task_main(conf_).instrument(info_span!("broker")))
|
||||
.map(|res| ("broker main".to_owned(), res));
|
||||
tasks_handles.push(Box::pin(broker_task_handle));
|
||||
|
||||
let conf_ = conf.clone();
|
||||
let wal_remover_handle = current_thread_rt
|
||||
.as_ref()
|
||||
.unwrap_or_else(|| WAL_REMOVER_RUNTIME.handle())
|
||||
.spawn(remove_wal::task_main(conf_))
|
||||
.map(|res| ("WAL remover".to_owned(), res));
|
||||
tasks_handles.push(Box::pin(wal_remover_handle));
|
||||
|
||||
let conf_ = conf.clone();
|
||||
let wal_backup_handle = current_thread_rt
|
||||
.as_ref()
|
||||
.unwrap_or_else(|| WAL_BACKUP_RUNTIME.handle())
|
||||
.spawn(wal_backup::wal_backup_launcher_task_main(
|
||||
conf_,
|
||||
wal_backup_launcher_rx,
|
||||
))
|
||||
.map(|res| ("WAL backup launcher".to_owned(), res));
|
||||
tasks_handles.push(Box::pin(wal_backup_handle));
|
||||
|
||||
set_build_info_metric(GIT_VERSION);
|
||||
// TODO: put more thoughts into handling of failed threads
|
||||
// We should catch & die if they are in trouble.
|
||||
|
||||
// On any shutdown signal, log receival and exit. Additionally, handling
|
||||
// SIGQUIT prevents coredump.
|
||||
ShutdownSignals::handle(|signal| {
|
||||
info!("received {}, terminating", signal.name());
|
||||
std::process::exit(0);
|
||||
})
|
||||
// TODO: update tokio-stream, convert to real async Stream with
|
||||
// SignalStream, map it to obtain missing signal name, combine streams into
|
||||
// single stream we can easily sit on.
|
||||
let mut sigquit_stream = signal(SignalKind::quit())?;
|
||||
let mut sigint_stream = signal(SignalKind::interrupt())?;
|
||||
let mut sigterm_stream = signal(SignalKind::terminate())?;
|
||||
|
||||
tokio::select! {
|
||||
Some((task_name, res)) = tasks_handles.next()=> {
|
||||
error!("{} task failed: {:?}, exiting", task_name, res);
|
||||
std::process::exit(1);
|
||||
}
|
||||
// On any shutdown signal, log receival and exit. Additionally, handling
|
||||
// SIGQUIT prevents coredump.
|
||||
_ = sigquit_stream.recv() => info!("received SIGQUIT, terminating"),
|
||||
_ = sigint_stream.recv() => info!("received SIGINT, terminating"),
|
||||
_ = sigterm_stream.recv() => info!("received SIGTERM, terminating")
|
||||
|
||||
};
|
||||
std::process::exit(0);
|
||||
}
|
||||
|
||||
/// Determine safekeeper id.
|
||||
|
||||
@@ -8,7 +8,7 @@ use anyhow::Error;
|
||||
use anyhow::Result;
|
||||
|
||||
use storage_broker::parse_proto_ttid;
|
||||
use storage_broker::proto::broker_service_client::BrokerServiceClient;
|
||||
|
||||
use storage_broker::proto::subscribe_safekeeper_info_request::SubscriptionKey as ProtoSubscriptionKey;
|
||||
use storage_broker::proto::SubscribeSafekeeperInfoRequest;
|
||||
use storage_broker::Request;
|
||||
@@ -16,7 +16,7 @@ use storage_broker::Request;
|
||||
use std::time::Duration;
|
||||
use std::time::Instant;
|
||||
use tokio::task::JoinHandle;
|
||||
use tokio::{runtime, time::sleep};
|
||||
use tokio::time::sleep;
|
||||
use tracing::*;
|
||||
|
||||
use crate::metrics::BROKER_ITERATION_TIMELINES;
|
||||
@@ -29,23 +29,10 @@ use crate::SafeKeeperConf;
|
||||
const RETRY_INTERVAL_MSEC: u64 = 1000;
|
||||
const PUSH_INTERVAL_MSEC: u64 = 1000;
|
||||
|
||||
pub fn thread_main(conf: SafeKeeperConf) {
|
||||
let runtime = runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
.build()
|
||||
.unwrap();
|
||||
|
||||
let _enter = info_span!("broker").entered();
|
||||
info!("started, broker endpoint {:?}", conf.broker_endpoint);
|
||||
|
||||
runtime.block_on(async {
|
||||
main_loop(conf).await;
|
||||
});
|
||||
}
|
||||
|
||||
/// Push once in a while data about all active timelines to the broker.
|
||||
async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> {
|
||||
let mut client = BrokerServiceClient::connect(conf.broker_endpoint.clone()).await?;
|
||||
let mut client =
|
||||
storage_broker::connect(conf.broker_endpoint.clone(), conf.broker_keepalive_interval)?;
|
||||
let push_interval = Duration::from_millis(PUSH_INTERVAL_MSEC);
|
||||
|
||||
let outbound = async_stream::stream! {
|
||||
@@ -55,20 +42,27 @@ async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> {
|
||||
// sensitive and there is no risk of deadlock as we don't await while
|
||||
// lock is held.
|
||||
let now = Instant::now();
|
||||
let mut active_tlis = GlobalTimelines::get_all();
|
||||
active_tlis.retain(|tli| tli.is_active());
|
||||
for tli in &active_tlis {
|
||||
let sk_info = tli.get_safekeeper_info(&conf);
|
||||
let all_tlis = GlobalTimelines::get_all();
|
||||
let mut n_pushed_tlis = 0;
|
||||
for tli in &all_tlis {
|
||||
// filtering alternative futures::stream::iter(all_tlis)
|
||||
// .filter(|tli| {let tli = tli.clone(); async move { tli.is_active().await}}).collect::<Vec<_>>().await;
|
||||
// doesn't look better, and I'm not sure how to do that without collect.
|
||||
if !tli.is_active().await {
|
||||
continue;
|
||||
}
|
||||
let sk_info = tli.get_safekeeper_info(&conf).await;
|
||||
yield sk_info;
|
||||
BROKER_PUSHED_UPDATES.inc();
|
||||
n_pushed_tlis += 1;
|
||||
}
|
||||
let elapsed = now.elapsed();
|
||||
|
||||
BROKER_PUSH_ALL_UPDATES_SECONDS.observe(elapsed.as_secs_f64());
|
||||
BROKER_ITERATION_TIMELINES.observe(active_tlis.len() as f64);
|
||||
BROKER_ITERATION_TIMELINES.observe(n_pushed_tlis as f64);
|
||||
|
||||
if elapsed > push_interval / 2 {
|
||||
info!("broker push is too long, pushed {} timeline updates to broker in {:?}", active_tlis.len(), elapsed);
|
||||
info!("broker push is too long, pushed {} timeline updates to broker in {:?}", n_pushed_tlis, elapsed);
|
||||
}
|
||||
|
||||
sleep(push_interval).await;
|
||||
@@ -125,10 +119,13 @@ async fn pull_loop(conf: SafeKeeperConf) -> Result<()> {
|
||||
bail!("end of stream");
|
||||
}
|
||||
|
||||
async fn main_loop(conf: SafeKeeperConf) {
|
||||
pub async fn task_main(conf: SafeKeeperConf) -> anyhow::Result<()> {
|
||||
info!("started, broker endpoint {:?}", conf.broker_endpoint);
|
||||
|
||||
let mut ticker = tokio::time::interval(Duration::from_millis(RETRY_INTERVAL_MSEC));
|
||||
let mut push_handle: Option<JoinHandle<Result<(), Error>>> = None;
|
||||
let mut pull_handle: Option<JoinHandle<Result<(), Error>>> = None;
|
||||
|
||||
// Selecting on JoinHandles requires some squats; is there a better way to
|
||||
// reap tasks individually?
|
||||
|
||||
|
||||
@@ -2,9 +2,10 @@
|
||||
|
||||
use anyhow::{bail, ensure, Context, Result};
|
||||
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
|
||||
use tokio::fs::{self, File};
|
||||
use tokio::io::AsyncWriteExt;
|
||||
|
||||
use std::fs::{self, File, OpenOptions};
|
||||
use std::io::{Read, Write};
|
||||
use std::io::Read;
|
||||
use std::ops::Deref;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::time::Instant;
|
||||
@@ -26,9 +27,10 @@ pub const CHECKSUM_SIZE: usize = std::mem::size_of::<u32>();
|
||||
|
||||
/// Storage should keep actual state inside of it. It should implement Deref
|
||||
/// trait to access state fields and have persist method for updating that state.
|
||||
#[async_trait::async_trait]
|
||||
pub trait Storage: Deref<Target = SafeKeeperState> {
|
||||
/// Persist safekeeper state on disk and update internal state.
|
||||
fn persist(&mut self, s: &SafeKeeperState) -> Result<()>;
|
||||
async fn persist(&mut self, s: &SafeKeeperState) -> Result<()>;
|
||||
|
||||
/// Timestamp of last persist.
|
||||
fn last_persist_at(&self) -> Instant;
|
||||
@@ -82,7 +84,7 @@ impl FileStorage {
|
||||
/// Check the magic/version in the on-disk data and deserialize it, if possible.
|
||||
fn deser_sk_state(buf: &mut &[u8]) -> Result<SafeKeeperState> {
|
||||
// Read the version independent part
|
||||
let magic = buf.read_u32::<LittleEndian>()?;
|
||||
let magic = ReadBytesExt::read_u32::<LittleEndian>(buf)?;
|
||||
if magic != SK_MAGIC {
|
||||
bail!(
|
||||
"bad control file magic: {:X}, expected {:X}",
|
||||
@@ -90,7 +92,7 @@ impl FileStorage {
|
||||
SK_MAGIC
|
||||
);
|
||||
}
|
||||
let version = buf.read_u32::<LittleEndian>()?;
|
||||
let version = ReadBytesExt::read_u32::<LittleEndian>(buf)?;
|
||||
if version == SK_FORMAT_VERSION {
|
||||
let res = SafeKeeperState::des(buf)?;
|
||||
return Ok(res);
|
||||
@@ -110,7 +112,7 @@ impl FileStorage {
|
||||
|
||||
/// Read in the control file.
|
||||
pub fn load_control_file<P: AsRef<Path>>(control_file_path: P) -> Result<SafeKeeperState> {
|
||||
let mut control_file = OpenOptions::new()
|
||||
let mut control_file = std::fs::OpenOptions::new()
|
||||
.read(true)
|
||||
.write(true)
|
||||
.open(&control_file_path)
|
||||
@@ -159,30 +161,31 @@ impl Deref for FileStorage {
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl Storage for FileStorage {
|
||||
/// persists state durably to underlying storage
|
||||
/// for description see https://lwn.net/Articles/457667/
|
||||
fn persist(&mut self, s: &SafeKeeperState) -> Result<()> {
|
||||
async fn persist(&mut self, s: &SafeKeeperState) -> Result<()> {
|
||||
let _timer = PERSIST_CONTROL_FILE_SECONDS.start_timer();
|
||||
|
||||
// write data to safekeeper.control.partial
|
||||
let control_partial_path = self.timeline_dir.join(CONTROL_FILE_NAME_PARTIAL);
|
||||
let mut control_partial = File::create(&control_partial_path).with_context(|| {
|
||||
let mut control_partial = File::create(&control_partial_path).await.with_context(|| {
|
||||
format!(
|
||||
"failed to create partial control file at: {}",
|
||||
&control_partial_path.display()
|
||||
)
|
||||
})?;
|
||||
let mut buf: Vec<u8> = Vec::new();
|
||||
buf.write_u32::<LittleEndian>(SK_MAGIC)?;
|
||||
buf.write_u32::<LittleEndian>(SK_FORMAT_VERSION)?;
|
||||
WriteBytesExt::write_u32::<LittleEndian>(&mut buf, SK_MAGIC)?;
|
||||
WriteBytesExt::write_u32::<LittleEndian>(&mut buf, SK_FORMAT_VERSION)?;
|
||||
s.ser_into(&mut buf)?;
|
||||
|
||||
// calculate checksum before resize
|
||||
let checksum = crc32c::crc32c(&buf);
|
||||
buf.extend_from_slice(&checksum.to_le_bytes());
|
||||
|
||||
control_partial.write_all(&buf).with_context(|| {
|
||||
control_partial.write_all(&buf).await.with_context(|| {
|
||||
format!(
|
||||
"failed to write safekeeper state into control file at: {}",
|
||||
control_partial_path.display()
|
||||
@@ -191,7 +194,7 @@ impl Storage for FileStorage {
|
||||
|
||||
// fsync the file
|
||||
if !self.conf.no_sync {
|
||||
control_partial.sync_all().with_context(|| {
|
||||
control_partial.sync_all().await.with_context(|| {
|
||||
format!(
|
||||
"failed to sync partial control file at {}",
|
||||
control_partial_path.display()
|
||||
@@ -202,21 +205,22 @@ impl Storage for FileStorage {
|
||||
let control_path = self.timeline_dir.join(CONTROL_FILE_NAME);
|
||||
|
||||
// rename should be atomic
|
||||
fs::rename(&control_partial_path, &control_path)?;
|
||||
fs::rename(&control_partial_path, &control_path).await?;
|
||||
// this sync is not required by any standard but postgres does this (see durable_rename)
|
||||
if !self.conf.no_sync {
|
||||
File::open(&control_path)
|
||||
.and_then(|f| f.sync_all())
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"failed to sync control file at: {}",
|
||||
&control_path.display()
|
||||
)
|
||||
})?;
|
||||
let new_f = File::open(&control_path).await?;
|
||||
new_f.sync_all().await.with_context(|| {
|
||||
format!(
|
||||
"failed to sync control file at: {}",
|
||||
&control_path.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
// fsync the directory (linux specific)
|
||||
File::open(&self.timeline_dir)
|
||||
.and_then(|f| f.sync_all())
|
||||
let tli_dir = File::open(&self.timeline_dir).await?;
|
||||
tli_dir
|
||||
.sync_all()
|
||||
.await
|
||||
.context("failed to sync control file directory")?;
|
||||
}
|
||||
|
||||
@@ -236,7 +240,6 @@ mod test {
|
||||
use super::*;
|
||||
use crate::{safekeeper::SafeKeeperState, SafeKeeperConf};
|
||||
use anyhow::Result;
|
||||
use std::fs;
|
||||
use utils::{id::TenantTimelineId, lsn::Lsn};
|
||||
|
||||
fn stub_conf() -> SafeKeeperConf {
|
||||
@@ -247,59 +250,75 @@ mod test {
|
||||
}
|
||||
}
|
||||
|
||||
fn load_from_control_file(
|
||||
async fn load_from_control_file(
|
||||
conf: &SafeKeeperConf,
|
||||
ttid: &TenantTimelineId,
|
||||
) -> Result<(FileStorage, SafeKeeperState)> {
|
||||
fs::create_dir_all(conf.timeline_dir(ttid)).expect("failed to create timeline dir");
|
||||
fs::create_dir_all(conf.timeline_dir(ttid))
|
||||
.await
|
||||
.expect("failed to create timeline dir");
|
||||
Ok((
|
||||
FileStorage::restore_new(ttid, conf)?,
|
||||
FileStorage::load_control_file_conf(conf, ttid)?,
|
||||
))
|
||||
}
|
||||
|
||||
fn create(
|
||||
async fn create(
|
||||
conf: &SafeKeeperConf,
|
||||
ttid: &TenantTimelineId,
|
||||
) -> Result<(FileStorage, SafeKeeperState)> {
|
||||
fs::create_dir_all(conf.timeline_dir(ttid)).expect("failed to create timeline dir");
|
||||
fs::create_dir_all(conf.timeline_dir(ttid))
|
||||
.await
|
||||
.expect("failed to create timeline dir");
|
||||
let state = SafeKeeperState::empty();
|
||||
let storage = FileStorage::create_new(ttid, conf, state.clone())?;
|
||||
Ok((storage, state))
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_read_write_safekeeper_state() {
|
||||
#[tokio::test]
|
||||
async fn test_read_write_safekeeper_state() {
|
||||
let conf = stub_conf();
|
||||
let ttid = TenantTimelineId::generate();
|
||||
{
|
||||
let (mut storage, mut state) = create(&conf, &ttid).expect("failed to create state");
|
||||
let (mut storage, mut state) =
|
||||
create(&conf, &ttid).await.expect("failed to create state");
|
||||
// change something
|
||||
state.commit_lsn = Lsn(42);
|
||||
storage.persist(&state).expect("failed to persist state");
|
||||
storage
|
||||
.persist(&state)
|
||||
.await
|
||||
.expect("failed to persist state");
|
||||
}
|
||||
|
||||
let (_, state) = load_from_control_file(&conf, &ttid).expect("failed to read state");
|
||||
let (_, state) = load_from_control_file(&conf, &ttid)
|
||||
.await
|
||||
.expect("failed to read state");
|
||||
assert_eq!(state.commit_lsn, Lsn(42));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_safekeeper_state_checksum_mismatch() {
|
||||
#[tokio::test]
|
||||
async fn test_safekeeper_state_checksum_mismatch() {
|
||||
let conf = stub_conf();
|
||||
let ttid = TenantTimelineId::generate();
|
||||
{
|
||||
let (mut storage, mut state) = create(&conf, &ttid).expect("failed to read state");
|
||||
let (mut storage, mut state) =
|
||||
create(&conf, &ttid).await.expect("failed to read state");
|
||||
|
||||
// change something
|
||||
state.commit_lsn = Lsn(42);
|
||||
storage.persist(&state).expect("failed to persist state");
|
||||
storage
|
||||
.persist(&state)
|
||||
.await
|
||||
.expect("failed to persist state");
|
||||
}
|
||||
let control_path = conf.timeline_dir(&ttid).join(CONTROL_FILE_NAME);
|
||||
let mut data = fs::read(&control_path).unwrap();
|
||||
let mut data = fs::read(&control_path).await.unwrap();
|
||||
data[0] += 1; // change the first byte of the file to fail checksum validation
|
||||
fs::write(&control_path, &data).expect("failed to write control file");
|
||||
fs::write(&control_path, &data)
|
||||
.await
|
||||
.expect("failed to write control file");
|
||||
|
||||
match load_from_control_file(&conf, &ttid) {
|
||||
match load_from_control_file(&conf, &ttid).await {
|
||||
Err(err) => assert!(err
|
||||
.to_string()
|
||||
.contains("safekeeper control file checksum mismatch")),
|
||||
|
||||
@@ -121,7 +121,7 @@ pub struct FileInfo {
|
||||
}
|
||||
|
||||
/// Build debug dump response, using the provided [`Args`] filters.
|
||||
pub fn build(args: Args) -> Result<Response> {
|
||||
pub async fn build(args: Args) -> Result<Response> {
|
||||
let start_time = Utc::now();
|
||||
let timelines_count = GlobalTimelines::timelines_count();
|
||||
|
||||
@@ -155,7 +155,7 @@ pub fn build(args: Args) -> Result<Response> {
|
||||
}
|
||||
|
||||
let control_file = if args.dump_control_file {
|
||||
let mut state = tli.get_state().1;
|
||||
let mut state = tli.get_state().await.1;
|
||||
if !args.dump_term_history {
|
||||
state.acceptor_state.term_history = TermHistory(vec![]);
|
||||
}
|
||||
@@ -165,7 +165,7 @@ pub fn build(args: Args) -> Result<Response> {
|
||||
};
|
||||
|
||||
let memory = if args.dump_memory {
|
||||
Some(tli.memory_dump())
|
||||
Some(tli.memory_dump().await)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
@@ -256,14 +256,14 @@ impl SafekeeperPostgresHandler {
|
||||
|
||||
let lsn = if self.is_walproposer_recovery() {
|
||||
// walproposer should get all local WAL until flush_lsn
|
||||
tli.get_flush_lsn()
|
||||
tli.get_flush_lsn().await
|
||||
} else {
|
||||
// other clients shouldn't get any uncommitted WAL
|
||||
tli.get_state().0.commit_lsn
|
||||
tli.get_state().await.0.commit_lsn
|
||||
}
|
||||
.to_string();
|
||||
|
||||
let sysid = tli.get_state().1.server.system_id.to_string();
|
||||
let sysid = tli.get_state().await.1.server.system_id.to_string();
|
||||
let lsn_bytes = lsn.as_bytes();
|
||||
let tli = PG_TLI.to_string();
|
||||
let tli_bytes = tli.as_bytes();
|
||||
|
||||
@@ -2,3 +2,18 @@ pub mod routes;
|
||||
pub use routes::make_router;
|
||||
|
||||
pub use safekeeper_api::models;
|
||||
|
||||
use crate::SafeKeeperConf;
|
||||
|
||||
pub async fn task_main(
|
||||
conf: SafeKeeperConf,
|
||||
http_listener: std::net::TcpListener,
|
||||
) -> anyhow::Result<()> {
|
||||
let router = make_router(conf)
|
||||
.build()
|
||||
.map_err(|err| anyhow::anyhow!(err))?;
|
||||
let service = utils::http::RouterService::new(router).unwrap();
|
||||
let server = hyper::Server::from_tcp(http_listener)?;
|
||||
server.serve(service).await?;
|
||||
Ok(()) // unreachable
|
||||
}
|
||||
|
||||
@@ -13,7 +13,7 @@ use storage_broker::proto::SafekeeperTimelineInfo;
|
||||
use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId;
|
||||
use tokio::fs::File;
|
||||
use tokio::io::AsyncReadExt;
|
||||
use tokio::task::JoinError;
|
||||
use utils::http::endpoint::request_span;
|
||||
|
||||
use crate::safekeeper::ServerInfo;
|
||||
use crate::safekeeper::Term;
|
||||
@@ -116,8 +116,8 @@ async fn timeline_status_handler(request: Request<Body>) -> Result<Response<Body
|
||||
check_permission(&request, Some(ttid.tenant_id))?;
|
||||
|
||||
let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?;
|
||||
let (inmem, state) = tli.get_state();
|
||||
let flush_lsn = tli.get_flush_lsn();
|
||||
let (inmem, state) = tli.get_state().await;
|
||||
let flush_lsn = tli.get_flush_lsn().await;
|
||||
|
||||
let epoch = state.acceptor_state.get_epoch(flush_lsn);
|
||||
let term_history = state
|
||||
@@ -232,13 +232,11 @@ async fn timeline_delete_force_handler(
|
||||
);
|
||||
check_permission(&request, Some(ttid.tenant_id))?;
|
||||
ensure_no_body(&mut request).await?;
|
||||
let resp = tokio::task::spawn_blocking(move || {
|
||||
// FIXME: `delete_force` can fail from both internal errors and bad requests. Add better
|
||||
// error handling here when we're able to.
|
||||
GlobalTimelines::delete_force(&ttid).map_err(ApiError::InternalServerError)
|
||||
})
|
||||
.await
|
||||
.map_err(|e: JoinError| ApiError::InternalServerError(e.into()))??;
|
||||
// FIXME: `delete_force` can fail from both internal errors and bad requests. Add better
|
||||
// error handling here when we're able to.
|
||||
let resp = GlobalTimelines::delete_force(&ttid)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
json_response(StatusCode::OK, resp)
|
||||
}
|
||||
|
||||
@@ -250,14 +248,11 @@ async fn tenant_delete_force_handler(
|
||||
let tenant_id = parse_request_param(&request, "tenant_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
ensure_no_body(&mut request).await?;
|
||||
let delete_info = tokio::task::spawn_blocking(move || {
|
||||
// FIXME: `delete_force_all_for_tenant` can return an error for multiple different reasons;
|
||||
// Using an `InternalServerError` should be fixed when the types support it
|
||||
GlobalTimelines::delete_force_all_for_tenant(&tenant_id)
|
||||
.map_err(ApiError::InternalServerError)
|
||||
})
|
||||
.await
|
||||
.map_err(|e: JoinError| ApiError::InternalServerError(e.into()))??;
|
||||
// FIXME: `delete_force_all_for_tenant` can return an error for multiple different reasons;
|
||||
// Using an `InternalServerError` should be fixed when the types support it
|
||||
let delete_info = GlobalTimelines::delete_force_all_for_tenant(&tenant_id)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
json_response(
|
||||
StatusCode::OK,
|
||||
delete_info
|
||||
@@ -353,11 +348,9 @@ async fn dump_debug_handler(mut request: Request<Body>) -> Result<Response<Body>
|
||||
timeline_id,
|
||||
};
|
||||
|
||||
let resp = tokio::task::spawn_blocking(move || {
|
||||
debug_dump::build(args).map_err(ApiError::InternalServerError)
|
||||
})
|
||||
.await
|
||||
.map_err(|e: JoinError| ApiError::InternalServerError(e.into()))??;
|
||||
let resp = debug_dump::build(args)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
// TODO: use streaming response
|
||||
json_response(StatusCode::OK, resp)
|
||||
@@ -386,29 +379,32 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError>
|
||||
router
|
||||
.data(Arc::new(conf))
|
||||
.data(auth)
|
||||
.get("/v1/status", status_handler)
|
||||
.get("/v1/status", |r| request_span(r, status_handler))
|
||||
// Will be used in the future instead of implicit timeline creation
|
||||
.post("/v1/tenant/timeline", timeline_create_handler)
|
||||
.get(
|
||||
"/v1/tenant/:tenant_id/timeline/:timeline_id",
|
||||
timeline_status_handler,
|
||||
)
|
||||
.delete(
|
||||
"/v1/tenant/:tenant_id/timeline/:timeline_id",
|
||||
timeline_delete_force_handler,
|
||||
)
|
||||
.delete("/v1/tenant/:tenant_id", tenant_delete_force_handler)
|
||||
.post("/v1/pull_timeline", timeline_pull_handler)
|
||||
.post("/v1/tenant/timeline", |r| {
|
||||
request_span(r, timeline_create_handler)
|
||||
})
|
||||
.get("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
|
||||
request_span(r, timeline_status_handler)
|
||||
})
|
||||
.delete("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
|
||||
request_span(r, timeline_delete_force_handler)
|
||||
})
|
||||
.delete("/v1/tenant/:tenant_id", |r| {
|
||||
request_span(r, tenant_delete_force_handler)
|
||||
})
|
||||
.post("/v1/pull_timeline", |r| {
|
||||
request_span(r, timeline_pull_handler)
|
||||
})
|
||||
.get(
|
||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/file/:filename",
|
||||
timeline_files_handler,
|
||||
|r| request_span(r, timeline_files_handler),
|
||||
)
|
||||
// for tests
|
||||
.post(
|
||||
"/v1/record_safekeeper_info/:tenant_id/:timeline_id",
|
||||
record_safekeeper_info,
|
||||
)
|
||||
.get("/v1/debug_dump", dump_debug_handler)
|
||||
.post("/v1/record_safekeeper_info/:tenant_id/:timeline_id", |r| {
|
||||
request_span(r, record_safekeeper_info)
|
||||
})
|
||||
.get("/v1/debug_dump", |r| request_span(r, dump_debug_handler))
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
||||
@@ -73,12 +73,12 @@ pub async fn handle_json_ctrl<IO: AsyncRead + AsyncWrite + Unpin>(
|
||||
|
||||
// if send_proposer_elected is true, we need to update local history
|
||||
if append_request.send_proposer_elected {
|
||||
send_proposer_elected(&tli, append_request.term, append_request.epoch_start_lsn)?;
|
||||
send_proposer_elected(&tli, append_request.term, append_request.epoch_start_lsn).await?;
|
||||
}
|
||||
|
||||
let inserted_wal = append_logical_message(&tli, append_request)?;
|
||||
let inserted_wal = append_logical_message(&tli, append_request).await?;
|
||||
let response = AppendResult {
|
||||
state: tli.get_state().1,
|
||||
state: tli.get_state().await.1,
|
||||
inserted_wal,
|
||||
};
|
||||
let response_data = serde_json::to_vec(&response)
|
||||
@@ -114,9 +114,9 @@ async fn prepare_safekeeper(
|
||||
.await
|
||||
}
|
||||
|
||||
fn send_proposer_elected(tli: &Arc<Timeline>, term: Term, lsn: Lsn) -> anyhow::Result<()> {
|
||||
async fn send_proposer_elected(tli: &Arc<Timeline>, term: Term, lsn: Lsn) -> anyhow::Result<()> {
|
||||
// add new term to existing history
|
||||
let history = tli.get_state().1.acceptor_state.term_history;
|
||||
let history = tli.get_state().await.1.acceptor_state.term_history;
|
||||
let history = history.up_to(lsn.checked_sub(1u64).unwrap());
|
||||
let mut history_entries = history.0;
|
||||
history_entries.push(TermSwitchEntry { term, lsn });
|
||||
@@ -129,7 +129,7 @@ fn send_proposer_elected(tli: &Arc<Timeline>, term: Term, lsn: Lsn) -> anyhow::R
|
||||
timeline_start_lsn: lsn,
|
||||
});
|
||||
|
||||
tli.process_msg(&proposer_elected_request)?;
|
||||
tli.process_msg(&proposer_elected_request).await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -142,12 +142,12 @@ pub struct InsertedWAL {
|
||||
|
||||
/// Extend local WAL with new LogicalMessage record. To do that,
|
||||
/// create AppendRequest with new WAL and pass it to safekeeper.
|
||||
pub fn append_logical_message(
|
||||
pub async fn append_logical_message(
|
||||
tli: &Arc<Timeline>,
|
||||
msg: &AppendLogicalMessage,
|
||||
) -> anyhow::Result<InsertedWAL> {
|
||||
let wal_data = encode_logical_message(&msg.lm_prefix, &msg.lm_message);
|
||||
let sk_state = tli.get_state().1;
|
||||
let sk_state = tli.get_state().await.1;
|
||||
|
||||
let begin_lsn = msg.begin_lsn;
|
||||
let end_lsn = begin_lsn + wal_data.len() as u64;
|
||||
@@ -171,7 +171,7 @@ pub fn append_logical_message(
|
||||
wal_data: Bytes::from(wal_data),
|
||||
});
|
||||
|
||||
let response = tli.process_msg(&append_request)?;
|
||||
let response = tli.process_msg(&append_request).await?;
|
||||
|
||||
let append_response = match response {
|
||||
Some(AcceptorProposerMessage::AppendResponse(resp)) => resp,
|
||||
|
||||
@@ -1,4 +1,6 @@
|
||||
use once_cell::sync::Lazy;
|
||||
use remote_storage::RemoteStorageConfig;
|
||||
use tokio::runtime::Runtime;
|
||||
|
||||
use std::path::PathBuf;
|
||||
use std::time::Duration;
|
||||
@@ -36,7 +38,6 @@ pub mod defaults {
|
||||
DEFAULT_PG_LISTEN_PORT,
|
||||
};
|
||||
|
||||
pub const DEFAULT_WAL_BACKUP_RUNTIME_THREADS: usize = 8;
|
||||
pub const DEFAULT_HEARTBEAT_TIMEOUT: &str = "5000ms";
|
||||
pub const DEFAULT_MAX_OFFLOADER_LAG_BYTES: u64 = 128 * (1 << 20);
|
||||
}
|
||||
@@ -60,10 +61,10 @@ pub struct SafeKeeperConf {
|
||||
pub heartbeat_timeout: Duration,
|
||||
pub remote_storage: Option<RemoteStorageConfig>,
|
||||
pub max_offloader_lag_bytes: u64,
|
||||
pub backup_runtime_threads: Option<usize>,
|
||||
pub backup_parallel_jobs: usize,
|
||||
pub wal_backup_enabled: bool,
|
||||
pub auth: Option<Arc<JwtAuth>>,
|
||||
pub current_thread_runtime: bool,
|
||||
}
|
||||
|
||||
impl SafeKeeperConf {
|
||||
@@ -92,12 +93,64 @@ impl SafeKeeperConf {
|
||||
.parse()
|
||||
.expect("failed to parse default broker endpoint"),
|
||||
broker_keepalive_interval: Duration::from_secs(5),
|
||||
backup_runtime_threads: None,
|
||||
wal_backup_enabled: true,
|
||||
backup_parallel_jobs: 1,
|
||||
auth: None,
|
||||
heartbeat_timeout: Duration::new(5, 0),
|
||||
max_offloader_lag_bytes: defaults::DEFAULT_MAX_OFFLOADER_LAG_BYTES,
|
||||
current_thread_runtime: false,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Tokio runtimes.
|
||||
pub static WAL_SERVICE_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
|
||||
tokio::runtime::Builder::new_multi_thread()
|
||||
.thread_name("WAL service worker")
|
||||
.enable_all()
|
||||
.build()
|
||||
.expect("Failed to create WAL service runtime")
|
||||
});
|
||||
|
||||
pub static HTTP_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
|
||||
tokio::runtime::Builder::new_multi_thread()
|
||||
.thread_name("HTTP worker")
|
||||
.enable_all()
|
||||
.build()
|
||||
.expect("Failed to create WAL service runtime")
|
||||
});
|
||||
|
||||
pub static BROKER_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
|
||||
tokio::runtime::Builder::new_multi_thread()
|
||||
.thread_name("broker worker")
|
||||
.worker_threads(2) // there are only 2 tasks, having more threads doesn't make sense
|
||||
.enable_all()
|
||||
.build()
|
||||
.expect("Failed to create broker runtime")
|
||||
});
|
||||
|
||||
pub static WAL_REMOVER_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
|
||||
tokio::runtime::Builder::new_multi_thread()
|
||||
.thread_name("WAL remover")
|
||||
.worker_threads(1)
|
||||
.enable_all()
|
||||
.build()
|
||||
.expect("Failed to create broker runtime")
|
||||
});
|
||||
|
||||
pub static WAL_BACKUP_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
|
||||
tokio::runtime::Builder::new_multi_thread()
|
||||
.thread_name("WAL backup worker")
|
||||
.enable_all()
|
||||
.build()
|
||||
.expect("Failed to create WAL backup runtime")
|
||||
});
|
||||
|
||||
pub static METRICS_SHIFTER_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
|
||||
tokio::runtime::Builder::new_multi_thread()
|
||||
.thread_name("metric shifter")
|
||||
.worker_threads(1)
|
||||
.enable_all()
|
||||
.build()
|
||||
.expect("Failed to create broker runtime")
|
||||
});
|
||||
|
||||
@@ -7,6 +7,7 @@ use std::{
|
||||
|
||||
use ::metrics::{register_histogram, GaugeVec, Histogram, IntGauge, DISK_WRITE_SECONDS_BUCKETS};
|
||||
use anyhow::Result;
|
||||
use futures::Future;
|
||||
use metrics::{
|
||||
core::{AtomicU64, Collector, Desc, GenericCounter, GenericGaugeVec, Opts},
|
||||
proto::MetricFamily,
|
||||
@@ -292,14 +293,17 @@ impl WalStorageMetrics {
|
||||
}
|
||||
}
|
||||
|
||||
/// Accepts a closure that returns a result, and returns the duration of the closure.
|
||||
pub fn time_io_closure(closure: impl FnOnce() -> Result<()>) -> Result<f64> {
|
||||
/// Accepts async function that returns empty anyhow result, and returns the duration of its execution.
|
||||
pub async fn time_io_closure<E: Into<anyhow::Error>>(
|
||||
closure: impl Future<Output = Result<(), E>>,
|
||||
) -> Result<f64> {
|
||||
let start = std::time::Instant::now();
|
||||
closure()?;
|
||||
closure.await.map_err(|e| e.into())?;
|
||||
Ok(start.elapsed().as_secs_f64())
|
||||
}
|
||||
|
||||
/// Metrics for a single timeline.
|
||||
#[derive(Clone)]
|
||||
pub struct FullTimelineInfo {
|
||||
pub ttid: TenantTimelineId,
|
||||
pub ps_feedback: PageserverFeedback,
|
||||
@@ -575,13 +579,19 @@ impl Collector for TimelineCollector {
|
||||
let timelines = GlobalTimelines::get_all();
|
||||
let timelines_count = timelines.len();
|
||||
|
||||
for arc_tli in timelines {
|
||||
let tli = arc_tli.info_for_metrics();
|
||||
if tli.is_none() {
|
||||
continue;
|
||||
}
|
||||
let tli = tli.unwrap();
|
||||
// Prometheus Collector is sync, and data is stored under async lock. To
|
||||
// bridge the gap with a crutch, collect data in spawned thread with
|
||||
// local tokio runtime.
|
||||
let infos = std::thread::spawn(|| {
|
||||
let rt = tokio::runtime::Builder::new_current_thread()
|
||||
.build()
|
||||
.expect("failed to create rt");
|
||||
rt.block_on(collect_timeline_metrics())
|
||||
})
|
||||
.join()
|
||||
.expect("collect_timeline_metrics thread panicked");
|
||||
|
||||
for tli in &infos {
|
||||
let tenant_id = tli.ttid.tenant_id.to_string();
|
||||
let timeline_id = tli.ttid.timeline_id.to_string();
|
||||
let labels = &[tenant_id.as_str(), timeline_id.as_str()];
|
||||
@@ -682,3 +692,15 @@ impl Collector for TimelineCollector {
|
||||
mfs
|
||||
}
|
||||
}
|
||||
|
||||
async fn collect_timeline_metrics() -> Vec<FullTimelineInfo> {
|
||||
let mut res = vec![];
|
||||
let timelines = GlobalTimelines::get_all();
|
||||
|
||||
for tli in timelines {
|
||||
if let Some(info) = tli.info_for_metrics().await {
|
||||
res.push(info);
|
||||
}
|
||||
}
|
||||
res
|
||||
}
|
||||
|
||||
@@ -231,7 +231,7 @@ async fn pull_timeline(status: TimelineStatus, host: String) -> Result<Response>
|
||||
info!(
|
||||
"Loaded timeline {}, flush_lsn={}",
|
||||
ttid,
|
||||
tli.get_flush_lsn()
|
||||
tli.get_flush_lsn().await
|
||||
);
|
||||
|
||||
Ok(Response {
|
||||
|
||||
@@ -18,15 +18,14 @@ use postgres_backend::QueryError;
|
||||
use pq_proto::BeMessage;
|
||||
use std::net::SocketAddr;
|
||||
use std::sync::Arc;
|
||||
use std::thread;
|
||||
use std::thread::JoinHandle;
|
||||
use tokio::io::AsyncRead;
|
||||
use tokio::io::AsyncWrite;
|
||||
use tokio::sync::mpsc::channel;
|
||||
use tokio::sync::mpsc::error::TryRecvError;
|
||||
use tokio::sync::mpsc::Receiver;
|
||||
use tokio::sync::mpsc::Sender;
|
||||
use tokio::task::spawn_blocking;
|
||||
use tokio::task;
|
||||
use tokio::task::JoinHandle;
|
||||
use tokio::time::Duration;
|
||||
use tokio::time::Instant;
|
||||
use tracing::*;
|
||||
@@ -97,7 +96,7 @@ impl SafekeeperPostgresHandler {
|
||||
Err(res.expect_err("no error with WalAcceptor not spawn"))
|
||||
}
|
||||
Some(handle) => {
|
||||
let wal_acceptor_res = handle.join();
|
||||
let wal_acceptor_res = handle.await;
|
||||
|
||||
// If there was any network error, return it.
|
||||
res?;
|
||||
@@ -107,7 +106,7 @@ impl SafekeeperPostgresHandler {
|
||||
Ok(Ok(_)) => Ok(()), // can't happen currently; would be if we add graceful termination
|
||||
Ok(Err(e)) => Err(CopyStreamHandlerEnd::Other(e.context("WAL acceptor"))),
|
||||
Err(_) => Err(CopyStreamHandlerEnd::Other(anyhow!(
|
||||
"WalAcceptor thread panicked",
|
||||
"WalAcceptor task panicked",
|
||||
))),
|
||||
}
|
||||
}
|
||||
@@ -154,10 +153,12 @@ impl<'a, IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'a, IO> {
|
||||
}
|
||||
};
|
||||
|
||||
*self.acceptor_handle = Some(
|
||||
WalAcceptor::spawn(tli.clone(), msg_rx, reply_tx, self.conn_id)
|
||||
.context("spawn WalAcceptor thread")?,
|
||||
);
|
||||
*self.acceptor_handle = Some(WalAcceptor::spawn(
|
||||
tli.clone(),
|
||||
msg_rx,
|
||||
reply_tx,
|
||||
self.conn_id,
|
||||
));
|
||||
|
||||
// Forward all messages to WalAcceptor
|
||||
read_network_loop(self.pgb_reader, msg_tx, next_msg).await
|
||||
@@ -226,28 +227,19 @@ impl WalAcceptor {
|
||||
msg_rx: Receiver<ProposerAcceptorMessage>,
|
||||
reply_tx: Sender<AcceptorProposerMessage>,
|
||||
conn_id: ConnectionId,
|
||||
) -> anyhow::Result<JoinHandle<anyhow::Result<()>>> {
|
||||
let thread_name = format!("WAL acceptor {}", tli.ttid);
|
||||
thread::Builder::new()
|
||||
.name(thread_name)
|
||||
.spawn(move || -> anyhow::Result<()> {
|
||||
let mut wa = WalAcceptor {
|
||||
tli,
|
||||
msg_rx,
|
||||
reply_tx,
|
||||
};
|
||||
) -> JoinHandle<anyhow::Result<()>> {
|
||||
task::spawn(async move {
|
||||
let mut wa = WalAcceptor {
|
||||
tli,
|
||||
msg_rx,
|
||||
reply_tx,
|
||||
};
|
||||
|
||||
let runtime = tokio::runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
.build()?;
|
||||
|
||||
let span_ttid = wa.tli.ttid; // satisfy borrow checker
|
||||
runtime.block_on(
|
||||
wa.run()
|
||||
.instrument(info_span!("WAL acceptor", cid = %conn_id, ttid = %span_ttid)),
|
||||
)
|
||||
})
|
||||
.map_err(anyhow::Error::from)
|
||||
let span_ttid = wa.tli.ttid; // satisfy borrow checker
|
||||
wa.run()
|
||||
.instrument(info_span!("WAL acceptor", cid = %conn_id, ttid = %span_ttid))
|
||||
.await
|
||||
})
|
||||
}
|
||||
|
||||
/// The main loop. Returns Ok(()) if either msg_rx or reply_tx got closed;
|
||||
@@ -281,7 +273,7 @@ impl WalAcceptor {
|
||||
while let ProposerAcceptorMessage::AppendRequest(append_request) = next_msg {
|
||||
let noflush_msg = ProposerAcceptorMessage::NoFlushAppendRequest(append_request);
|
||||
|
||||
if let Some(reply) = self.tli.process_msg(&noflush_msg)? {
|
||||
if let Some(reply) = self.tli.process_msg(&noflush_msg).await? {
|
||||
if self.reply_tx.send(reply).await.is_err() {
|
||||
return Ok(()); // chan closed, streaming terminated
|
||||
}
|
||||
@@ -300,10 +292,12 @@ impl WalAcceptor {
|
||||
}
|
||||
|
||||
// flush all written WAL to the disk
|
||||
self.tli.process_msg(&ProposerAcceptorMessage::FlushWAL)?
|
||||
self.tli
|
||||
.process_msg(&ProposerAcceptorMessage::FlushWAL)
|
||||
.await?
|
||||
} else {
|
||||
// process message other than AppendRequest
|
||||
self.tli.process_msg(&next_msg)?
|
||||
self.tli.process_msg(&next_msg).await?
|
||||
};
|
||||
|
||||
if let Some(reply) = reply_msg {
|
||||
@@ -326,8 +320,8 @@ impl Drop for ComputeConnectionGuard {
|
||||
let tli = self.timeline.clone();
|
||||
// tokio forbids to call blocking_send inside the runtime, and see
|
||||
// comments in on_compute_disconnect why we call blocking_send.
|
||||
spawn_blocking(move || {
|
||||
if let Err(e) = tli.on_compute_disconnect() {
|
||||
tokio::spawn(async move {
|
||||
if let Err(e) = tli.on_compute_disconnect().await {
|
||||
error!("failed to unregister compute connection: {}", e);
|
||||
}
|
||||
});
|
||||
|
||||
@@ -1,29 +1,36 @@
|
||||
//! Thread removing old WAL.
|
||||
|
||||
use std::{thread, time::Duration};
|
||||
use std::time::Duration;
|
||||
|
||||
use tokio::time::sleep;
|
||||
use tracing::*;
|
||||
|
||||
use crate::{GlobalTimelines, SafeKeeperConf};
|
||||
|
||||
pub fn thread_main(conf: SafeKeeperConf) {
|
||||
pub async fn task_main(conf: SafeKeeperConf) -> anyhow::Result<()> {
|
||||
let wal_removal_interval = Duration::from_millis(5000);
|
||||
loop {
|
||||
let tlis = GlobalTimelines::get_all();
|
||||
for tli in &tlis {
|
||||
if !tli.is_active() {
|
||||
if !tli.is_active().await {
|
||||
continue;
|
||||
}
|
||||
let ttid = tli.ttid;
|
||||
let _enter =
|
||||
info_span!("", tenant = %ttid.tenant_id, timeline = %ttid.timeline_id).entered();
|
||||
if let Err(e) = tli.maybe_pesist_control_file() {
|
||||
if let Err(e) = tli
|
||||
.maybe_persist_control_file()
|
||||
.instrument(info_span!("", tenant = %ttid.tenant_id, timeline = %ttid.timeline_id))
|
||||
.await
|
||||
{
|
||||
warn!("failed to persist control file: {e}");
|
||||
}
|
||||
if let Err(e) = tli.remove_old_wal(conf.wal_backup_enabled) {
|
||||
warn!("failed to remove WAL: {}", e);
|
||||
if let Err(e) = tli
|
||||
.remove_old_wal(conf.wal_backup_enabled)
|
||||
.instrument(info_span!("", tenant = %ttid.tenant_id, timeline = %ttid.timeline_id))
|
||||
.await
|
||||
{
|
||||
error!("failed to remove WAL: {}", e);
|
||||
}
|
||||
}
|
||||
thread::sleep(wal_removal_interval)
|
||||
sleep(wal_removal_interval).await;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -568,25 +568,27 @@ where
|
||||
|
||||
/// Process message from proposer and possibly form reply. Concurrent
|
||||
/// callers must exclude each other.
|
||||
pub fn process_msg(
|
||||
pub async fn process_msg(
|
||||
&mut self,
|
||||
msg: &ProposerAcceptorMessage,
|
||||
) -> Result<Option<AcceptorProposerMessage>> {
|
||||
match msg {
|
||||
ProposerAcceptorMessage::Greeting(msg) => self.handle_greeting(msg),
|
||||
ProposerAcceptorMessage::VoteRequest(msg) => self.handle_vote_request(msg),
|
||||
ProposerAcceptorMessage::Elected(msg) => self.handle_elected(msg),
|
||||
ProposerAcceptorMessage::AppendRequest(msg) => self.handle_append_request(msg, true),
|
||||
ProposerAcceptorMessage::NoFlushAppendRequest(msg) => {
|
||||
self.handle_append_request(msg, false)
|
||||
ProposerAcceptorMessage::Greeting(msg) => self.handle_greeting(msg).await,
|
||||
ProposerAcceptorMessage::VoteRequest(msg) => self.handle_vote_request(msg).await,
|
||||
ProposerAcceptorMessage::Elected(msg) => self.handle_elected(msg).await,
|
||||
ProposerAcceptorMessage::AppendRequest(msg) => {
|
||||
self.handle_append_request(msg, true).await
|
||||
}
|
||||
ProposerAcceptorMessage::FlushWAL => self.handle_flush(),
|
||||
ProposerAcceptorMessage::NoFlushAppendRequest(msg) => {
|
||||
self.handle_append_request(msg, false).await
|
||||
}
|
||||
ProposerAcceptorMessage::FlushWAL => self.handle_flush().await,
|
||||
}
|
||||
}
|
||||
|
||||
/// Handle initial message from proposer: check its sanity and send my
|
||||
/// current term.
|
||||
fn handle_greeting(
|
||||
async fn handle_greeting(
|
||||
&mut self,
|
||||
msg: &ProposerGreeting,
|
||||
) -> Result<Option<AcceptorProposerMessage>> {
|
||||
@@ -649,7 +651,7 @@ where
|
||||
if msg.pg_version != UNKNOWN_SERVER_VERSION {
|
||||
state.server.pg_version = msg.pg_version;
|
||||
}
|
||||
self.state.persist(&state)?;
|
||||
self.state.persist(&state).await?;
|
||||
}
|
||||
|
||||
info!(
|
||||
@@ -664,7 +666,7 @@ where
|
||||
}
|
||||
|
||||
/// Give vote for the given term, if we haven't done that previously.
|
||||
fn handle_vote_request(
|
||||
async fn handle_vote_request(
|
||||
&mut self,
|
||||
msg: &VoteRequest,
|
||||
) -> Result<Option<AcceptorProposerMessage>> {
|
||||
@@ -678,7 +680,7 @@ where
|
||||
// handle_elected instead. Currently not a big deal, as proposer is the
|
||||
// only source of WAL; with peer2peer recovery it would be more
|
||||
// important.
|
||||
self.wal_store.flush_wal()?;
|
||||
self.wal_store.flush_wal().await?;
|
||||
// initialize with refusal
|
||||
let mut resp = VoteResponse {
|
||||
term: self.state.acceptor_state.term,
|
||||
@@ -692,7 +694,7 @@ where
|
||||
let mut state = self.state.clone();
|
||||
state.acceptor_state.term = msg.term;
|
||||
// persist vote before sending it out
|
||||
self.state.persist(&state)?;
|
||||
self.state.persist(&state).await?;
|
||||
|
||||
resp.term = self.state.acceptor_state.term;
|
||||
resp.vote_given = true as u64;
|
||||
@@ -715,12 +717,15 @@ where
|
||||
ar
|
||||
}
|
||||
|
||||
fn handle_elected(&mut self, msg: &ProposerElected) -> Result<Option<AcceptorProposerMessage>> {
|
||||
async fn handle_elected(
|
||||
&mut self,
|
||||
msg: &ProposerElected,
|
||||
) -> Result<Option<AcceptorProposerMessage>> {
|
||||
info!("received ProposerElected {:?}", msg);
|
||||
if self.state.acceptor_state.term < msg.term {
|
||||
let mut state = self.state.clone();
|
||||
state.acceptor_state.term = msg.term;
|
||||
self.state.persist(&state)?;
|
||||
self.state.persist(&state).await?;
|
||||
}
|
||||
|
||||
// If our term is higher, ignore the message (next feedback will inform the compute)
|
||||
@@ -750,7 +755,7 @@ where
|
||||
// intersection of our history and history from msg
|
||||
|
||||
// truncate wal, update the LSNs
|
||||
self.wal_store.truncate_wal(msg.start_streaming_at)?;
|
||||
self.wal_store.truncate_wal(msg.start_streaming_at).await?;
|
||||
|
||||
// and now adopt term history from proposer
|
||||
{
|
||||
@@ -784,7 +789,7 @@ where
|
||||
self.inmem.backup_lsn = max(self.inmem.backup_lsn, state.timeline_start_lsn);
|
||||
|
||||
state.acceptor_state.term_history = msg.term_history.clone();
|
||||
self.persist_control_file(state)?;
|
||||
self.persist_control_file(state).await?;
|
||||
}
|
||||
|
||||
info!("start receiving WAL since {:?}", msg.start_streaming_at);
|
||||
@@ -796,7 +801,7 @@ where
|
||||
///
|
||||
/// Note: it is assumed that 'WAL we have is from the right term' check has
|
||||
/// already been done outside.
|
||||
fn update_commit_lsn(&mut self, mut candidate: Lsn) -> Result<()> {
|
||||
async fn update_commit_lsn(&mut self, mut candidate: Lsn) -> Result<()> {
|
||||
// Both peers and walproposer communicate this value, we might already
|
||||
// have a fresher (higher) version.
|
||||
candidate = max(candidate, self.inmem.commit_lsn);
|
||||
@@ -818,29 +823,32 @@ where
|
||||
// that we receive new epoch_start_lsn, and we still need to sync
|
||||
// control file in this case.
|
||||
if commit_lsn == self.epoch_start_lsn && self.state.commit_lsn != commit_lsn {
|
||||
self.persist_control_file(self.state.clone())?;
|
||||
self.persist_control_file(self.state.clone()).await?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Persist control file to disk, called only after timeline creation (bootstrap).
|
||||
pub fn persist(&mut self) -> Result<()> {
|
||||
self.persist_control_file(self.state.clone())
|
||||
pub async fn persist(&mut self) -> Result<()> {
|
||||
self.persist_control_file(self.state.clone()).await
|
||||
}
|
||||
|
||||
/// Persist in-memory state to the disk, taking other data from state.
|
||||
fn persist_control_file(&mut self, mut state: SafeKeeperState) -> Result<()> {
|
||||
async fn persist_control_file(&mut self, mut state: SafeKeeperState) -> Result<()> {
|
||||
state.commit_lsn = self.inmem.commit_lsn;
|
||||
state.backup_lsn = self.inmem.backup_lsn;
|
||||
state.peer_horizon_lsn = self.inmem.peer_horizon_lsn;
|
||||
state.proposer_uuid = self.inmem.proposer_uuid;
|
||||
self.state.persist(&state)
|
||||
self.state.persist(&state).await
|
||||
}
|
||||
|
||||
/// Persist control file if there is something to save and enough time
|
||||
/// passed after the last save.
|
||||
pub fn maybe_persist_control_file(&mut self, inmem_remote_consistent_lsn: Lsn) -> Result<()> {
|
||||
pub async fn maybe_persist_control_file(
|
||||
&mut self,
|
||||
inmem_remote_consistent_lsn: Lsn,
|
||||
) -> Result<()> {
|
||||
const CF_SAVE_INTERVAL: Duration = Duration::from_secs(300);
|
||||
if self.state.last_persist_at().elapsed() < CF_SAVE_INTERVAL {
|
||||
return Ok(());
|
||||
@@ -852,7 +860,7 @@ where
|
||||
if need_persist {
|
||||
let mut state = self.state.clone();
|
||||
state.remote_consistent_lsn = inmem_remote_consistent_lsn;
|
||||
self.persist_control_file(state)?;
|
||||
self.persist_control_file(state).await?;
|
||||
trace!("saved control file: {CF_SAVE_INTERVAL:?} passed");
|
||||
}
|
||||
Ok(())
|
||||
@@ -860,7 +868,7 @@ where
|
||||
|
||||
/// Handle request to append WAL.
|
||||
#[allow(clippy::comparison_chain)]
|
||||
fn handle_append_request(
|
||||
async fn handle_append_request(
|
||||
&mut self,
|
||||
msg: &AppendRequest,
|
||||
require_flush: bool,
|
||||
@@ -883,17 +891,19 @@ where
|
||||
|
||||
// do the job
|
||||
if !msg.wal_data.is_empty() {
|
||||
self.wal_store.write_wal(msg.h.begin_lsn, &msg.wal_data)?;
|
||||
self.wal_store
|
||||
.write_wal(msg.h.begin_lsn, &msg.wal_data)
|
||||
.await?;
|
||||
}
|
||||
|
||||
// flush wal to the disk, if required
|
||||
if require_flush {
|
||||
self.wal_store.flush_wal()?;
|
||||
self.wal_store.flush_wal().await?;
|
||||
}
|
||||
|
||||
// Update commit_lsn.
|
||||
if msg.h.commit_lsn != Lsn(0) {
|
||||
self.update_commit_lsn(msg.h.commit_lsn)?;
|
||||
self.update_commit_lsn(msg.h.commit_lsn).await?;
|
||||
}
|
||||
// Value calculated by walproposer can always lag:
|
||||
// - safekeepers can forget inmem value and send to proposer lower
|
||||
@@ -909,7 +919,7 @@ where
|
||||
if self.state.peer_horizon_lsn + (self.state.server.wal_seg_size as u64)
|
||||
< self.inmem.peer_horizon_lsn
|
||||
{
|
||||
self.persist_control_file(self.state.clone())?;
|
||||
self.persist_control_file(self.state.clone()).await?;
|
||||
}
|
||||
|
||||
trace!(
|
||||
@@ -931,15 +941,15 @@ where
|
||||
}
|
||||
|
||||
/// Flush WAL to disk. Return AppendResponse with latest LSNs.
|
||||
fn handle_flush(&mut self) -> Result<Option<AcceptorProposerMessage>> {
|
||||
self.wal_store.flush_wal()?;
|
||||
async fn handle_flush(&mut self) -> Result<Option<AcceptorProposerMessage>> {
|
||||
self.wal_store.flush_wal().await?;
|
||||
Ok(Some(AcceptorProposerMessage::AppendResponse(
|
||||
self.append_response(),
|
||||
)))
|
||||
}
|
||||
|
||||
/// Update timeline state with peer safekeeper data.
|
||||
pub fn record_safekeeper_info(&mut self, sk_info: &SafekeeperTimelineInfo) -> Result<()> {
|
||||
pub async fn record_safekeeper_info(&mut self, sk_info: &SafekeeperTimelineInfo) -> Result<()> {
|
||||
let mut sync_control_file = false;
|
||||
|
||||
if (Lsn(sk_info.commit_lsn) != Lsn::INVALID) && (sk_info.last_log_term != INVALID_TERM) {
|
||||
@@ -947,7 +957,7 @@ where
|
||||
// commit_lsn if our history matches (is part of) history of advanced
|
||||
// commit_lsn provider.
|
||||
if sk_info.last_log_term == self.get_epoch() {
|
||||
self.update_commit_lsn(Lsn(sk_info.commit_lsn))?;
|
||||
self.update_commit_lsn(Lsn(sk_info.commit_lsn)).await?;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -973,7 +983,7 @@ where
|
||||
// Note: we could make remote_consistent_lsn update in cf common by
|
||||
// storing Arc to walsenders in Safekeeper.
|
||||
state.remote_consistent_lsn = new_remote_consistent_lsn;
|
||||
self.persist_control_file(state)?;
|
||||
self.persist_control_file(state).await?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
@@ -997,6 +1007,7 @@ where
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use futures::future::BoxFuture;
|
||||
use postgres_ffi::WAL_SEGMENT_SIZE;
|
||||
|
||||
use super::*;
|
||||
@@ -1008,8 +1019,9 @@ mod tests {
|
||||
persisted_state: SafeKeeperState,
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl control_file::Storage for InMemoryState {
|
||||
fn persist(&mut self, s: &SafeKeeperState) -> Result<()> {
|
||||
async fn persist(&mut self, s: &SafeKeeperState) -> Result<()> {
|
||||
self.persisted_state = s.clone();
|
||||
Ok(())
|
||||
}
|
||||
@@ -1039,27 +1051,28 @@ mod tests {
|
||||
lsn: Lsn,
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl wal_storage::Storage for DummyWalStore {
|
||||
fn flush_lsn(&self) -> Lsn {
|
||||
self.lsn
|
||||
}
|
||||
|
||||
fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()> {
|
||||
async fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()> {
|
||||
self.lsn = startpos + buf.len() as u64;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn truncate_wal(&mut self, end_pos: Lsn) -> Result<()> {
|
||||
async fn truncate_wal(&mut self, end_pos: Lsn) -> Result<()> {
|
||||
self.lsn = end_pos;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn flush_wal(&mut self) -> Result<()> {
|
||||
async fn flush_wal(&mut self) -> Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn remove_up_to(&self) -> Box<dyn Fn(XLogSegNo) -> Result<()>> {
|
||||
Box::new(move |_segno_up_to: XLogSegNo| Ok(()))
|
||||
fn remove_up_to(&self, _segno_up_to: XLogSegNo) -> BoxFuture<'static, anyhow::Result<()>> {
|
||||
Box::pin(async { Ok(()) })
|
||||
}
|
||||
|
||||
fn get_metrics(&self) -> crate::metrics::WalStorageMetrics {
|
||||
@@ -1067,8 +1080,8 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_voting() {
|
||||
#[tokio::test]
|
||||
async fn test_voting() {
|
||||
let storage = InMemoryState {
|
||||
persisted_state: test_sk_state(),
|
||||
};
|
||||
@@ -1077,7 +1090,7 @@ mod tests {
|
||||
|
||||
// check voting for 1 is ok
|
||||
let vote_request = ProposerAcceptorMessage::VoteRequest(VoteRequest { term: 1 });
|
||||
let mut vote_resp = sk.process_msg(&vote_request);
|
||||
let mut vote_resp = sk.process_msg(&vote_request).await;
|
||||
match vote_resp.unwrap() {
|
||||
Some(AcceptorProposerMessage::VoteResponse(resp)) => assert!(resp.vote_given != 0),
|
||||
r => panic!("unexpected response: {:?}", r),
|
||||
@@ -1092,15 +1105,15 @@ mod tests {
|
||||
sk = SafeKeeper::new(storage, sk.wal_store, NodeId(0)).unwrap();
|
||||
|
||||
// and ensure voting second time for 1 is not ok
|
||||
vote_resp = sk.process_msg(&vote_request);
|
||||
vote_resp = sk.process_msg(&vote_request).await;
|
||||
match vote_resp.unwrap() {
|
||||
Some(AcceptorProposerMessage::VoteResponse(resp)) => assert!(resp.vote_given == 0),
|
||||
r => panic!("unexpected response: {:?}", r),
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_epoch_switch() {
|
||||
#[tokio::test]
|
||||
async fn test_epoch_switch() {
|
||||
let storage = InMemoryState {
|
||||
persisted_state: test_sk_state(),
|
||||
};
|
||||
@@ -1132,10 +1145,13 @@ mod tests {
|
||||
timeline_start_lsn: Lsn(0),
|
||||
};
|
||||
sk.process_msg(&ProposerAcceptorMessage::Elected(pem))
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
// check that AppendRequest before epochStartLsn doesn't switch epoch
|
||||
let resp = sk.process_msg(&ProposerAcceptorMessage::AppendRequest(append_request));
|
||||
let resp = sk
|
||||
.process_msg(&ProposerAcceptorMessage::AppendRequest(append_request))
|
||||
.await;
|
||||
assert!(resp.is_ok());
|
||||
assert_eq!(sk.get_epoch(), 0);
|
||||
|
||||
@@ -1146,9 +1162,11 @@ mod tests {
|
||||
h: ar_hdr,
|
||||
wal_data: Bytes::from_static(b"b"),
|
||||
};
|
||||
let resp = sk.process_msg(&ProposerAcceptorMessage::AppendRequest(append_request));
|
||||
let resp = sk
|
||||
.process_msg(&ProposerAcceptorMessage::AppendRequest(append_request))
|
||||
.await;
|
||||
assert!(resp.is_ok());
|
||||
sk.wal_store.truncate_wal(Lsn(3)).unwrap(); // imitate the complete record at 3 %)
|
||||
sk.wal_store.truncate_wal(Lsn(3)).await.unwrap(); // imitate the complete record at 3 %)
|
||||
assert_eq!(sk.get_epoch(), 1);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -396,7 +396,7 @@ impl SafekeeperPostgresHandler {
|
||||
// on this safekeeper itself. That's ok as (old) proposer will never be
|
||||
// able to commit such WAL.
|
||||
let stop_pos: Option<Lsn> = if self.is_walproposer_recovery() {
|
||||
let wal_end = tli.get_flush_lsn();
|
||||
let wal_end = tli.get_flush_lsn().await;
|
||||
Some(wal_end)
|
||||
} else {
|
||||
None
|
||||
@@ -418,7 +418,7 @@ impl SafekeeperPostgresHandler {
|
||||
// switch to copy
|
||||
pgb.write_message(&BeMessage::CopyBothResponse).await?;
|
||||
|
||||
let (_, persisted_state) = tli.get_state();
|
||||
let (_, persisted_state) = tli.get_state().await;
|
||||
let wal_reader = WalReader::new(
|
||||
self.conf.workdir.clone(),
|
||||
self.conf.timeline_dir(&tli.ttid),
|
||||
@@ -562,7 +562,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> WalSender<'_, IO> {
|
||||
.walsenders
|
||||
.get_ws_remote_consistent_lsn(self.ws_guard.id)
|
||||
{
|
||||
if self.tli.should_walsender_stop(remote_consistent_lsn) {
|
||||
if self.tli.should_walsender_stop(remote_consistent_lsn).await {
|
||||
// Terminate if there is nothing more to send.
|
||||
return Err(CopyStreamHandlerEnd::ServerInitiated(format!(
|
||||
"ending streaming to {:?} at {}, receiver is caughtup and there is no computes",
|
||||
|
||||
@@ -2,12 +2,13 @@
|
||||
//! to glue together SafeKeeper and all other background services.
|
||||
|
||||
use anyhow::{anyhow, bail, Result};
|
||||
use parking_lot::{Mutex, MutexGuard};
|
||||
use postgres_ffi::XLogSegNo;
|
||||
use tokio::fs;
|
||||
|
||||
use std::cmp::max;
|
||||
use std::path::PathBuf;
|
||||
use std::sync::Arc;
|
||||
use tokio::sync::{Mutex, MutexGuard};
|
||||
use tokio::{
|
||||
sync::{mpsc::Sender, watch},
|
||||
time::Instant,
|
||||
@@ -265,7 +266,7 @@ impl From<TimelineError> for ApiError {
|
||||
fn from(te: TimelineError) -> ApiError {
|
||||
match te {
|
||||
TimelineError::NotFound(ttid) => {
|
||||
ApiError::NotFound(anyhow!("timeline {} not found", ttid))
|
||||
ApiError::NotFound(anyhow!("timeline {} not found", ttid).into())
|
||||
}
|
||||
_ => ApiError::InternalServerError(anyhow!("{}", te)),
|
||||
}
|
||||
@@ -286,8 +287,9 @@ pub struct Timeline {
|
||||
commit_lsn_watch_tx: watch::Sender<Lsn>,
|
||||
commit_lsn_watch_rx: watch::Receiver<Lsn>,
|
||||
|
||||
/// Safekeeper and other state, that should remain consistent and synchronized
|
||||
/// with the disk.
|
||||
/// Safekeeper and other state, that should remain consistent and
|
||||
/// synchronized with the disk. This is tokio mutex as we write WAL to disk
|
||||
/// while holding it, ensuring that consensus checks are in order.
|
||||
mutex: Mutex<SharedState>,
|
||||
walsenders: Arc<WalSenders>,
|
||||
|
||||
@@ -361,8 +363,8 @@ impl Timeline {
|
||||
///
|
||||
/// Bootstrap is transactional, so if it fails, created files will be deleted,
|
||||
/// and state on disk should remain unchanged.
|
||||
pub fn bootstrap(&self, shared_state: &mut MutexGuard<SharedState>) -> Result<()> {
|
||||
match std::fs::metadata(&self.timeline_dir) {
|
||||
pub async fn bootstrap(&self, shared_state: &mut MutexGuard<'_, SharedState>) -> Result<()> {
|
||||
match fs::metadata(&self.timeline_dir).await {
|
||||
Ok(_) => {
|
||||
// Timeline directory exists on disk, we should leave state unchanged
|
||||
// and return error.
|
||||
@@ -375,53 +377,51 @@ impl Timeline {
|
||||
}
|
||||
|
||||
// Create timeline directory.
|
||||
std::fs::create_dir_all(&self.timeline_dir)?;
|
||||
fs::create_dir_all(&self.timeline_dir).await?;
|
||||
|
||||
// Write timeline to disk and TODO: start background tasks.
|
||||
match || -> Result<()> {
|
||||
shared_state.sk.persist()?;
|
||||
// TODO: add more initialization steps here
|
||||
self.update_status(shared_state);
|
||||
Ok(())
|
||||
}() {
|
||||
Ok(_) => Ok(()),
|
||||
Err(e) => {
|
||||
// Bootstrap failed, cancel timeline and remove timeline directory.
|
||||
self.cancel(shared_state);
|
||||
if let Err(e) = shared_state.sk.persist().await {
|
||||
// Bootstrap failed, cancel timeline and remove timeline directory.
|
||||
self.cancel(shared_state);
|
||||
|
||||
if let Err(fs_err) = std::fs::remove_dir_all(&self.timeline_dir) {
|
||||
warn!(
|
||||
"failed to remove timeline {} directory after bootstrap failure: {}",
|
||||
self.ttid, fs_err
|
||||
);
|
||||
}
|
||||
|
||||
Err(e)
|
||||
if let Err(fs_err) = fs::remove_dir_all(&self.timeline_dir).await {
|
||||
warn!(
|
||||
"failed to remove timeline {} directory after bootstrap failure: {}",
|
||||
self.ttid, fs_err
|
||||
);
|
||||
}
|
||||
|
||||
return Err(e);
|
||||
}
|
||||
|
||||
// TODO: add more initialization steps here
|
||||
self.update_status(shared_state);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Delete timeline from disk completely, by removing timeline directory. Background
|
||||
/// timeline activities will stop eventually.
|
||||
pub fn delete_from_disk(
|
||||
pub async fn delete_from_disk(
|
||||
&self,
|
||||
shared_state: &mut MutexGuard<SharedState>,
|
||||
shared_state: &mut MutexGuard<'_, SharedState>,
|
||||
) -> Result<(bool, bool)> {
|
||||
let was_active = shared_state.active;
|
||||
self.cancel(shared_state);
|
||||
let dir_existed = delete_dir(&self.timeline_dir)?;
|
||||
let dir_existed = delete_dir(&self.timeline_dir).await?;
|
||||
Ok((dir_existed, was_active))
|
||||
}
|
||||
|
||||
/// Cancel timeline to prevent further usage. Background tasks will stop
|
||||
/// eventually after receiving cancellation signal.
|
||||
fn cancel(&self, shared_state: &mut MutexGuard<SharedState>) {
|
||||
///
|
||||
/// Note that we can't notify backup launcher here while holding
|
||||
/// shared_state lock, as this is a potential deadlock: caller is
|
||||
/// responsible for that. Generally we should probably make WAL backup tasks
|
||||
/// to shut down on their own, checking once in a while whether it is the
|
||||
/// time.
|
||||
fn cancel(&self, shared_state: &mut MutexGuard<'_, SharedState>) {
|
||||
info!("timeline {} is cancelled", self.ttid);
|
||||
let _ = self.cancellation_tx.send(true);
|
||||
let res = self.wal_backup_launcher_tx.blocking_send(self.ttid);
|
||||
if let Err(e) = res {
|
||||
error!("Failed to send stop signal to wal_backup_launcher: {}", e);
|
||||
}
|
||||
// Close associated FDs. Nobody will be able to touch timeline data once
|
||||
// it is cancelled, so WAL storage won't be opened again.
|
||||
shared_state.sk.wal_store.close();
|
||||
@@ -433,8 +433,8 @@ impl Timeline {
|
||||
}
|
||||
|
||||
/// Take a writing mutual exclusive lock on timeline shared_state.
|
||||
pub fn write_shared_state(&self) -> MutexGuard<SharedState> {
|
||||
self.mutex.lock()
|
||||
pub async fn write_shared_state(&self) -> MutexGuard<SharedState> {
|
||||
self.mutex.lock().await
|
||||
}
|
||||
|
||||
fn update_status(&self, shared_state: &mut SharedState) -> bool {
|
||||
@@ -450,7 +450,7 @@ impl Timeline {
|
||||
|
||||
let is_wal_backup_action_pending: bool;
|
||||
{
|
||||
let mut shared_state = self.write_shared_state();
|
||||
let mut shared_state = self.write_shared_state().await;
|
||||
shared_state.num_computes += 1;
|
||||
is_wal_backup_action_pending = self.update_status(&mut shared_state);
|
||||
}
|
||||
@@ -464,22 +464,17 @@ impl Timeline {
|
||||
|
||||
/// De-register compute connection, shutting down timeline activity if
|
||||
/// pageserver doesn't need catchup.
|
||||
pub fn on_compute_disconnect(&self) -> Result<()> {
|
||||
pub async fn on_compute_disconnect(&self) -> Result<()> {
|
||||
let is_wal_backup_action_pending: bool;
|
||||
{
|
||||
let mut shared_state = self.write_shared_state();
|
||||
let mut shared_state = self.write_shared_state().await;
|
||||
shared_state.num_computes -= 1;
|
||||
is_wal_backup_action_pending = self.update_status(&mut shared_state);
|
||||
}
|
||||
// Wake up wal backup launcher, if it is time to stop the offloading.
|
||||
if is_wal_backup_action_pending {
|
||||
// Can fail only if channel to a static thread got closed, which is not normal at all.
|
||||
//
|
||||
// Note: this is blocking_send because on_compute_disconnect is called in Drop, there is
|
||||
// no async Drop and we use current thread runtimes. With current thread rt spawning
|
||||
// task in drop impl is racy, as thread along with runtime might finish before the task.
|
||||
// This should be switched send.await when/if we go to full async.
|
||||
self.wal_backup_launcher_tx.blocking_send(self.ttid)?;
|
||||
self.wal_backup_launcher_tx.send(self.ttid).await?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
@@ -489,11 +484,11 @@ impl Timeline {
|
||||
/// computes. While there might be nothing to stream already, we learn about
|
||||
/// remote_consistent_lsn update through replication feedback, and we want
|
||||
/// to stop pushing to the broker if pageserver is fully caughtup.
|
||||
pub fn should_walsender_stop(&self, reported_remote_consistent_lsn: Lsn) -> bool {
|
||||
pub async fn should_walsender_stop(&self, reported_remote_consistent_lsn: Lsn) -> bool {
|
||||
if self.is_cancelled() {
|
||||
return true;
|
||||
}
|
||||
let shared_state = self.write_shared_state();
|
||||
let shared_state = self.write_shared_state().await;
|
||||
if shared_state.num_computes == 0 {
|
||||
return shared_state.sk.inmem.commit_lsn == Lsn(0) || // no data at all yet
|
||||
reported_remote_consistent_lsn >= shared_state.sk.inmem.commit_lsn;
|
||||
@@ -503,12 +498,12 @@ impl Timeline {
|
||||
|
||||
/// Returns whether s3 offloading is required and sets current status as
|
||||
/// matching it.
|
||||
pub fn wal_backup_attend(&self) -> bool {
|
||||
pub async fn wal_backup_attend(&self) -> bool {
|
||||
if self.is_cancelled() {
|
||||
return false;
|
||||
}
|
||||
|
||||
self.write_shared_state().wal_backup_attend()
|
||||
self.write_shared_state().await.wal_backup_attend()
|
||||
}
|
||||
|
||||
/// Returns commit_lsn watch channel.
|
||||
@@ -517,7 +512,7 @@ impl Timeline {
|
||||
}
|
||||
|
||||
/// Pass arrived message to the safekeeper.
|
||||
pub fn process_msg(
|
||||
pub async fn process_msg(
|
||||
&self,
|
||||
msg: &ProposerAcceptorMessage,
|
||||
) -> Result<Option<AcceptorProposerMessage>> {
|
||||
@@ -528,8 +523,8 @@ impl Timeline {
|
||||
let mut rmsg: Option<AcceptorProposerMessage>;
|
||||
let commit_lsn: Lsn;
|
||||
{
|
||||
let mut shared_state = self.write_shared_state();
|
||||
rmsg = shared_state.sk.process_msg(msg)?;
|
||||
let mut shared_state = self.write_shared_state().await;
|
||||
rmsg = shared_state.sk.process_msg(msg).await?;
|
||||
|
||||
// if this is AppendResponse, fill in proper pageserver and hot
|
||||
// standby feedback.
|
||||
@@ -546,37 +541,37 @@ impl Timeline {
|
||||
}
|
||||
|
||||
/// Returns wal_seg_size.
|
||||
pub fn get_wal_seg_size(&self) -> usize {
|
||||
self.write_shared_state().get_wal_seg_size()
|
||||
pub async fn get_wal_seg_size(&self) -> usize {
|
||||
self.write_shared_state().await.get_wal_seg_size()
|
||||
}
|
||||
|
||||
/// Returns true only if the timeline is loaded and active.
|
||||
pub fn is_active(&self) -> bool {
|
||||
pub async fn is_active(&self) -> bool {
|
||||
if self.is_cancelled() {
|
||||
return false;
|
||||
}
|
||||
|
||||
self.write_shared_state().active
|
||||
self.write_shared_state().await.active
|
||||
}
|
||||
|
||||
/// Returns state of the timeline.
|
||||
pub fn get_state(&self) -> (SafekeeperMemState, SafeKeeperState) {
|
||||
let state = self.write_shared_state();
|
||||
pub async fn get_state(&self) -> (SafekeeperMemState, SafeKeeperState) {
|
||||
let state = self.write_shared_state().await;
|
||||
(state.sk.inmem.clone(), state.sk.state.clone())
|
||||
}
|
||||
|
||||
/// Returns latest backup_lsn.
|
||||
pub fn get_wal_backup_lsn(&self) -> Lsn {
|
||||
self.write_shared_state().sk.inmem.backup_lsn
|
||||
pub async fn get_wal_backup_lsn(&self) -> Lsn {
|
||||
self.write_shared_state().await.sk.inmem.backup_lsn
|
||||
}
|
||||
|
||||
/// Sets backup_lsn to the given value.
|
||||
pub fn set_wal_backup_lsn(&self, backup_lsn: Lsn) -> Result<()> {
|
||||
pub async fn set_wal_backup_lsn(&self, backup_lsn: Lsn) -> Result<()> {
|
||||
if self.is_cancelled() {
|
||||
bail!(TimelineError::Cancelled(self.ttid));
|
||||
}
|
||||
|
||||
let mut state = self.write_shared_state();
|
||||
let mut state = self.write_shared_state().await;
|
||||
state.sk.inmem.backup_lsn = max(state.sk.inmem.backup_lsn, backup_lsn);
|
||||
// we should check whether to shut down offloader, but this will be done
|
||||
// soon by peer communication anyway.
|
||||
@@ -584,8 +579,8 @@ impl Timeline {
|
||||
}
|
||||
|
||||
/// Get safekeeper info for broadcasting to broker and other peers.
|
||||
pub fn get_safekeeper_info(&self, conf: &SafeKeeperConf) -> SafekeeperTimelineInfo {
|
||||
let shared_state = self.write_shared_state();
|
||||
pub async fn get_safekeeper_info(&self, conf: &SafeKeeperConf) -> SafekeeperTimelineInfo {
|
||||
let shared_state = self.write_shared_state().await;
|
||||
shared_state.get_safekeeper_info(
|
||||
&self.ttid,
|
||||
conf,
|
||||
@@ -604,8 +599,8 @@ impl Timeline {
|
||||
let is_wal_backup_action_pending: bool;
|
||||
let commit_lsn: Lsn;
|
||||
{
|
||||
let mut shared_state = self.write_shared_state();
|
||||
shared_state.sk.record_safekeeper_info(&sk_info)?;
|
||||
let mut shared_state = self.write_shared_state().await;
|
||||
shared_state.sk.record_safekeeper_info(&sk_info).await?;
|
||||
let peer_info = PeerInfo::from_sk_info(&sk_info, Instant::now());
|
||||
shared_state.peers_info.upsert(&peer_info);
|
||||
is_wal_backup_action_pending = self.update_status(&mut shared_state);
|
||||
@@ -622,8 +617,8 @@ impl Timeline {
|
||||
/// Get our latest view of alive peers status on the timeline.
|
||||
/// We pass our own info through the broker as well, so when we don't have connection
|
||||
/// to the broker returned vec is empty.
|
||||
pub fn get_peers(&self, conf: &SafeKeeperConf) -> Vec<PeerInfo> {
|
||||
let shared_state = self.write_shared_state();
|
||||
pub async fn get_peers(&self, conf: &SafeKeeperConf) -> Vec<PeerInfo> {
|
||||
let shared_state = self.write_shared_state().await;
|
||||
let now = Instant::now();
|
||||
shared_state
|
||||
.peers_info
|
||||
@@ -640,34 +635,34 @@ impl Timeline {
|
||||
}
|
||||
|
||||
/// Returns flush_lsn.
|
||||
pub fn get_flush_lsn(&self) -> Lsn {
|
||||
self.write_shared_state().sk.wal_store.flush_lsn()
|
||||
pub async fn get_flush_lsn(&self) -> Lsn {
|
||||
self.write_shared_state().await.sk.wal_store.flush_lsn()
|
||||
}
|
||||
|
||||
/// Delete WAL segments from disk that are no longer needed. This is determined
|
||||
/// based on pageserver's remote_consistent_lsn and local backup_lsn/peer_lsn.
|
||||
pub fn remove_old_wal(&self, wal_backup_enabled: bool) -> Result<()> {
|
||||
pub async fn remove_old_wal(&self, wal_backup_enabled: bool) -> Result<()> {
|
||||
if self.is_cancelled() {
|
||||
bail!(TimelineError::Cancelled(self.ttid));
|
||||
}
|
||||
|
||||
let horizon_segno: XLogSegNo;
|
||||
let remover: Box<dyn Fn(u64) -> Result<(), anyhow::Error>>;
|
||||
{
|
||||
let shared_state = self.write_shared_state();
|
||||
let remover = {
|
||||
let shared_state = self.write_shared_state().await;
|
||||
horizon_segno = shared_state.sk.get_horizon_segno(wal_backup_enabled);
|
||||
remover = shared_state.sk.wal_store.remove_up_to();
|
||||
if horizon_segno <= 1 || horizon_segno <= shared_state.last_removed_segno {
|
||||
return Ok(());
|
||||
return Ok(()); // nothing to do
|
||||
}
|
||||
let remover = shared_state.sk.wal_store.remove_up_to(horizon_segno - 1);
|
||||
// release the lock before removing
|
||||
}
|
||||
remover
|
||||
};
|
||||
|
||||
// delete old WAL files
|
||||
remover(horizon_segno - 1)?;
|
||||
remover.await?;
|
||||
|
||||
// update last_removed_segno
|
||||
let mut shared_state = self.write_shared_state();
|
||||
let mut shared_state = self.write_shared_state().await;
|
||||
shared_state.last_removed_segno = horizon_segno;
|
||||
Ok(())
|
||||
}
|
||||
@@ -676,22 +671,24 @@ impl Timeline {
|
||||
/// passed after the last save. This helps to keep remote_consistent_lsn up
|
||||
/// to date so that storage nodes restart doesn't cause many pageserver ->
|
||||
/// safekeeper reconnections.
|
||||
pub fn maybe_pesist_control_file(&self) -> Result<()> {
|
||||
pub async fn maybe_persist_control_file(&self) -> Result<()> {
|
||||
let remote_consistent_lsn = self.walsenders.get_remote_consistent_lsn();
|
||||
self.write_shared_state()
|
||||
.await
|
||||
.sk
|
||||
.maybe_persist_control_file(remote_consistent_lsn)
|
||||
.await
|
||||
}
|
||||
|
||||
/// Returns full timeline info, required for the metrics. If the timeline is
|
||||
/// not active, returns None instead.
|
||||
pub fn info_for_metrics(&self) -> Option<FullTimelineInfo> {
|
||||
/// Gather timeline data for metrics. If the timeline is not active, returns
|
||||
/// None, we do not collect these.
|
||||
pub async fn info_for_metrics(&self) -> Option<FullTimelineInfo> {
|
||||
if self.is_cancelled() {
|
||||
return None;
|
||||
}
|
||||
|
||||
let ps_feedback = self.walsenders.get_ps_feedback();
|
||||
let state = self.write_shared_state();
|
||||
let state = self.write_shared_state().await;
|
||||
if state.active {
|
||||
Some(FullTimelineInfo {
|
||||
ttid: self.ttid,
|
||||
@@ -713,8 +710,8 @@ impl Timeline {
|
||||
}
|
||||
|
||||
/// Returns in-memory timeline state to build a full debug dump.
|
||||
pub fn memory_dump(&self) -> debug_dump::Memory {
|
||||
let state = self.write_shared_state();
|
||||
pub async fn memory_dump(&self) -> debug_dump::Memory {
|
||||
let state = self.write_shared_state().await;
|
||||
|
||||
let (write_lsn, write_record_lsn, flush_lsn, file_open) =
|
||||
state.sk.wal_store.internal_state();
|
||||
@@ -738,8 +735,8 @@ impl Timeline {
|
||||
}
|
||||
|
||||
/// Deletes directory and it's contents. Returns false if directory does not exist.
|
||||
fn delete_dir(path: &PathBuf) -> Result<bool> {
|
||||
match std::fs::remove_dir_all(path) {
|
||||
async fn delete_dir(path: &PathBuf) -> Result<bool> {
|
||||
match fs::remove_dir_all(path).await {
|
||||
Ok(_) => Ok(true),
|
||||
Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(false),
|
||||
Err(e) => Err(e.into()),
|
||||
|
||||
@@ -113,9 +113,17 @@ impl GlobalTimelines {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Loads all timelines for the given tenant to memory. Returns fs::read_dir errors if any.
|
||||
/// Loads all timelines for the given tenant to memory. Returns fs::read_dir
|
||||
/// errors if any.
|
||||
///
|
||||
/// Note: This function (and all reading/loading below) is sync because
|
||||
/// timelines are loaded while holding GlobalTimelinesState lock. Which is
|
||||
/// fine as this is called only from single threaded main runtime on boot,
|
||||
/// but clippy complains anyway, and suppressing that isn't trivial as async
|
||||
/// is the keyword, ha. That only other user is pull_timeline.rs for which
|
||||
/// being blocked is not that bad, and we can do spawn_blocking.
|
||||
fn load_tenant_timelines(
|
||||
state: &mut MutexGuard<GlobalTimelinesState>,
|
||||
state: &mut MutexGuard<'_, GlobalTimelinesState>,
|
||||
tenant_id: TenantId,
|
||||
) -> Result<()> {
|
||||
let timelines_dir = state.get_conf().tenant_dir(&tenant_id);
|
||||
@@ -220,7 +228,7 @@ impl GlobalTimelines {
|
||||
// Take a lock and finish the initialization holding this mutex. No other threads
|
||||
// can interfere with creation after we will insert timeline into the map.
|
||||
{
|
||||
let mut shared_state = timeline.write_shared_state();
|
||||
let mut shared_state = timeline.write_shared_state().await;
|
||||
|
||||
// We can get a race condition here in case of concurrent create calls, but only
|
||||
// in theory. create() will return valid timeline on the next try.
|
||||
@@ -232,7 +240,7 @@ impl GlobalTimelines {
|
||||
// Write the new timeline to the disk and start background workers.
|
||||
// Bootstrap is transactional, so if it fails, the timeline will be deleted,
|
||||
// and the state on disk should remain unchanged.
|
||||
if let Err(e) = timeline.bootstrap(&mut shared_state) {
|
||||
if let Err(e) = timeline.bootstrap(&mut shared_state).await {
|
||||
// Note: the most likely reason for bootstrap failure is that the timeline
|
||||
// directory already exists on disk. This happens when timeline is corrupted
|
||||
// and wasn't loaded from disk on startup because of that. We want to preserve
|
||||
@@ -294,15 +302,16 @@ impl GlobalTimelines {
|
||||
}
|
||||
|
||||
/// Cancels timeline, then deletes the corresponding data directory.
|
||||
pub fn delete_force(ttid: &TenantTimelineId) -> Result<TimelineDeleteForceResult> {
|
||||
pub async fn delete_force(ttid: &TenantTimelineId) -> Result<TimelineDeleteForceResult> {
|
||||
let tli_res = TIMELINES_STATE.lock().unwrap().get(ttid);
|
||||
match tli_res {
|
||||
Ok(timeline) => {
|
||||
// Take a lock and finish the deletion holding this mutex.
|
||||
let mut shared_state = timeline.write_shared_state();
|
||||
let mut shared_state = timeline.write_shared_state().await;
|
||||
|
||||
info!("deleting timeline {}", ttid);
|
||||
let (dir_existed, was_active) = timeline.delete_from_disk(&mut shared_state)?;
|
||||
let (dir_existed, was_active) =
|
||||
timeline.delete_from_disk(&mut shared_state).await?;
|
||||
|
||||
// Remove timeline from the map.
|
||||
// FIXME: re-enable it once we fix the issue with recreation of deleted timelines
|
||||
@@ -335,7 +344,7 @@ impl GlobalTimelines {
|
||||
/// the tenant had, `true` if a timeline was active. There may be a race if new timelines are
|
||||
/// created simultaneously. In that case the function will return error and the caller should
|
||||
/// retry tenant deletion again later.
|
||||
pub fn delete_force_all_for_tenant(
|
||||
pub async fn delete_force_all_for_tenant(
|
||||
tenant_id: &TenantId,
|
||||
) -> Result<HashMap<TenantTimelineId, TimelineDeleteForceResult>> {
|
||||
info!("deleting all timelines for tenant {}", tenant_id);
|
||||
@@ -345,7 +354,7 @@ impl GlobalTimelines {
|
||||
|
||||
let mut deleted = HashMap::new();
|
||||
for tli in &to_delete {
|
||||
match Self::delete_force(&tli.ttid) {
|
||||
match Self::delete_force(&tli.ttid).await {
|
||||
Ok(result) => {
|
||||
deleted.insert(tli.ttid, result);
|
||||
}
|
||||
|
||||
@@ -17,7 +17,6 @@ use postgres_ffi::XLogFileName;
|
||||
use postgres_ffi::{XLogSegNo, PG_TLI};
|
||||
use remote_storage::{GenericRemoteStorage, RemotePath};
|
||||
use tokio::fs::File;
|
||||
use tokio::runtime::Builder;
|
||||
|
||||
use tokio::select;
|
||||
use tokio::sync::mpsc::{self, Receiver, Sender};
|
||||
@@ -36,30 +35,16 @@ use once_cell::sync::OnceCell;
|
||||
const UPLOAD_FAILURE_RETRY_MIN_MS: u64 = 10;
|
||||
const UPLOAD_FAILURE_RETRY_MAX_MS: u64 = 5000;
|
||||
|
||||
pub fn wal_backup_launcher_thread_main(
|
||||
conf: SafeKeeperConf,
|
||||
wal_backup_launcher_rx: Receiver<TenantTimelineId>,
|
||||
) {
|
||||
let mut builder = Builder::new_multi_thread();
|
||||
if let Some(num_threads) = conf.backup_runtime_threads {
|
||||
builder.worker_threads(num_threads);
|
||||
}
|
||||
let rt = builder
|
||||
.enable_all()
|
||||
.build()
|
||||
.expect("failed to create wal backup runtime");
|
||||
|
||||
rt.block_on(async {
|
||||
wal_backup_launcher_main_loop(conf, wal_backup_launcher_rx).await;
|
||||
});
|
||||
}
|
||||
|
||||
/// Check whether wal backup is required for timeline. If yes, mark that launcher is
|
||||
/// aware of current status and return the timeline.
|
||||
fn is_wal_backup_required(ttid: TenantTimelineId) -> Option<Arc<Timeline>> {
|
||||
GlobalTimelines::get(ttid)
|
||||
.ok()
|
||||
.filter(|tli| tli.wal_backup_attend())
|
||||
async fn is_wal_backup_required(ttid: TenantTimelineId) -> Option<Arc<Timeline>> {
|
||||
match GlobalTimelines::get(ttid).ok() {
|
||||
Some(tli) => {
|
||||
tli.wal_backup_attend().await;
|
||||
Some(tli)
|
||||
}
|
||||
None => None,
|
||||
}
|
||||
}
|
||||
|
||||
struct WalBackupTaskHandle {
|
||||
@@ -143,8 +128,8 @@ async fn update_task(
|
||||
ttid: TenantTimelineId,
|
||||
entry: &mut WalBackupTimelineEntry,
|
||||
) {
|
||||
let alive_peers = entry.timeline.get_peers(conf);
|
||||
let wal_backup_lsn = entry.timeline.get_wal_backup_lsn();
|
||||
let alive_peers = entry.timeline.get_peers(conf).await;
|
||||
let wal_backup_lsn = entry.timeline.get_wal_backup_lsn().await;
|
||||
let (offloader, election_dbg_str) =
|
||||
determine_offloader(&alive_peers, wal_backup_lsn, ttid, conf);
|
||||
let elected_me = Some(conf.my_id) == offloader;
|
||||
@@ -183,10 +168,10 @@ const CHECK_TASKS_INTERVAL_MSEC: u64 = 1000;
|
||||
/// Sits on wal_backup_launcher_rx and starts/stops per timeline wal backup
|
||||
/// tasks. Having this in separate task simplifies locking, allows to reap
|
||||
/// panics and separate elections from offloading itself.
|
||||
async fn wal_backup_launcher_main_loop(
|
||||
pub async fn wal_backup_launcher_task_main(
|
||||
conf: SafeKeeperConf,
|
||||
mut wal_backup_launcher_rx: Receiver<TenantTimelineId>,
|
||||
) {
|
||||
) -> anyhow::Result<()> {
|
||||
info!(
|
||||
"WAL backup launcher started, remote config {:?}",
|
||||
conf.remote_storage
|
||||
@@ -214,7 +199,7 @@ async fn wal_backup_launcher_main_loop(
|
||||
if conf.remote_storage.is_none() || !conf.wal_backup_enabled {
|
||||
continue; /* just drain the channel and do nothing */
|
||||
}
|
||||
let timeline = is_wal_backup_required(ttid);
|
||||
let timeline = is_wal_backup_required(ttid).await;
|
||||
// do we need to do anything at all?
|
||||
if timeline.is_some() != tasks.contains_key(&ttid) {
|
||||
if let Some(timeline) = timeline {
|
||||
@@ -269,7 +254,7 @@ async fn backup_task_main(
|
||||
let tli = res.unwrap();
|
||||
|
||||
let mut wb = WalBackupTask {
|
||||
wal_seg_size: tli.get_wal_seg_size(),
|
||||
wal_seg_size: tli.get_wal_seg_size().await,
|
||||
commit_lsn_watch_rx: tli.get_commit_lsn_watch_rx(),
|
||||
timeline: tli,
|
||||
timeline_dir,
|
||||
@@ -326,7 +311,7 @@ impl WalBackupTask {
|
||||
continue; /* nothing to do, common case as we wake up on every commit_lsn bump */
|
||||
}
|
||||
// Perhaps peers advanced the position, check shmem value.
|
||||
backup_lsn = self.timeline.get_wal_backup_lsn();
|
||||
backup_lsn = self.timeline.get_wal_backup_lsn().await;
|
||||
if backup_lsn.segment_number(self.wal_seg_size)
|
||||
>= commit_lsn.segment_number(self.wal_seg_size)
|
||||
{
|
||||
@@ -402,6 +387,7 @@ pub async fn backup_lsn_range(
|
||||
let new_backup_lsn = segment.end_lsn;
|
||||
timeline
|
||||
.set_wal_backup_lsn(new_backup_lsn)
|
||||
.await
|
||||
.context("setting wal_backup_lsn")?;
|
||||
*backup_lsn = new_backup_lsn;
|
||||
} else {
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
//!
|
||||
use anyhow::{Context, Result};
|
||||
use postgres_backend::QueryError;
|
||||
use std::{future, thread, time::Duration};
|
||||
use std::{future, time::Duration};
|
||||
use tokio::net::TcpStream;
|
||||
use tokio_io_timeout::TimeoutReader;
|
||||
use tracing::*;
|
||||
@@ -16,104 +16,82 @@ use crate::SafeKeeperConf;
|
||||
use postgres_backend::{AuthType, PostgresBackend};
|
||||
|
||||
/// Accept incoming TCP connections and spawn them into a background thread.
|
||||
pub fn thread_main(conf: SafeKeeperConf, pg_listener: std::net::TcpListener) {
|
||||
let runtime = tokio::runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
.build()
|
||||
.context("create runtime")
|
||||
// todo catch error in main thread
|
||||
.expect("failed to create runtime");
|
||||
pub async fn task_main(
|
||||
conf: SafeKeeperConf,
|
||||
pg_listener: std::net::TcpListener,
|
||||
) -> anyhow::Result<()> {
|
||||
// Tokio's from_std won't do this for us, per its comment.
|
||||
pg_listener.set_nonblocking(true)?;
|
||||
|
||||
runtime
|
||||
.block_on(async move {
|
||||
// Tokio's from_std won't do this for us, per its comment.
|
||||
pg_listener.set_nonblocking(true)?;
|
||||
let listener = tokio::net::TcpListener::from_std(pg_listener)?;
|
||||
let mut connection_count: ConnectionCount = 0;
|
||||
let listener = tokio::net::TcpListener::from_std(pg_listener)?;
|
||||
let mut connection_count: ConnectionCount = 0;
|
||||
|
||||
loop {
|
||||
match listener.accept().await {
|
||||
Ok((socket, peer_addr)) => {
|
||||
debug!("accepted connection from {}", peer_addr);
|
||||
let conf = conf.clone();
|
||||
let conn_id = issue_connection_id(&mut connection_count);
|
||||
loop {
|
||||
let (socket, peer_addr) = listener.accept().await.context("accept")?;
|
||||
debug!("accepted connection from {}", peer_addr);
|
||||
let conf = conf.clone();
|
||||
let conn_id = issue_connection_id(&mut connection_count);
|
||||
|
||||
let _ = thread::Builder::new()
|
||||
.name("WAL service thread".into())
|
||||
.spawn(move || {
|
||||
if let Err(err) = handle_socket(socket, conf, conn_id) {
|
||||
error!("connection handler exited: {}", err);
|
||||
}
|
||||
})
|
||||
.unwrap();
|
||||
}
|
||||
Err(e) => error!("Failed to accept connection: {}", e),
|
||||
}
|
||||
tokio::spawn(async move {
|
||||
if let Err(err) = handle_socket(socket, conf, conn_id)
|
||||
.instrument(info_span!("", cid = %conn_id))
|
||||
.await
|
||||
{
|
||||
error!("connection handler exited: {}", err);
|
||||
}
|
||||
#[allow(unreachable_code)] // hint compiler the closure return type
|
||||
Ok::<(), anyhow::Error>(())
|
||||
})
|
||||
.expect("listener failed")
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
/// This is run by `thread_main` above, inside a background thread.
|
||||
/// This is run by `task_main` above, inside a background thread.
|
||||
///
|
||||
fn handle_socket(
|
||||
async fn handle_socket(
|
||||
socket: TcpStream,
|
||||
conf: SafeKeeperConf,
|
||||
conn_id: ConnectionId,
|
||||
) -> Result<(), QueryError> {
|
||||
let _enter = info_span!("", cid = %conn_id).entered();
|
||||
|
||||
let runtime = tokio::runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
.build()?;
|
||||
|
||||
socket.set_nodelay(true)?;
|
||||
let peer_addr = socket.peer_addr()?;
|
||||
|
||||
// TimeoutReader wants async runtime during creation.
|
||||
runtime.block_on(async move {
|
||||
// Set timeout on reading from the socket. It prevents hanged up connection
|
||||
// if client suddenly disappears. Note that TCP_KEEPALIVE is not enabled by
|
||||
// default, and tokio doesn't provide ability to set it out of the box.
|
||||
let mut socket = TimeoutReader::new(socket);
|
||||
let wal_service_timeout = Duration::from_secs(60 * 10);
|
||||
socket.set_timeout(Some(wal_service_timeout));
|
||||
// pin! is here because TimeoutReader (due to storing sleep future inside)
|
||||
// is not Unpin, and all pgbackend/framed/tokio dependencies require stream
|
||||
// to be Unpin. Which is reasonable, as indeed something like TimeoutReader
|
||||
// shouldn't be moved.
|
||||
tokio::pin!(socket);
|
||||
// Set timeout on reading from the socket. It prevents hanged up connection
|
||||
// if client suddenly disappears. Note that TCP_KEEPALIVE is not enabled by
|
||||
// default, and tokio doesn't provide ability to set it out of the box.
|
||||
let mut socket = TimeoutReader::new(socket);
|
||||
let wal_service_timeout = Duration::from_secs(60 * 10);
|
||||
socket.set_timeout(Some(wal_service_timeout));
|
||||
// pin! is here because TimeoutReader (due to storing sleep future inside)
|
||||
// is not Unpin, and all pgbackend/framed/tokio dependencies require stream
|
||||
// to be Unpin. Which is reasonable, as indeed something like TimeoutReader
|
||||
// shouldn't be moved.
|
||||
tokio::pin!(socket);
|
||||
|
||||
let traffic_metrics = TrafficMetrics::new();
|
||||
if let Some(current_az) = conf.availability_zone.as_deref() {
|
||||
traffic_metrics.set_sk_az(current_az);
|
||||
}
|
||||
let traffic_metrics = TrafficMetrics::new();
|
||||
if let Some(current_az) = conf.availability_zone.as_deref() {
|
||||
traffic_metrics.set_sk_az(current_az);
|
||||
}
|
||||
|
||||
let socket = MeasuredStream::new(
|
||||
socket,
|
||||
|cnt| {
|
||||
traffic_metrics.observe_read(cnt);
|
||||
},
|
||||
|cnt| {
|
||||
traffic_metrics.observe_write(cnt);
|
||||
},
|
||||
);
|
||||
let socket = MeasuredStream::new(
|
||||
socket,
|
||||
|cnt| {
|
||||
traffic_metrics.observe_read(cnt);
|
||||
},
|
||||
|cnt| {
|
||||
traffic_metrics.observe_write(cnt);
|
||||
},
|
||||
);
|
||||
|
||||
let auth_type = match conf.auth {
|
||||
None => AuthType::Trust,
|
||||
Some(_) => AuthType::NeonJWT,
|
||||
};
|
||||
let mut conn_handler =
|
||||
SafekeeperPostgresHandler::new(conf, conn_id, Some(traffic_metrics.clone()));
|
||||
let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?;
|
||||
// libpq protocol between safekeeper and walproposer / pageserver
|
||||
// We don't use shutdown.
|
||||
pgbackend
|
||||
.run(&mut conn_handler, future::pending::<()>)
|
||||
.await
|
||||
})
|
||||
let auth_type = match conf.auth {
|
||||
None => AuthType::Trust,
|
||||
Some(_) => AuthType::NeonJWT,
|
||||
};
|
||||
let mut conn_handler =
|
||||
SafekeeperPostgresHandler::new(conf, conn_id, Some(traffic_metrics.clone()));
|
||||
let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?;
|
||||
// libpq protocol between safekeeper and walproposer / pageserver
|
||||
// We don't use shutdown.
|
||||
pgbackend
|
||||
.run(&mut conn_handler, future::pending::<()>)
|
||||
.await
|
||||
}
|
||||
|
||||
/// Unique WAL service connection ids are logged in spans for observability.
|
||||
|
||||
@@ -8,54 +8,47 @@
|
||||
//! Note that last file has `.partial` suffix, that's different from postgres.
|
||||
|
||||
use anyhow::{bail, Context, Result};
|
||||
use remote_storage::RemotePath;
|
||||
|
||||
use std::io::{self, Seek, SeekFrom};
|
||||
use std::pin::Pin;
|
||||
use tokio::io::AsyncRead;
|
||||
|
||||
use bytes::Bytes;
|
||||
use futures::future::BoxFuture;
|
||||
use postgres_ffi::v14::xlog_utils::{IsPartialXLogFileName, IsXLogFileName, XLogFromFileName};
|
||||
use postgres_ffi::{XLogSegNo, PG_TLI};
|
||||
use remote_storage::RemotePath;
|
||||
use std::cmp::{max, min};
|
||||
|
||||
use bytes::Bytes;
|
||||
use std::fs::{self, remove_file, File, OpenOptions};
|
||||
use std::io::Write;
|
||||
use std::io::{self, SeekFrom};
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
use std::pin::Pin;
|
||||
use tokio::fs::{self, remove_file, File, OpenOptions};
|
||||
use tokio::io::{AsyncRead, AsyncWriteExt};
|
||||
use tokio::io::{AsyncReadExt, AsyncSeekExt};
|
||||
use tracing::*;
|
||||
|
||||
use utils::{id::TenantTimelineId, lsn::Lsn};
|
||||
|
||||
use crate::metrics::{time_io_closure, WalStorageMetrics, REMOVED_WAL_SEGMENTS};
|
||||
use crate::safekeeper::SafeKeeperState;
|
||||
|
||||
use crate::wal_backup::read_object;
|
||||
use crate::SafeKeeperConf;
|
||||
use postgres_ffi::waldecoder::WalStreamDecoder;
|
||||
use postgres_ffi::XLogFileName;
|
||||
use postgres_ffi::XLOG_BLCKSZ;
|
||||
|
||||
use postgres_ffi::waldecoder::WalStreamDecoder;
|
||||
|
||||
use pq_proto::SystemId;
|
||||
use tokio::io::{AsyncReadExt, AsyncSeekExt};
|
||||
use utils::{id::TenantTimelineId, lsn::Lsn};
|
||||
|
||||
#[async_trait::async_trait]
|
||||
pub trait Storage {
|
||||
/// LSN of last durably stored WAL record.
|
||||
fn flush_lsn(&self) -> Lsn;
|
||||
|
||||
/// Write piece of WAL from buf to disk, but not necessarily sync it.
|
||||
fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()>;
|
||||
async fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()>;
|
||||
|
||||
/// Truncate WAL at specified LSN, which must be the end of WAL record.
|
||||
fn truncate_wal(&mut self, end_pos: Lsn) -> Result<()>;
|
||||
async fn truncate_wal(&mut self, end_pos: Lsn) -> Result<()>;
|
||||
|
||||
/// Durably store WAL on disk, up to the last written WAL record.
|
||||
fn flush_wal(&mut self) -> Result<()>;
|
||||
async fn flush_wal(&mut self) -> Result<()>;
|
||||
|
||||
/// Remove all segments <= given segno. Returns closure as we want to do
|
||||
/// that without timeline lock.
|
||||
fn remove_up_to(&self) -> Box<dyn Fn(XLogSegNo) -> Result<()>>;
|
||||
/// Remove all segments <= given segno. Returns function doing that as we
|
||||
/// want to perform it without timeline lock.
|
||||
fn remove_up_to(&self, segno_up_to: XLogSegNo) -> BoxFuture<'static, anyhow::Result<()>>;
|
||||
|
||||
/// Release resources associated with the storage -- technically, close FDs.
|
||||
/// Currently we don't remove timelines until restart (#3146), so need to
|
||||
@@ -105,6 +98,22 @@ pub struct PhysicalStorage {
|
||||
/// - points to write_lsn, so no seek is needed for writing
|
||||
/// - doesn't point to the end of the segment
|
||||
file: Option<File>,
|
||||
|
||||
/// When false, we have just initialized storage using the LSN from find_end_of_wal().
|
||||
/// In this case, [`write_lsn`] can be less than actually written WAL on disk. In particular,
|
||||
/// there can be a case with unexpected .partial file.
|
||||
///
|
||||
/// Imagine the following:
|
||||
/// - 000000010000000000000001
|
||||
/// - it was fully written, but the last record is split between 2 segments
|
||||
/// - after restart, find_end_of_wal() returned 0/1FFFFF0, which is in the end of this segment
|
||||
/// - write_lsn, write_record_lsn and flush_record_lsn were initialized to 0/1FFFFF0
|
||||
/// - 000000010000000000000002.partial
|
||||
/// - it has only 1 byte written, which is not enough to make a full WAL record
|
||||
///
|
||||
/// Partial segment 002 has no WAL records, and it will be removed by the next truncate_wal().
|
||||
/// This flag will be set to true after the first truncate_wal() call.
|
||||
is_truncated_after_restart: bool,
|
||||
}
|
||||
|
||||
impl PhysicalStorage {
|
||||
@@ -164,6 +173,7 @@ impl PhysicalStorage {
|
||||
flush_record_lsn: flush_lsn,
|
||||
decoder: WalStreamDecoder::new(write_lsn, state.server.pg_version / 10000),
|
||||
file: None,
|
||||
is_truncated_after_restart: false,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -178,33 +188,37 @@ impl PhysicalStorage {
|
||||
}
|
||||
|
||||
/// Call fdatasync if config requires so.
|
||||
fn fdatasync_file(&mut self, file: &mut File) -> Result<()> {
|
||||
async fn fdatasync_file(&mut self, file: &mut File) -> Result<()> {
|
||||
if !self.conf.no_sync {
|
||||
self.metrics
|
||||
.observe_flush_seconds(time_io_closure(|| Ok(file.sync_data()?))?);
|
||||
.observe_flush_seconds(time_io_closure(file.sync_data()).await?);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Call fsync if config requires so.
|
||||
fn fsync_file(&mut self, file: &mut File) -> Result<()> {
|
||||
async fn fsync_file(&mut self, file: &mut File) -> Result<()> {
|
||||
if !self.conf.no_sync {
|
||||
self.metrics
|
||||
.observe_flush_seconds(time_io_closure(|| Ok(file.sync_all()?))?);
|
||||
.observe_flush_seconds(time_io_closure(file.sync_all()).await?);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Open or create WAL segment file. Caller must call seek to the wanted position.
|
||||
/// Returns `file` and `is_partial`.
|
||||
fn open_or_create(&mut self, segno: XLogSegNo) -> Result<(File, bool)> {
|
||||
async fn open_or_create(&mut self, segno: XLogSegNo) -> Result<(File, bool)> {
|
||||
let (wal_file_path, wal_file_partial_path) =
|
||||
wal_file_paths(&self.timeline_dir, segno, self.wal_seg_size)?;
|
||||
|
||||
// Try to open already completed segment
|
||||
if let Ok(file) = OpenOptions::new().write(true).open(&wal_file_path) {
|
||||
if let Ok(file) = OpenOptions::new().write(true).open(&wal_file_path).await {
|
||||
Ok((file, false))
|
||||
} else if let Ok(file) = OpenOptions::new().write(true).open(&wal_file_partial_path) {
|
||||
} else if let Ok(file) = OpenOptions::new()
|
||||
.write(true)
|
||||
.open(&wal_file_partial_path)
|
||||
.await
|
||||
{
|
||||
// Try to open existing partial file
|
||||
Ok((file, true))
|
||||
} else {
|
||||
@@ -213,35 +227,36 @@ impl PhysicalStorage {
|
||||
.create(true)
|
||||
.write(true)
|
||||
.open(&wal_file_partial_path)
|
||||
.await
|
||||
.with_context(|| format!("Failed to open log file {:?}", &wal_file_path))?;
|
||||
|
||||
write_zeroes(&mut file, self.wal_seg_size)?;
|
||||
self.fsync_file(&mut file)?;
|
||||
write_zeroes(&mut file, self.wal_seg_size).await?;
|
||||
self.fsync_file(&mut file).await?;
|
||||
Ok((file, true))
|
||||
}
|
||||
}
|
||||
|
||||
/// Write WAL bytes, which are known to be located in a single WAL segment.
|
||||
fn write_in_segment(&mut self, segno: u64, xlogoff: usize, buf: &[u8]) -> Result<()> {
|
||||
async fn write_in_segment(&mut self, segno: u64, xlogoff: usize, buf: &[u8]) -> Result<()> {
|
||||
let mut file = if let Some(file) = self.file.take() {
|
||||
file
|
||||
} else {
|
||||
let (mut file, is_partial) = self.open_or_create(segno)?;
|
||||
let (mut file, is_partial) = self.open_or_create(segno).await?;
|
||||
assert!(is_partial, "unexpected write into non-partial segment file");
|
||||
file.seek(SeekFrom::Start(xlogoff as u64))?;
|
||||
file.seek(SeekFrom::Start(xlogoff as u64)).await?;
|
||||
file
|
||||
};
|
||||
|
||||
file.write_all(buf)?;
|
||||
file.write_all(buf).await?;
|
||||
|
||||
if xlogoff + buf.len() == self.wal_seg_size {
|
||||
// If we reached the end of a WAL segment, flush and close it.
|
||||
self.fdatasync_file(&mut file)?;
|
||||
self.fdatasync_file(&mut file).await?;
|
||||
|
||||
// Rename partial file to completed file
|
||||
let (wal_file_path, wal_file_partial_path) =
|
||||
wal_file_paths(&self.timeline_dir, segno, self.wal_seg_size)?;
|
||||
fs::rename(wal_file_partial_path, wal_file_path)?;
|
||||
fs::rename(wal_file_partial_path, wal_file_path).await?;
|
||||
} else {
|
||||
// otherwise, file can be reused later
|
||||
self.file = Some(file);
|
||||
@@ -255,11 +270,11 @@ impl PhysicalStorage {
|
||||
/// be flushed separately later.
|
||||
///
|
||||
/// Updates `write_lsn`.
|
||||
fn write_exact(&mut self, pos: Lsn, mut buf: &[u8]) -> Result<()> {
|
||||
async fn write_exact(&mut self, pos: Lsn, mut buf: &[u8]) -> Result<()> {
|
||||
if self.write_lsn != pos {
|
||||
// need to flush the file before discarding it
|
||||
if let Some(mut file) = self.file.take() {
|
||||
self.fdatasync_file(&mut file)?;
|
||||
self.fdatasync_file(&mut file).await?;
|
||||
}
|
||||
|
||||
self.write_lsn = pos;
|
||||
@@ -277,7 +292,8 @@ impl PhysicalStorage {
|
||||
buf.len()
|
||||
};
|
||||
|
||||
self.write_in_segment(segno, xlogoff, &buf[..bytes_write])?;
|
||||
self.write_in_segment(segno, xlogoff, &buf[..bytes_write])
|
||||
.await?;
|
||||
self.write_lsn += bytes_write as u64;
|
||||
buf = &buf[bytes_write..];
|
||||
}
|
||||
@@ -286,6 +302,7 @@ impl PhysicalStorage {
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl Storage for PhysicalStorage {
|
||||
/// flush_lsn returns LSN of last durably stored WAL record.
|
||||
fn flush_lsn(&self) -> Lsn {
|
||||
@@ -293,7 +310,7 @@ impl Storage for PhysicalStorage {
|
||||
}
|
||||
|
||||
/// Write WAL to disk.
|
||||
fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()> {
|
||||
async fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()> {
|
||||
// Disallow any non-sequential writes, which can result in gaps or overwrites.
|
||||
// If we need to move the pointer, use truncate_wal() instead.
|
||||
if self.write_lsn > startpos {
|
||||
@@ -311,7 +328,7 @@ impl Storage for PhysicalStorage {
|
||||
);
|
||||
}
|
||||
|
||||
let write_seconds = time_io_closure(|| self.write_exact(startpos, buf))?;
|
||||
let write_seconds = time_io_closure(self.write_exact(startpos, buf)).await?;
|
||||
// WAL is written, updating write metrics
|
||||
self.metrics.observe_write_seconds(write_seconds);
|
||||
self.metrics.observe_write_bytes(buf.len());
|
||||
@@ -340,14 +357,14 @@ impl Storage for PhysicalStorage {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn flush_wal(&mut self) -> Result<()> {
|
||||
async fn flush_wal(&mut self) -> Result<()> {
|
||||
if self.flush_record_lsn == self.write_record_lsn {
|
||||
// no need to do extra flush
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
if let Some(mut unflushed_file) = self.file.take() {
|
||||
self.fdatasync_file(&mut unflushed_file)?;
|
||||
self.fdatasync_file(&mut unflushed_file).await?;
|
||||
self.file = Some(unflushed_file);
|
||||
} else {
|
||||
// We have unflushed data (write_lsn != flush_lsn), but no file.
|
||||
@@ -369,7 +386,7 @@ impl Storage for PhysicalStorage {
|
||||
|
||||
/// Truncate written WAL by removing all WAL segments after the given LSN.
|
||||
/// end_pos must point to the end of the WAL record.
|
||||
fn truncate_wal(&mut self, end_pos: Lsn) -> Result<()> {
|
||||
async fn truncate_wal(&mut self, end_pos: Lsn) -> Result<()> {
|
||||
// Streaming must not create a hole, so truncate cannot be called on non-written lsn
|
||||
if self.write_lsn != Lsn(0) && end_pos > self.write_lsn {
|
||||
bail!(
|
||||
@@ -381,47 +398,51 @@ impl Storage for PhysicalStorage {
|
||||
|
||||
// Quick exit if nothing to do to avoid writing up to 16 MiB of zeros on
|
||||
// disk (this happens on each connect).
|
||||
if end_pos == self.write_lsn {
|
||||
if self.is_truncated_after_restart
|
||||
&& end_pos == self.write_lsn
|
||||
&& end_pos == self.flush_record_lsn
|
||||
{
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// Close previously opened file, if any
|
||||
if let Some(mut unflushed_file) = self.file.take() {
|
||||
self.fdatasync_file(&mut unflushed_file)?;
|
||||
self.fdatasync_file(&mut unflushed_file).await?;
|
||||
}
|
||||
|
||||
let xlogoff = end_pos.segment_offset(self.wal_seg_size);
|
||||
let segno = end_pos.segment_number(self.wal_seg_size);
|
||||
|
||||
// Remove all segments after the given LSN.
|
||||
remove_segments_from_disk(&self.timeline_dir, self.wal_seg_size, |x| x > segno)?;
|
||||
remove_segments_from_disk(&self.timeline_dir, self.wal_seg_size, |x| x > segno).await?;
|
||||
|
||||
let (mut file, is_partial) = self.open_or_create(segno)?;
|
||||
let (mut file, is_partial) = self.open_or_create(segno).await?;
|
||||
|
||||
// Fill end with zeroes
|
||||
file.seek(SeekFrom::Start(xlogoff as u64))?;
|
||||
write_zeroes(&mut file, self.wal_seg_size - xlogoff)?;
|
||||
self.fdatasync_file(&mut file)?;
|
||||
file.seek(SeekFrom::Start(xlogoff as u64)).await?;
|
||||
write_zeroes(&mut file, self.wal_seg_size - xlogoff).await?;
|
||||
self.fdatasync_file(&mut file).await?;
|
||||
|
||||
if !is_partial {
|
||||
// Make segment partial once again
|
||||
let (wal_file_path, wal_file_partial_path) =
|
||||
wal_file_paths(&self.timeline_dir, segno, self.wal_seg_size)?;
|
||||
fs::rename(wal_file_path, wal_file_partial_path)?;
|
||||
fs::rename(wal_file_path, wal_file_partial_path).await?;
|
||||
}
|
||||
|
||||
// Update LSNs
|
||||
self.write_lsn = end_pos;
|
||||
self.write_record_lsn = end_pos;
|
||||
self.flush_record_lsn = end_pos;
|
||||
self.is_truncated_after_restart = true;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn remove_up_to(&self) -> Box<dyn Fn(XLogSegNo) -> Result<()>> {
|
||||
fn remove_up_to(&self, segno_up_to: XLogSegNo) -> BoxFuture<'static, anyhow::Result<()>> {
|
||||
let timeline_dir = self.timeline_dir.clone();
|
||||
let wal_seg_size = self.wal_seg_size;
|
||||
Box::new(move |segno_up_to: XLogSegNo| {
|
||||
remove_segments_from_disk(&timeline_dir, wal_seg_size, |x| x <= segno_up_to)
|
||||
Box::pin(async move {
|
||||
remove_segments_from_disk(&timeline_dir, wal_seg_size, |x| x <= segno_up_to).await
|
||||
})
|
||||
}
|
||||
|
||||
@@ -436,7 +457,7 @@ impl Storage for PhysicalStorage {
|
||||
}
|
||||
|
||||
/// Remove all WAL segments in timeline_dir that match the given predicate.
|
||||
fn remove_segments_from_disk(
|
||||
async fn remove_segments_from_disk(
|
||||
timeline_dir: &Path,
|
||||
wal_seg_size: usize,
|
||||
remove_predicate: impl Fn(XLogSegNo) -> bool,
|
||||
@@ -445,8 +466,8 @@ fn remove_segments_from_disk(
|
||||
let mut min_removed = u64::MAX;
|
||||
let mut max_removed = u64::MIN;
|
||||
|
||||
for entry in fs::read_dir(timeline_dir)? {
|
||||
let entry = entry?;
|
||||
let mut entries = fs::read_dir(timeline_dir).await?;
|
||||
while let Some(entry) = entries.next_entry().await? {
|
||||
let entry_path = entry.path();
|
||||
let fname = entry_path.file_name().unwrap();
|
||||
|
||||
@@ -457,7 +478,7 @@ fn remove_segments_from_disk(
|
||||
}
|
||||
let (segno, _) = XLogFromFileName(fname_str, wal_seg_size);
|
||||
if remove_predicate(segno) {
|
||||
remove_file(entry_path)?;
|
||||
remove_file(entry_path).await?;
|
||||
n_removed += 1;
|
||||
min_removed = min(min_removed, segno);
|
||||
max_removed = max(max_removed, segno);
|
||||
@@ -689,12 +710,12 @@ impl WalReader {
|
||||
const ZERO_BLOCK: &[u8] = &[0u8; XLOG_BLCKSZ];
|
||||
|
||||
/// Helper for filling file with zeroes.
|
||||
fn write_zeroes(file: &mut File, mut count: usize) -> Result<()> {
|
||||
async fn write_zeroes(file: &mut File, mut count: usize) -> Result<()> {
|
||||
while count >= XLOG_BLCKSZ {
|
||||
file.write_all(ZERO_BLOCK)?;
|
||||
file.write_all(ZERO_BLOCK).await?;
|
||||
count -= XLOG_BLCKSZ;
|
||||
}
|
||||
file.write_all(&ZERO_BLOCK[0..count])?;
|
||||
file.write_all(&ZERO_BLOCK[0..count]).await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
191
scripts/comment-test-report.js
Normal file → Executable file
191
scripts/comment-test-report.js
Normal file → Executable file
@@ -1,3 +1,5 @@
|
||||
#! /usr/bin/env node
|
||||
|
||||
//
|
||||
// The script parses Allure reports and posts a comment with a summary of the test results to the PR or to the latest commit in the branch.
|
||||
//
|
||||
@@ -19,7 +21,7 @@
|
||||
// })
|
||||
//
|
||||
|
||||
// Analog of Python's defaultdict.
|
||||
// Equivalent of Python's defaultdict.
|
||||
//
|
||||
// const dm = new DefaultMap(() => new DefaultMap(() => []))
|
||||
// dm["firstKey"]["secondKey"].push("value")
|
||||
@@ -32,34 +34,7 @@ class DefaultMap extends Map {
|
||||
}
|
||||
}
|
||||
|
||||
module.exports = async ({ github, context, fetch, report }) => {
|
||||
// Marker to find the comment in the subsequent runs
|
||||
const startMarker = `<!--AUTOMATIC COMMENT START #${context.payload.number}-->`
|
||||
// If we run the script in the PR or in the branch (main/release/...)
|
||||
const isPullRequest = !!context.payload.pull_request
|
||||
// Latest commit in PR or in the branch
|
||||
const commitSha = isPullRequest ? context.payload.pull_request.head.sha : context.sha
|
||||
// Let users know that the comment is updated automatically
|
||||
const autoupdateNotice = `<div align="right"><sub>The comment gets automatically updated with the latest test results<br>${commitSha} at ${new Date().toISOString()} :recycle:</sub></div>`
|
||||
// GitHub bot id taken from (https://api.github.com/users/github-actions[bot])
|
||||
const githubActionsBotId = 41898282
|
||||
// Commend body itself
|
||||
let commentBody = `${startMarker}\n`
|
||||
|
||||
// Common parameters for GitHub API requests
|
||||
const ownerRepoParams = {
|
||||
owner: context.repo.owner,
|
||||
repo: context.repo.repo,
|
||||
}
|
||||
|
||||
const {reportUrl, reportJsonUrl} = report
|
||||
|
||||
if (!reportUrl || !reportJsonUrl) {
|
||||
commentBody += `#### No tests were run or test report is not available\n`
|
||||
commentBody += autoupdateNotice
|
||||
return
|
||||
}
|
||||
|
||||
const parseReportJson = async ({ reportJsonUrl, fetch }) => {
|
||||
const suites = await (await fetch(reportJsonUrl)).json()
|
||||
|
||||
// Allure distinguishes "failed" (with an assertion error) and "broken" (with any other error) tests.
|
||||
@@ -83,7 +58,7 @@ module.exports = async ({ github, context, fetch, report }) => {
|
||||
let buildType, pgVersion
|
||||
const match = test.name.match(/[\[-](?<buildType>debug|release)-pg(?<pgVersion>\d+)[-\]]/)?.groups
|
||||
if (match) {
|
||||
({buildType, pgVersion} = match)
|
||||
({ buildType, pgVersion } = match)
|
||||
} else {
|
||||
// It's ok, we embed BUILD_TYPE and Postgres Version into the test name only for regress suite and do not for other suites (like performance).
|
||||
console.info(`Cannot get BUILD_TYPE and Postgres Version from test name: "${test.name}", defaulting to "release" and "14"`)
|
||||
@@ -123,37 +98,68 @@ module.exports = async ({ github, context, fetch, report }) => {
|
||||
}
|
||||
}
|
||||
|
||||
return {
|
||||
failedTests,
|
||||
failedTestsCount,
|
||||
passedTests,
|
||||
passedTestsCount,
|
||||
skippedTests,
|
||||
skippedTestsCount,
|
||||
flakyTests,
|
||||
flakyTestsCount,
|
||||
retriedTests,
|
||||
pgVersions,
|
||||
}
|
||||
}
|
||||
|
||||
const reportSummary = async (params) => {
|
||||
const {
|
||||
failedTests,
|
||||
failedTestsCount,
|
||||
passedTests,
|
||||
passedTestsCount,
|
||||
skippedTests,
|
||||
skippedTestsCount,
|
||||
flakyTests,
|
||||
flakyTestsCount,
|
||||
retriedTests,
|
||||
pgVersions,
|
||||
reportUrl,
|
||||
} = params
|
||||
|
||||
let summary = ""
|
||||
|
||||
const totalTestsCount = failedTestsCount + passedTestsCount + skippedTestsCount
|
||||
commentBody += `### ${totalTestsCount} tests run: ${passedTestsCount} passed, ${failedTestsCount} failed, ${skippedTestsCount} skipped ([full report](${reportUrl}))\n___\n`
|
||||
summary += `### ${totalTestsCount} tests run: ${passedTestsCount} passed, ${failedTestsCount} failed, ${skippedTestsCount} skipped ([full report](${reportUrl}))\n___\n`
|
||||
|
||||
// Print test resuls from the newest to the oldest Postgres version for release and debug builds.
|
||||
for (const pgVersion of Array.from(pgVersions).sort().reverse()) {
|
||||
if (Object.keys(failedTests[pgVersion]).length > 0) {
|
||||
commentBody += `#### Failures on Posgres ${pgVersion}\n\n`
|
||||
summary += `#### Failures on Posgres ${pgVersion}\n\n`
|
||||
for (const [testName, tests] of Object.entries(failedTests[pgVersion])) {
|
||||
const links = []
|
||||
for (const test of tests) {
|
||||
const allureLink = `${reportUrl}#suites/${test.parentUid}/${test.uid}`
|
||||
links.push(`[${test.buildType}](${allureLink})`)
|
||||
}
|
||||
commentBody += `- \`${testName}\`: ${links.join(", ")}\n`
|
||||
summary += `- \`${testName}\`: ${links.join(", ")}\n`
|
||||
}
|
||||
|
||||
const testsToRerun = Object.values(failedTests[pgVersion]).map(x => x[0].name)
|
||||
const command = `DEFAULT_PG_VERSION=${pgVersion} scripts/pytest -k "${testsToRerun.join(" or ")}"`
|
||||
|
||||
commentBody += "```\n"
|
||||
commentBody += `# Run failed on Postgres ${pgVersion} tests locally:\n`
|
||||
commentBody += `${command}\n`
|
||||
commentBody += "```\n"
|
||||
summary += "```\n"
|
||||
summary += `# Run failed on Postgres ${pgVersion} tests locally:\n`
|
||||
summary += `${command}\n`
|
||||
summary += "```\n"
|
||||
}
|
||||
}
|
||||
|
||||
if (flakyTestsCount > 0) {
|
||||
commentBody += `<details>\n<summary>Flaky tests (${flakyTestsCount})</summary>\n\n`
|
||||
summary += `<details>\n<summary>Flaky tests (${flakyTestsCount})</summary>\n\n`
|
||||
for (const pgVersion of Array.from(pgVersions).sort().reverse()) {
|
||||
if (Object.keys(flakyTests[pgVersion]).length > 0) {
|
||||
commentBody += `#### Postgres ${pgVersion}\n\n`
|
||||
summary += `#### Postgres ${pgVersion}\n\n`
|
||||
for (const [testName, tests] of Object.entries(flakyTests[pgVersion])) {
|
||||
const links = []
|
||||
for (const test of tests) {
|
||||
@@ -161,11 +167,57 @@ module.exports = async ({ github, context, fetch, report }) => {
|
||||
const status = test.status === "passed" ? ":white_check_mark:" : ":x:"
|
||||
links.push(`[${status} ${test.buildType}](${allureLink})`)
|
||||
}
|
||||
commentBody += `- \`${testName}\`: ${links.join(", ")}\n`
|
||||
summary += `- \`${testName}\`: ${links.join(", ")}\n`
|
||||
}
|
||||
}
|
||||
}
|
||||
commentBody += "\n</details>\n"
|
||||
summary += "\n</details>\n"
|
||||
}
|
||||
|
||||
return summary
|
||||
}
|
||||
|
||||
module.exports = async ({ github, context, fetch, report }) => {
|
||||
// Marker to find the comment in the subsequent runs
|
||||
const startMarker = `<!--AUTOMATIC COMMENT START #${context.payload.number}-->`
|
||||
// If we run the script in the PR or in the branch (main/release/...)
|
||||
const isPullRequest = !!context.payload.pull_request
|
||||
// Latest commit in PR or in the branch
|
||||
const commitSha = isPullRequest ? context.payload.pull_request.head.sha : context.sha
|
||||
// Let users know that the comment is updated automatically
|
||||
const autoupdateNotice = `<div align="right"><sub>The comment gets automatically updated with the latest test results<br>${commitSha} at ${new Date().toISOString()} :recycle:</sub></div>`
|
||||
// GitHub bot id taken from (https://api.github.com/users/github-actions[bot])
|
||||
const githubActionsBotId = 41898282
|
||||
// Commend body itself
|
||||
let commentBody = `${startMarker}\n`
|
||||
|
||||
// Common parameters for GitHub API requests
|
||||
const ownerRepoParams = {
|
||||
owner: context.repo.owner,
|
||||
repo: context.repo.repo,
|
||||
}
|
||||
|
||||
const {reportUrl, reportJsonUrl} = report
|
||||
|
||||
if (!reportUrl || !reportJsonUrl) {
|
||||
commentBody += `#### No tests were run or test report is not available\n`
|
||||
commentBody += autoupdateNotice
|
||||
return
|
||||
}
|
||||
|
||||
try {
|
||||
const parsed = await parseReportJson({ reportJsonUrl, fetch })
|
||||
commentBody += await reportSummary({ ...parsed, reportUrl })
|
||||
} catch (error) {
|
||||
commentBody += `### [full report](${reportUrl})\n___\n`
|
||||
commentBody += `#### Failed to create a summary for the test run: \n`
|
||||
commentBody += "```\n"
|
||||
commentBody += `${error.stack}\n`
|
||||
commentBody += "```\n"
|
||||
commentBody += "\nTo reproduce and debug the error locally run:\n"
|
||||
commentBody += "```\n"
|
||||
commentBody += `scripts/comment-test-report.js ${reportJsonUrl}`
|
||||
commentBody += "\n```\n"
|
||||
}
|
||||
|
||||
commentBody += autoupdateNotice
|
||||
@@ -207,3 +259,60 @@ module.exports = async ({ github, context, fetch, report }) => {
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// Equivalent of Python's `if __name__ == "__main__":`
|
||||
// https://nodejs.org/docs/latest/api/modules.html#accessing-the-main-module
|
||||
if (require.main === module) {
|
||||
// Poor man's argument parsing: we expect the third argument is a JSON URL (0: node binary, 1: this script, 2: JSON url)
|
||||
if (process.argv.length !== 3) {
|
||||
console.error(`Unexpected number of arguments\nUsage: node ${process.argv[1]} <jsonUrl>`)
|
||||
process.exit(1)
|
||||
}
|
||||
const jsonUrl = process.argv[2]
|
||||
|
||||
try {
|
||||
new URL(jsonUrl)
|
||||
} catch (error) {
|
||||
console.error(`Invalid URL: ${jsonUrl}\nUsage: node ${process.argv[1]} <jsonUrl>`)
|
||||
process.exit(1)
|
||||
}
|
||||
|
||||
const htmlUrl = jsonUrl.replace("/data/suites.json", "/index.html")
|
||||
|
||||
const githubMock = {
|
||||
rest: {
|
||||
issues: {
|
||||
createComment: console.log,
|
||||
listComments: async () => ({ data: [] }),
|
||||
updateComment: console.log
|
||||
},
|
||||
repos: {
|
||||
createCommitComment: console.log,
|
||||
listCommentsForCommit: async () => ({ data: [] }),
|
||||
updateCommitComment: console.log
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const contextMock = {
|
||||
repo: {
|
||||
owner: 'testOwner',
|
||||
repo: 'testRepo'
|
||||
},
|
||||
payload: {
|
||||
number: 42,
|
||||
pull_request: null,
|
||||
},
|
||||
sha: '0000000000000000000000000000000000000000',
|
||||
}
|
||||
|
||||
module.exports({
|
||||
github: githubMock,
|
||||
context: contextMock,
|
||||
fetch: fetch,
|
||||
report: {
|
||||
reportUrl: htmlUrl,
|
||||
reportJsonUrl: jsonUrl,
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
@@ -1,12 +1,14 @@
|
||||
#!/usr/bin/env python3
|
||||
import argparse
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
from contextlib import contextmanager
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
import backoff
|
||||
import psycopg2
|
||||
import psycopg2.extras
|
||||
|
||||
@@ -35,9 +37,20 @@ def get_connection_cursor():
|
||||
connstr = os.getenv("DATABASE_URL")
|
||||
if not connstr:
|
||||
err("DATABASE_URL environment variable is not set")
|
||||
with psycopg2.connect(connstr, connect_timeout=30) as conn:
|
||||
|
||||
@backoff.on_exception(backoff.expo, psycopg2.OperationalError, max_time=150)
|
||||
def connect(connstr):
|
||||
conn = psycopg2.connect(connstr, connect_timeout=30)
|
||||
conn.autocommit = True
|
||||
return conn
|
||||
|
||||
conn = connect(connstr)
|
||||
try:
|
||||
with conn.cursor() as cur:
|
||||
yield cur
|
||||
finally:
|
||||
if conn is not None:
|
||||
conn.close()
|
||||
|
||||
|
||||
def create_table(cur):
|
||||
@@ -115,6 +128,7 @@ def main():
|
||||
parser.add_argument(
|
||||
"--ingest",
|
||||
type=Path,
|
||||
required=True,
|
||||
help="Path to perf test result file, or directory with perf test result files",
|
||||
)
|
||||
parser.add_argument("--initdb", action="store_true", help="Initialuze database")
|
||||
@@ -140,4 +154,5 @@ def main():
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
logging.getLogger("backoff").addHandler(logging.StreamHandler())
|
||||
main()
|
||||
|
||||
@@ -1,11 +1,13 @@
|
||||
#!/usr/bin/env python3
|
||||
import argparse
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from contextlib import contextmanager
|
||||
from pathlib import Path
|
||||
|
||||
import backoff
|
||||
import psycopg2
|
||||
|
||||
CREATE_TABLE = """
|
||||
@@ -29,9 +31,20 @@ def get_connection_cursor():
|
||||
connstr = os.getenv("DATABASE_URL")
|
||||
if not connstr:
|
||||
err("DATABASE_URL environment variable is not set")
|
||||
with psycopg2.connect(connstr, connect_timeout=30) as conn:
|
||||
|
||||
@backoff.on_exception(backoff.expo, psycopg2.OperationalError, max_time=150)
|
||||
def connect(connstr):
|
||||
conn = psycopg2.connect(connstr, connect_timeout=30)
|
||||
conn.autocommit = True
|
||||
return conn
|
||||
|
||||
conn = connect(connstr)
|
||||
try:
|
||||
with conn.cursor() as cur:
|
||||
yield cur
|
||||
finally:
|
||||
if conn is not None:
|
||||
conn.close()
|
||||
|
||||
|
||||
def create_table(cur):
|
||||
@@ -101,4 +114,5 @@ def main():
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
logging.getLogger("backoff").addHandler(logging.StreamHandler())
|
||||
main()
|
||||
|
||||
@@ -32,6 +32,7 @@ pub const DEFAULT_LISTEN_ADDR: &str = "127.0.0.1:50051";
|
||||
pub const DEFAULT_ENDPOINT: &str = const_format::formatcp!("http://{DEFAULT_LISTEN_ADDR}");
|
||||
|
||||
pub const DEFAULT_KEEPALIVE_INTERVAL: &str = "5000 ms";
|
||||
pub const DEFAULT_CONNECT_TIMEOUT: Duration = Duration::from_millis(5000);
|
||||
|
||||
// BrokerServiceClient charged with tonic provided Channel transport; helps to
|
||||
// avoid depending on tonic directly in user crates.
|
||||
@@ -58,7 +59,8 @@ where
|
||||
}
|
||||
tonic_endpoint = tonic_endpoint
|
||||
.http2_keep_alive_interval(keepalive_interval)
|
||||
.keep_alive_while_idle(true);
|
||||
.keep_alive_while_idle(true)
|
||||
.connect_timeout(DEFAULT_CONNECT_TIMEOUT);
|
||||
// keep_alive_timeout is 20s by default on both client and server side
|
||||
let channel = tonic_endpoint.connect_lazy();
|
||||
Ok(BrokerClientChannel::new(channel))
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import List
|
||||
from typing import Any, List, MutableMapping, cast
|
||||
|
||||
import pytest
|
||||
from _pytest.config import Config
|
||||
@@ -56,3 +56,15 @@ def pytest_collection_modifyitems(config: Config, items: List[pytest.Item]):
|
||||
# Rerun 3 times = 1 original run + 2 reruns
|
||||
log.info(f"Marking {item.nodeid} as flaky. It will be rerun up to 3 times")
|
||||
item.add_marker(pytest.mark.flaky(reruns=2))
|
||||
|
||||
# pytest-rerunfailures is not compatible with pytest-timeout (timeout is not set for reruns),
|
||||
# we can workaround it by setting `timeout_func_only` to True[1].
|
||||
# Unfortunately, setting `timeout_func_only = True` globally in pytest.ini is broken[2],
|
||||
# but we still can do it using pytest marker.
|
||||
#
|
||||
# - [1] https://github.com/pytest-dev/pytest-rerunfailures/issues/99
|
||||
# - [2] https://github.com/pytest-dev/pytest-timeout/issues/142
|
||||
timeout_marker = item.get_closest_marker("timeout")
|
||||
if timeout_marker is not None:
|
||||
kwargs = cast(MutableMapping[str, Any], timeout_marker.kwargs)
|
||||
kwargs["func_only"] = True
|
||||
|
||||
@@ -57,14 +57,17 @@ PAGESERVER_GLOBAL_METRICS: Tuple[str, ...] = (
|
||||
"libmetrics_launch_timestamp",
|
||||
"libmetrics_build_info",
|
||||
"libmetrics_tracing_event_count_total",
|
||||
"pageserver_materialized_cache_hits_total",
|
||||
"pageserver_materialized_cache_hits_direct_total",
|
||||
"pageserver_getpage_reconstruct_seconds_bucket",
|
||||
"pageserver_getpage_reconstruct_seconds_count",
|
||||
"pageserver_getpage_reconstruct_seconds_sum",
|
||||
*[f"pageserver_basebackup_query_seconds_{x}" for x in ["bucket", "count", "sum"]],
|
||||
)
|
||||
|
||||
PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = (
|
||||
"pageserver_current_logical_size",
|
||||
"pageserver_resident_physical_size",
|
||||
"pageserver_getpage_reconstruct_seconds_bucket",
|
||||
"pageserver_getpage_reconstruct_seconds_count",
|
||||
"pageserver_getpage_reconstruct_seconds_sum",
|
||||
"pageserver_getpage_get_reconstruct_data_seconds_bucket",
|
||||
"pageserver_getpage_get_reconstruct_data_seconds_count",
|
||||
"pageserver_getpage_get_reconstruct_data_seconds_sum",
|
||||
@@ -73,8 +76,6 @@ PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = (
|
||||
"pageserver_io_operations_seconds_count",
|
||||
"pageserver_io_operations_seconds_sum",
|
||||
"pageserver_last_record_lsn",
|
||||
"pageserver_materialized_cache_hits_total",
|
||||
"pageserver_materialized_cache_hits_direct_total",
|
||||
"pageserver_read_num_fs_layers_bucket",
|
||||
"pageserver_read_num_fs_layers_count",
|
||||
"pageserver_read_num_fs_layers_sum",
|
||||
|
||||
@@ -534,6 +534,16 @@ class S3Storage:
|
||||
"AWS_SECRET_ACCESS_KEY": self.secret_key,
|
||||
}
|
||||
|
||||
def to_string(self) -> str:
|
||||
return json.dumps(
|
||||
{
|
||||
"bucket": self.bucket_name,
|
||||
"region": self.bucket_region,
|
||||
"endpoint": self.endpoint,
|
||||
"prefix": self.prefix_in_bucket,
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
RemoteStorage = Union[LocalFsStorage, S3Storage]
|
||||
|
||||
@@ -600,10 +610,12 @@ class NeonEnvBuilder:
|
||||
self.rust_log_override = rust_log_override
|
||||
self.port_distributor = port_distributor
|
||||
self.remote_storage = remote_storage
|
||||
self.ext_remote_storage: Optional[S3Storage] = None
|
||||
self.remote_storage_client: Optional[Any] = None
|
||||
self.remote_storage_users = remote_storage_users
|
||||
self.broker = broker
|
||||
self.run_id = run_id
|
||||
self.mock_s3_server = mock_s3_server
|
||||
self.mock_s3_server: MockS3Server = mock_s3_server
|
||||
self.pageserver_config_override = pageserver_config_override
|
||||
self.num_safekeepers = num_safekeepers
|
||||
self.safekeepers_id_start = safekeepers_id_start
|
||||
@@ -651,15 +663,24 @@ class NeonEnvBuilder:
|
||||
remote_storage_kind: RemoteStorageKind,
|
||||
test_name: str,
|
||||
force_enable: bool = True,
|
||||
enable_remote_extensions: bool = False,
|
||||
):
|
||||
if remote_storage_kind == RemoteStorageKind.NOOP:
|
||||
return
|
||||
elif remote_storage_kind == RemoteStorageKind.LOCAL_FS:
|
||||
self.enable_local_fs_remote_storage(force_enable=force_enable)
|
||||
elif remote_storage_kind == RemoteStorageKind.MOCK_S3:
|
||||
self.enable_mock_s3_remote_storage(bucket_name=test_name, force_enable=force_enable)
|
||||
self.enable_mock_s3_remote_storage(
|
||||
bucket_name=test_name,
|
||||
force_enable=force_enable,
|
||||
enable_remote_extensions=enable_remote_extensions,
|
||||
)
|
||||
elif remote_storage_kind == RemoteStorageKind.REAL_S3:
|
||||
self.enable_real_s3_remote_storage(test_name=test_name, force_enable=force_enable)
|
||||
self.enable_real_s3_remote_storage(
|
||||
test_name=test_name,
|
||||
force_enable=force_enable,
|
||||
enable_remote_extensions=enable_remote_extensions,
|
||||
)
|
||||
else:
|
||||
raise RuntimeError(f"Unknown storage type: {remote_storage_kind}")
|
||||
|
||||
@@ -673,11 +694,15 @@ class NeonEnvBuilder:
|
||||
assert force_enable or self.remote_storage is None, "remote storage is enabled already"
|
||||
self.remote_storage = LocalFsStorage(Path(self.repo_dir / "local_fs_remote_storage"))
|
||||
|
||||
def enable_mock_s3_remote_storage(self, bucket_name: str, force_enable: bool = True):
|
||||
def enable_mock_s3_remote_storage(
|
||||
self, bucket_name: str, force_enable: bool = True, enable_remote_extensions: bool = False
|
||||
):
|
||||
"""
|
||||
Sets up the pageserver to use the S3 mock server, creates the bucket, if it's not present already.
|
||||
Starts up the mock server, if that does not run yet.
|
||||
Errors, if the pageserver has some remote storage configuration already, unless `force_enable` is not set to `True`.
|
||||
|
||||
Also creates the bucket for extensions, self.ext_remote_storage bucket
|
||||
"""
|
||||
assert force_enable or self.remote_storage is None, "remote storage is enabled already"
|
||||
mock_endpoint = self.mock_s3_server.endpoint()
|
||||
@@ -698,9 +723,22 @@ class NeonEnvBuilder:
|
||||
bucket_region=mock_region,
|
||||
access_key=self.mock_s3_server.access_key(),
|
||||
secret_key=self.mock_s3_server.secret_key(),
|
||||
prefix_in_bucket="pageserver",
|
||||
)
|
||||
|
||||
def enable_real_s3_remote_storage(self, test_name: str, force_enable: bool = True):
|
||||
if enable_remote_extensions:
|
||||
self.ext_remote_storage = S3Storage(
|
||||
bucket_name=bucket_name,
|
||||
endpoint=mock_endpoint,
|
||||
bucket_region=mock_region,
|
||||
access_key=self.mock_s3_server.access_key(),
|
||||
secret_key=self.mock_s3_server.secret_key(),
|
||||
prefix_in_bucket="ext",
|
||||
)
|
||||
|
||||
def enable_real_s3_remote_storage(
|
||||
self, test_name: str, force_enable: bool = True, enable_remote_extensions: bool = False
|
||||
):
|
||||
"""
|
||||
Sets up configuration to use real s3 endpoint without mock server
|
||||
"""
|
||||
@@ -737,9 +775,18 @@ class NeonEnvBuilder:
|
||||
bucket_region=region,
|
||||
access_key=access_key,
|
||||
secret_key=secret_key,
|
||||
prefix_in_bucket=self.remote_storage_prefix,
|
||||
prefix_in_bucket=f"{self.remote_storage_prefix}/pageserver",
|
||||
)
|
||||
|
||||
if enable_remote_extensions:
|
||||
self.ext_remote_storage = S3Storage(
|
||||
bucket_name=bucket_name,
|
||||
bucket_region=region,
|
||||
access_key=access_key,
|
||||
secret_key=secret_key,
|
||||
prefix_in_bucket=f"{self.remote_storage_prefix}/ext",
|
||||
)
|
||||
|
||||
def cleanup_local_storage(self):
|
||||
if self.preserve_database_files:
|
||||
return
|
||||
@@ -773,6 +820,7 @@ class NeonEnvBuilder:
|
||||
# `self.remote_storage_prefix` is coupled with `S3Storage` storage type,
|
||||
# so this line effectively a no-op
|
||||
assert isinstance(self.remote_storage, S3Storage)
|
||||
assert self.remote_storage_client is not None
|
||||
|
||||
if self.keep_remote_storage_contents:
|
||||
log.info("keep_remote_storage_contents skipping remote storage cleanup")
|
||||
@@ -902,6 +950,8 @@ class NeonEnv:
|
||||
self.neon_binpath = config.neon_binpath
|
||||
self.pg_distrib_dir = config.pg_distrib_dir
|
||||
self.endpoint_counter = 0
|
||||
self.remote_storage_client = config.remote_storage_client
|
||||
self.ext_remote_storage = config.ext_remote_storage
|
||||
|
||||
# generate initial tenant ID here instead of letting 'neon init' generate it,
|
||||
# so that we don't need to dig it out of the config file afterwards.
|
||||
@@ -1488,6 +1538,7 @@ class NeonCli(AbstractNeonCli):
|
||||
safekeepers: Optional[List[int]] = None,
|
||||
tenant_id: Optional[TenantId] = None,
|
||||
lsn: Optional[Lsn] = None,
|
||||
remote_ext_config: Optional[str] = None,
|
||||
) -> "subprocess.CompletedProcess[str]":
|
||||
args = [
|
||||
"endpoint",
|
||||
@@ -1497,6 +1548,8 @@ class NeonCli(AbstractNeonCli):
|
||||
"--pg-version",
|
||||
self.env.pg_version,
|
||||
]
|
||||
if remote_ext_config is not None:
|
||||
args.extend(["--remote-ext-config", remote_ext_config])
|
||||
if lsn is not None:
|
||||
args.append(f"--lsn={lsn}")
|
||||
args.extend(["--pg-port", str(pg_port)])
|
||||
@@ -1631,6 +1684,8 @@ class NeonPageserver(PgProtocol):
|
||||
r".*ERROR.*ancestor timeline \S+ is being stopped",
|
||||
# this is expected given our collaborative shutdown approach for the UploadQueue
|
||||
".*Compaction failed, retrying in .*: queue is in state Stopped.*",
|
||||
# Pageserver timeline deletion should be polled until it gets 404, so ignore it globally
|
||||
".*Error processing HTTP request: NotFound: Timeline .* was not found",
|
||||
]
|
||||
|
||||
def start(
|
||||
@@ -2356,7 +2411,7 @@ class Endpoint(PgProtocol):
|
||||
|
||||
return self
|
||||
|
||||
def start(self) -> "Endpoint":
|
||||
def start(self, remote_ext_config: Optional[str] = None) -> "Endpoint":
|
||||
"""
|
||||
Start the Postgres instance.
|
||||
Returns self.
|
||||
@@ -2372,6 +2427,7 @@ class Endpoint(PgProtocol):
|
||||
http_port=self.http_port,
|
||||
tenant_id=self.tenant_id,
|
||||
safekeepers=self.active_safekeepers,
|
||||
remote_ext_config=remote_ext_config,
|
||||
)
|
||||
self.running = True
|
||||
|
||||
@@ -2413,6 +2469,17 @@ class Endpoint(PgProtocol):
|
||||
|
||||
return self
|
||||
|
||||
def respec(self, **kwargs):
|
||||
"""Update the endpoint.json file used by control_plane."""
|
||||
# Read config
|
||||
config_path = os.path.join(self.endpoint_path(), "endpoint.json")
|
||||
with open(config_path, "r") as f:
|
||||
data_dict = json.load(f)
|
||||
|
||||
# Write it back updated
|
||||
with open(config_path, "w") as file:
|
||||
json.dump(dict(data_dict, **kwargs), file, indent=4)
|
||||
|
||||
def stop(self) -> "Endpoint":
|
||||
"""
|
||||
Stop the Postgres instance if it's running.
|
||||
@@ -2450,6 +2517,7 @@ class Endpoint(PgProtocol):
|
||||
hot_standby: bool = False,
|
||||
lsn: Optional[Lsn] = None,
|
||||
config_lines: Optional[List[str]] = None,
|
||||
remote_ext_config: Optional[str] = None,
|
||||
) -> "Endpoint":
|
||||
"""
|
||||
Create an endpoint, apply config, and start Postgres.
|
||||
@@ -2464,7 +2532,7 @@ class Endpoint(PgProtocol):
|
||||
config_lines=config_lines,
|
||||
hot_standby=hot_standby,
|
||||
lsn=lsn,
|
||||
).start()
|
||||
).start(remote_ext_config=remote_ext_config)
|
||||
|
||||
log.info(f"Postgres startup took {time.time() - started_at} seconds")
|
||||
|
||||
@@ -2498,6 +2566,7 @@ class EndpointFactory:
|
||||
lsn: Optional[Lsn] = None,
|
||||
hot_standby: bool = False,
|
||||
config_lines: Optional[List[str]] = None,
|
||||
remote_ext_config: Optional[str] = None,
|
||||
) -> Endpoint:
|
||||
ep = Endpoint(
|
||||
self.env,
|
||||
@@ -2514,6 +2583,7 @@ class EndpointFactory:
|
||||
hot_standby=hot_standby,
|
||||
config_lines=config_lines,
|
||||
lsn=lsn,
|
||||
remote_ext_config=remote_ext_config,
|
||||
)
|
||||
|
||||
def create(
|
||||
|
||||
@@ -342,6 +342,11 @@ class PageserverHttpClient(requests.Session):
|
||||
return res_json
|
||||
|
||||
def timeline_delete(self, tenant_id: TenantId, timeline_id: TimelineId, **kwargs):
|
||||
"""
|
||||
Note that deletion is not instant, it is scheduled and performed mostly in the background.
|
||||
So if you need to wait for it to complete use `timeline_delete_wait_completed`.
|
||||
For longer description consult with pageserver openapi spec.
|
||||
"""
|
||||
res = self.delete(
|
||||
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}", **kwargs
|
||||
)
|
||||
|
||||
@@ -193,19 +193,30 @@ def wait_for_upload_queue_empty(
|
||||
time.sleep(0.2)
|
||||
|
||||
|
||||
def assert_timeline_detail_404(
|
||||
def wait_timeline_detail_404(
|
||||
pageserver_http: PageserverHttpClient, tenant_id: TenantId, timeline_id: TimelineId
|
||||
):
|
||||
last_exc = None
|
||||
for _ in range(2):
|
||||
time.sleep(0.250)
|
||||
try:
|
||||
data = pageserver_http.timeline_detail(tenant_id, timeline_id)
|
||||
log.error(f"detail {data}")
|
||||
except PageserverApiException as e:
|
||||
log.debug(e)
|
||||
if e.status_code == 404:
|
||||
return
|
||||
|
||||
last_exc = e
|
||||
|
||||
raise last_exc or RuntimeError(f"Timeline wasnt deleted in time, state: {data['state']}")
|
||||
|
||||
|
||||
def timeline_delete_wait_completed(
|
||||
pageserver_http: PageserverHttpClient,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
**delete_args,
|
||||
):
|
||||
"""Asserts that timeline_detail returns 404, or dumps the detail."""
|
||||
try:
|
||||
data = pageserver_http.timeline_detail(tenant_id, timeline_id)
|
||||
log.error(f"detail {data}")
|
||||
except PageserverApiException as e:
|
||||
log.error(e)
|
||||
if e.status_code == 404:
|
||||
return
|
||||
else:
|
||||
raise
|
||||
raise Exception("detail succeeded (it should return 404)")
|
||||
pageserver_http.timeline_delete(tenant_id=tenant_id, timeline_id=timeline_id, **delete_args)
|
||||
wait_timeline_detail_404(pageserver_http, tenant_id, timeline_id)
|
||||
|
||||
@@ -89,6 +89,9 @@ class TenantId(Id):
|
||||
def __repr__(self) -> str:
|
||||
return f'`TenantId("{self.id.hex()}")'
|
||||
|
||||
def __str__(self) -> str:
|
||||
return self.id.hex()
|
||||
|
||||
|
||||
class TimelineId(Id):
|
||||
def __repr__(self) -> str:
|
||||
|
||||
@@ -32,13 +32,18 @@ def test_startup_simple(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenc
|
||||
|
||||
env.neon_cli.create_branch("test_startup")
|
||||
|
||||
endpoint = None
|
||||
|
||||
# We do two iterations so we can see if the second startup is faster. It should
|
||||
# be because the compute node should already be configured with roles, databases,
|
||||
# extensions, etc from the first run.
|
||||
for i in range(2):
|
||||
# Start
|
||||
with zenbenchmark.record_duration(f"{i}_start_and_select"):
|
||||
endpoint = env.endpoints.create_start("test_startup")
|
||||
if endpoint:
|
||||
endpoint.start()
|
||||
else:
|
||||
endpoint = env.endpoints.create_start("test_startup")
|
||||
endpoint.safe_psql("select 1;")
|
||||
|
||||
# Get metrics
|
||||
@@ -57,6 +62,9 @@ def test_startup_simple(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenc
|
||||
# Stop so we can restart
|
||||
endpoint.stop()
|
||||
|
||||
# Imitate optimizations that console would do for the second start
|
||||
endpoint.respec(skip_pg_catalog_updates=True)
|
||||
|
||||
|
||||
# This test sometimes runs for longer than the global 5 minute timeout.
|
||||
@pytest.mark.timeout(600)
|
||||
|
||||
@@ -1,2 +1,2 @@
|
||||
pg8000==1.29.4
|
||||
pg8000==1.29.8
|
||||
scramp>=1.4.3
|
||||
|
||||
@@ -396,9 +396,9 @@ checksum = "b7e5500299e16ebb147ae15a00a942af264cf3688f47923b8fc2cd5858f23ad3"
|
||||
|
||||
[[package]]
|
||||
name = "openssl"
|
||||
version = "0.10.52"
|
||||
version = "0.10.55"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "01b8574602df80f7b85fdfc5392fa884a4e3b3f4f35402c070ab34c3d3f78d56"
|
||||
checksum = "345df152bc43501c5eb9e4654ff05f794effb78d4efe3d53abc158baddc0703d"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
"cfg-if",
|
||||
@@ -428,9 +428,9 @@ checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf"
|
||||
|
||||
[[package]]
|
||||
name = "openssl-sys"
|
||||
version = "0.9.87"
|
||||
version = "0.9.90"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8e17f59264b2809d77ae94f0e1ebabc434773f370d6ca667bd223ea10e06cc7e"
|
||||
checksum = "374533b0e45f3a7ced10fcaeccca020e66656bc03dac384f852e4e5a7a8104a6"
|
||||
dependencies = [
|
||||
"cc",
|
||||
"libc",
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
FROM rust:1.69
|
||||
FROM rust:1.70
|
||||
WORKDIR /source
|
||||
|
||||
COPY . .
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user