mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-16 20:50:37 +00:00
Compare commits
129 Commits
problame/d
...
release-20
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
5c1f25b11a | ||
|
|
c19cb7f386 | ||
|
|
45bf76eb05 | ||
|
|
a415670bc3 | ||
|
|
cf5cfe6d71 | ||
|
|
d733bc54b8 | ||
|
|
814abd9f84 | ||
|
|
75ffe34b17 | ||
|
|
d2aa31f0ce | ||
|
|
22f9ea5fe2 | ||
|
|
d0711d0896 | ||
|
|
271f6a6e99 | ||
|
|
a64dd3ecb5 | ||
|
|
bf46237fc2 | ||
|
|
41d364a8f1 | ||
|
|
fa54a57ca2 | ||
|
|
1c1bb904ed | ||
|
|
b26c837ed6 | ||
|
|
ac9c7e8c4a | ||
|
|
f1b174dc6a | ||
|
|
9d714a8413 | ||
|
|
6c84cbbb58 | ||
|
|
1300dc9239 | ||
|
|
018c8b0e2b | ||
|
|
b52389f228 | ||
|
|
5a123b56e5 | ||
|
|
7456e5b71c | ||
|
|
4b97d31b16 | ||
|
|
9798737ec6 | ||
|
|
35ecb139dc | ||
|
|
278d0f117d | ||
|
|
c30b9e6eb1 | ||
|
|
82a4777046 | ||
|
|
6efea43449 | ||
|
|
f14895b48e | ||
|
|
923ade3dd7 | ||
|
|
b04e711975 | ||
|
|
afd0a6b39a | ||
|
|
99752286d8 | ||
|
|
15df93363c | ||
|
|
bc0ab741af | ||
|
|
51d9dfeaa3 | ||
|
|
f63cb18155 | ||
|
|
0de603d88e | ||
|
|
240913912a | ||
|
|
91a4ea0de2 | ||
|
|
8608704f49 | ||
|
|
efef68ce99 | ||
|
|
8daefd24da | ||
|
|
46cc8b7982 | ||
|
|
38cd90dd0c | ||
|
|
a51b269f15 | ||
|
|
43bf6d0a0f | ||
|
|
15273a9b66 | ||
|
|
78aca668d0 | ||
|
|
acbf4148ea | ||
|
|
6508540561 | ||
|
|
a41b5244a8 | ||
|
|
2b3189be95 | ||
|
|
248563c595 | ||
|
|
14cd6ca933 | ||
|
|
eb36403e71 | ||
|
|
3c6f779698 | ||
|
|
f67f0c1c11 | ||
|
|
edb02d3299 | ||
|
|
664a69e65b | ||
|
|
478322ebf9 | ||
|
|
802f174072 | ||
|
|
47f9890bae | ||
|
|
262265daad | ||
|
|
300da5b872 | ||
|
|
7b22b5c433 | ||
|
|
ffca97bc1e | ||
|
|
cb356f3259 | ||
|
|
c85374295f | ||
|
|
4992160677 | ||
|
|
bd535b3371 | ||
|
|
d90c5a03af | ||
|
|
2d02cc9079 | ||
|
|
49ad94b99f | ||
|
|
948a217398 | ||
|
|
125381eae7 | ||
|
|
cd01bbc715 | ||
|
|
d8b5e3b88d | ||
|
|
06d25f2186 | ||
|
|
f759b561f3 | ||
|
|
ece0555600 | ||
|
|
73ea0a0b01 | ||
|
|
d8f6d6fd6f | ||
|
|
d24de169a7 | ||
|
|
0816168296 | ||
|
|
277b44d57a | ||
|
|
68c2c3880e | ||
|
|
49da498f65 | ||
|
|
2c76ba3dd7 | ||
|
|
dbe3dc69ad | ||
|
|
8e5bb3ed49 | ||
|
|
ab0be7b8da | ||
|
|
b4c55f5d24 | ||
|
|
ede70d833c | ||
|
|
70c3d18bb0 | ||
|
|
7a491f52c4 | ||
|
|
323c4ecb4f | ||
|
|
3d2466607e | ||
|
|
ed478b39f4 | ||
|
|
91585a558d | ||
|
|
93467eae1f | ||
|
|
f3aac81d19 | ||
|
|
979ad60c19 | ||
|
|
9316cb1b1f | ||
|
|
e7939a527a | ||
|
|
36d26665e1 | ||
|
|
873347f977 | ||
|
|
e814ac16f9 | ||
|
|
ad3055d386 | ||
|
|
94e03eb452 | ||
|
|
380f26ef79 | ||
|
|
3c5b7f59d7 | ||
|
|
fee89f80b5 | ||
|
|
41cce8eaf1 | ||
|
|
f88fe0218d | ||
|
|
cc856eca85 | ||
|
|
cf350c6002 | ||
|
|
0ce6b6a0a3 | ||
|
|
73f247d537 | ||
|
|
960be82183 | ||
|
|
806e5a6c19 | ||
|
|
8d5df07cce | ||
|
|
df7a9d1407 |
10
.github/ansible/prod.ap-southeast-1.hosts.yaml
vendored
10
.github/ansible/prod.ap-southeast-1.hosts.yaml
vendored
@@ -8,6 +8,16 @@ storage:
|
||||
pg_distrib_dir: /usr/local
|
||||
metric_collection_endpoint: http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events
|
||||
metric_collection_interval: 10min
|
||||
disk_usage_based_eviction:
|
||||
max_usage_pct: 85 # TODO: decrease to 80 after all pageservers are below 80
|
||||
min_avail_bytes: 0
|
||||
period: "10s"
|
||||
tenant_config:
|
||||
eviction_policy:
|
||||
kind: "LayerAccessThreshold"
|
||||
period: "10m"
|
||||
threshold: &default_eviction_threshold "24h"
|
||||
evictions_low_residence_duration_metric_threshold: *default_eviction_threshold
|
||||
remote_storage:
|
||||
bucket_name: "{{ bucket_name }}"
|
||||
bucket_region: "{{ bucket_region }}"
|
||||
|
||||
10
.github/ansible/prod.eu-central-1.hosts.yaml
vendored
10
.github/ansible/prod.eu-central-1.hosts.yaml
vendored
@@ -8,6 +8,16 @@ storage:
|
||||
pg_distrib_dir: /usr/local
|
||||
metric_collection_endpoint: http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events
|
||||
metric_collection_interval: 10min
|
||||
disk_usage_based_eviction:
|
||||
max_usage_pct: 85 # TODO: decrease to 80 after all pageservers are below 80
|
||||
min_avail_bytes: 0
|
||||
period: "10s"
|
||||
tenant_config:
|
||||
eviction_policy:
|
||||
kind: "LayerAccessThreshold"
|
||||
period: "10m"
|
||||
threshold: &default_eviction_threshold "24h"
|
||||
evictions_low_residence_duration_metric_threshold: *default_eviction_threshold
|
||||
remote_storage:
|
||||
bucket_name: "{{ bucket_name }}"
|
||||
bucket_region: "{{ bucket_region }}"
|
||||
|
||||
10
.github/ansible/prod.us-east-2.hosts.yaml
vendored
10
.github/ansible/prod.us-east-2.hosts.yaml
vendored
@@ -8,6 +8,16 @@ storage:
|
||||
pg_distrib_dir: /usr/local
|
||||
metric_collection_endpoint: http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events
|
||||
metric_collection_interval: 10min
|
||||
disk_usage_based_eviction:
|
||||
max_usage_pct: 85 # TODO: decrease to 80 after all pageservers are below 80
|
||||
min_avail_bytes: 0
|
||||
period: "10s"
|
||||
tenant_config:
|
||||
eviction_policy:
|
||||
kind: "LayerAccessThreshold"
|
||||
period: "10m"
|
||||
threshold: &default_eviction_threshold "24h"
|
||||
evictions_low_residence_duration_metric_threshold: *default_eviction_threshold
|
||||
remote_storage:
|
||||
bucket_name: "{{ bucket_name }}"
|
||||
bucket_region: "{{ bucket_region }}"
|
||||
|
||||
10
.github/ansible/prod.us-west-2.hosts.yaml
vendored
10
.github/ansible/prod.us-west-2.hosts.yaml
vendored
@@ -8,6 +8,16 @@ storage:
|
||||
pg_distrib_dir: /usr/local
|
||||
metric_collection_endpoint: http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events
|
||||
metric_collection_interval: 10min
|
||||
disk_usage_based_eviction:
|
||||
max_usage_pct: 85 # TODO: decrease to 80 after all pageservers are below 80
|
||||
min_avail_bytes: 0
|
||||
period: "10s"
|
||||
tenant_config:
|
||||
eviction_policy:
|
||||
kind: "LayerAccessThreshold"
|
||||
period: "10m"
|
||||
threshold: &default_eviction_threshold "24h"
|
||||
evictions_low_residence_duration_metric_threshold: *default_eviction_threshold
|
||||
remote_storage:
|
||||
bucket_name: "{{ bucket_name }}"
|
||||
bucket_region: "{{ bucket_region }}"
|
||||
|
||||
7
.github/ansible/staging.eu-west-1.hosts.yaml
vendored
7
.github/ansible/staging.eu-west-1.hosts.yaml
vendored
@@ -10,17 +10,14 @@ storage:
|
||||
metric_collection_interval: 10min
|
||||
disk_usage_based_eviction:
|
||||
max_usage_pct: 80
|
||||
# TODO: learn typical resident-size growth rate [GiB/minute] and configure
|
||||
# min_avail_bytes such that we have X minutes of headroom.
|
||||
min_avail_bytes: 0
|
||||
# We assume that the worst-case growth rate is small enough that we can
|
||||
# catch above-threshold conditions by checking every 10s.
|
||||
period: "10s"
|
||||
tenant_config:
|
||||
eviction_policy:
|
||||
kind: "LayerAccessThreshold"
|
||||
period: "20m"
|
||||
threshold: "20m"
|
||||
threshold: &default_eviction_threshold "20m"
|
||||
evictions_low_residence_duration_metric_threshold: *default_eviction_threshold
|
||||
remote_storage:
|
||||
bucket_name: "{{ bucket_name }}"
|
||||
bucket_region: "{{ bucket_region }}"
|
||||
|
||||
7
.github/ansible/staging.us-east-2.hosts.yaml
vendored
7
.github/ansible/staging.us-east-2.hosts.yaml
vendored
@@ -10,17 +10,14 @@ storage:
|
||||
metric_collection_interval: 10min
|
||||
disk_usage_based_eviction:
|
||||
max_usage_pct: 80
|
||||
# TODO: learn typical resident-size growth rate [GiB/minute] and configure
|
||||
# min_avail_bytes such that we have X minutes of headroom.
|
||||
min_avail_bytes: 0
|
||||
# We assume that the worst-case growth rate is small enough that we can
|
||||
# catch above-threshold conditions by checking every 10s.
|
||||
period: "10s"
|
||||
tenant_config:
|
||||
eviction_policy:
|
||||
kind: "LayerAccessThreshold"
|
||||
period: "20m"
|
||||
threshold: "20m"
|
||||
threshold: &default_eviction_threshold "20m"
|
||||
evictions_low_residence_duration_metric_threshold: *default_eviction_threshold
|
||||
remote_storage:
|
||||
bucket_name: "{{ bucket_name }}"
|
||||
bucket_region: "{{ bucket_region }}"
|
||||
|
||||
@@ -30,10 +30,9 @@ settings:
|
||||
|
||||
# -- Additional labels for neon-proxy pods
|
||||
podLabels:
|
||||
zenith_service: proxy-scram
|
||||
zenith_env: dev
|
||||
zenith_region: eu-west-1
|
||||
zenith_region_slug: eu-west-1
|
||||
neon_service: proxy-scram
|
||||
neon_env: dev
|
||||
neon_region: eu-west-1
|
||||
|
||||
exposedService:
|
||||
annotations:
|
||||
|
||||
@@ -15,10 +15,9 @@ settings:
|
||||
|
||||
# -- Additional labels for neon-proxy-link pods
|
||||
podLabels:
|
||||
zenith_service: proxy
|
||||
zenith_env: dev
|
||||
zenith_region: us-east-2
|
||||
zenith_region_slug: us-east-2
|
||||
neon_service: proxy
|
||||
neon_env: dev
|
||||
neon_region: us-east-2
|
||||
|
||||
service:
|
||||
type: LoadBalancer
|
||||
|
||||
@@ -15,10 +15,9 @@ settings:
|
||||
|
||||
# -- Additional labels for neon-proxy pods
|
||||
podLabels:
|
||||
zenith_service: proxy-scram-legacy
|
||||
zenith_env: dev
|
||||
zenith_region: us-east-2
|
||||
zenith_region_slug: us-east-2
|
||||
neon_service: proxy-scram-legacy
|
||||
neon_env: dev
|
||||
neon_region: us-east-2
|
||||
|
||||
exposedService:
|
||||
annotations:
|
||||
|
||||
@@ -30,10 +30,9 @@ settings:
|
||||
|
||||
# -- Additional labels for neon-proxy pods
|
||||
podLabels:
|
||||
zenith_service: proxy-scram
|
||||
zenith_env: dev
|
||||
zenith_region: us-east-2
|
||||
zenith_region_slug: us-east-2
|
||||
neon_service: proxy-scram
|
||||
neon_env: dev
|
||||
neon_region: us-east-2
|
||||
|
||||
exposedService:
|
||||
annotations:
|
||||
|
||||
@@ -31,10 +31,9 @@ settings:
|
||||
|
||||
# -- Additional labels for neon-proxy pods
|
||||
podLabels:
|
||||
zenith_service: proxy-scram
|
||||
zenith_env: prod
|
||||
zenith_region: ap-southeast-1
|
||||
zenith_region_slug: ap-southeast-1
|
||||
neon_service: proxy-scram
|
||||
neon_env: prod
|
||||
neon_region: ap-southeast-1
|
||||
|
||||
exposedService:
|
||||
annotations:
|
||||
|
||||
@@ -31,10 +31,9 @@ settings:
|
||||
|
||||
# -- Additional labels for neon-proxy pods
|
||||
podLabels:
|
||||
zenith_service: proxy-scram
|
||||
zenith_env: prod
|
||||
zenith_region: eu-central-1
|
||||
zenith_region_slug: eu-central-1
|
||||
neon_service: proxy-scram
|
||||
neon_env: prod
|
||||
neon_region: eu-central-1
|
||||
|
||||
exposedService:
|
||||
annotations:
|
||||
|
||||
@@ -13,10 +13,9 @@ settings:
|
||||
|
||||
# -- Additional labels for zenith-proxy pods
|
||||
podLabels:
|
||||
zenith_service: proxy
|
||||
zenith_env: production
|
||||
zenith_region: us-east-2
|
||||
zenith_region_slug: us-east-2
|
||||
neon_service: proxy
|
||||
neon_env: production
|
||||
neon_region: us-east-2
|
||||
|
||||
service:
|
||||
type: LoadBalancer
|
||||
|
||||
@@ -31,10 +31,9 @@ settings:
|
||||
|
||||
# -- Additional labels for neon-proxy pods
|
||||
podLabels:
|
||||
zenith_service: proxy-scram
|
||||
zenith_env: prod
|
||||
zenith_region: us-east-2
|
||||
zenith_region_slug: us-east-2
|
||||
neon_service: proxy-scram
|
||||
neon_env: prod
|
||||
neon_region: us-east-2
|
||||
|
||||
exposedService:
|
||||
annotations:
|
||||
|
||||
@@ -31,10 +31,9 @@ settings:
|
||||
|
||||
# -- Additional labels for neon-proxy pods
|
||||
podLabels:
|
||||
zenith_service: proxy-scram
|
||||
zenith_env: prod
|
||||
zenith_region: us-west-2
|
||||
zenith_region_slug: us-west-2
|
||||
neon_service: proxy-scram
|
||||
neon_env: prod
|
||||
neon_region: us-west-2
|
||||
|
||||
exposedService:
|
||||
annotations:
|
||||
|
||||
@@ -31,10 +31,9 @@ settings:
|
||||
|
||||
# -- Additional labels for neon-proxy pods
|
||||
podLabels:
|
||||
zenith_service: proxy-scram
|
||||
zenith_env: prod
|
||||
zenith_region: us-west-2
|
||||
zenith_region_slug: us-west-2
|
||||
neon_service: proxy-scram
|
||||
neon_env: prod
|
||||
neon_region: us-west-2
|
||||
|
||||
exposedService:
|
||||
annotations:
|
||||
|
||||
4
.github/pull_request_template.md
vendored
4
.github/pull_request_template.md
vendored
@@ -3,8 +3,12 @@
|
||||
## Issue ticket number and link
|
||||
|
||||
## Checklist before requesting a review
|
||||
|
||||
- [ ] I have performed a self-review of my code.
|
||||
- [ ] If it is a core feature, I have added thorough tests.
|
||||
- [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard?
|
||||
- [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section.
|
||||
|
||||
## Checklist before merging
|
||||
|
||||
- [ ] Do not forget to reformat commit message to not include the above checklist
|
||||
|
||||
27
.github/workflows/build_and_test.yml
vendored
27
.github/workflows/build_and_test.yml
vendored
@@ -184,10 +184,10 @@ jobs:
|
||||
CARGO_FEATURES="--features testing"
|
||||
if [[ $BUILD_TYPE == "debug" ]]; then
|
||||
cov_prefix="scripts/coverage --profraw-prefix=$GITHUB_JOB --dir=/tmp/coverage run"
|
||||
CARGO_FLAGS="--locked $CARGO_FEATURES"
|
||||
CARGO_FLAGS="--locked"
|
||||
elif [[ $BUILD_TYPE == "release" ]]; then
|
||||
cov_prefix=""
|
||||
CARGO_FLAGS="--locked --release $CARGO_FEATURES"
|
||||
CARGO_FLAGS="--locked --release"
|
||||
fi
|
||||
echo "cov_prefix=${cov_prefix}" >> $GITHUB_ENV
|
||||
echo "CARGO_FEATURES=${CARGO_FEATURES}" >> $GITHUB_ENV
|
||||
@@ -240,11 +240,18 @@ jobs:
|
||||
|
||||
- name: Run cargo build
|
||||
run: |
|
||||
${cov_prefix} mold -run cargo build $CARGO_FLAGS --bins --tests
|
||||
${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests
|
||||
|
||||
- name: Run cargo test
|
||||
run: |
|
||||
${cov_prefix} cargo test $CARGO_FLAGS
|
||||
${cov_prefix} cargo test $CARGO_FLAGS $CARGO_FEATURES
|
||||
|
||||
# Run separate tests for real S3
|
||||
export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
|
||||
export REMOTE_STORAGE_S3_BUCKET=neon-github-public-dev
|
||||
export REMOTE_STORAGE_S3_REGION=eu-central-1
|
||||
# Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
|
||||
${cov_prefix} cargo test $CARGO_FLAGS --package remote_storage --test pagination_tests -- s3_pagination_should_work --exact
|
||||
|
||||
- name: Install rust binaries
|
||||
run: |
|
||||
@@ -268,7 +275,7 @@ jobs:
|
||||
mkdir -p /tmp/neon/test_bin/
|
||||
|
||||
test_exe_paths=$(
|
||||
${cov_prefix} cargo test $CARGO_FLAGS --message-format=json --no-run |
|
||||
${cov_prefix} cargo test $CARGO_FLAGS $CARGO_FEATURES --message-format=json --no-run |
|
||||
jq -r '.executable | select(. != null)'
|
||||
)
|
||||
for bin in $test_exe_paths; do
|
||||
@@ -891,6 +898,16 @@ jobs:
|
||||
needs: [ push-docker-hub, tag, regress-tests ]
|
||||
if: ( github.ref_name == 'main' || github.ref_name == 'release' ) && github.event_name != 'workflow_dispatch'
|
||||
steps:
|
||||
- name: Fix git ownership
|
||||
run: |
|
||||
# Workaround for `fatal: detected dubious ownership in repository at ...`
|
||||
#
|
||||
# Use both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers
|
||||
# Ref https://github.com/actions/checkout/issues/785
|
||||
#
|
||||
git config --global --add safe.directory ${{ github.workspace }}
|
||||
git config --global --add safe.directory ${GITHUB_WORKSPACE}
|
||||
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
|
||||
4
.github/workflows/neon_extra_builds.yml
vendored
4
.github/workflows/neon_extra_builds.yml
vendored
@@ -53,14 +53,14 @@ jobs:
|
||||
uses: actions/cache@v3
|
||||
with:
|
||||
path: pg_install/v14
|
||||
key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
|
||||
key: v1-${{ runner.os }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
|
||||
|
||||
- name: Cache postgres v15 build
|
||||
id: cache_pg_15
|
||||
uses: actions/cache@v3
|
||||
with:
|
||||
path: pg_install/v15
|
||||
key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
|
||||
key: v1-${{ runner.os }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
|
||||
|
||||
- name: Set extra env for macOS
|
||||
run: |
|
||||
|
||||
22
Cargo.lock
generated
22
Cargo.lock
generated
@@ -3086,6 +3086,7 @@ dependencies = [
|
||||
"serde",
|
||||
"serde_json",
|
||||
"tempfile",
|
||||
"test-context",
|
||||
"tokio",
|
||||
"tokio-util",
|
||||
"toml_edit",
|
||||
@@ -3889,6 +3890,27 @@ dependencies = [
|
||||
"winapi-util",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "test-context"
|
||||
version = "0.1.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "055831a02a4f5aa28fede67f2902014273eb8c21b958ac5ebbd59b71ef30dbc3"
|
||||
dependencies = [
|
||||
"async-trait",
|
||||
"futures",
|
||||
"test-context-macros",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "test-context-macros"
|
||||
version = "0.1.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8901a55b0a7a06ebc4a674dcca925170da8e613fa3b163a1df804ed10afb154d"
|
||||
dependencies = [
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "textwrap"
|
||||
version = "0.16.0"
|
||||
|
||||
@@ -97,6 +97,7 @@ strum_macros = "0.24"
|
||||
svg_fmt = "0.4.1"
|
||||
sync_wrapper = "0.1.2"
|
||||
tar = "0.4"
|
||||
test-context = "0.1"
|
||||
thiserror = "1.0"
|
||||
tls-listener = { version = "0.6", features = ["rustls", "hyper-h1"] }
|
||||
tokio = { version = "1.17", features = ["macros"] }
|
||||
|
||||
@@ -40,6 +40,8 @@ pacman -S base-devel readline zlib libseccomp openssl clang \
|
||||
postgresql-libs cmake postgresql protobuf
|
||||
```
|
||||
|
||||
Building Neon requires 3.15+ version of `protoc` (protobuf-compiler). If your distribution provides an older version, you can install a newer version from [here](https://github.com/protocolbuffers/protobuf/releases).
|
||||
|
||||
2. [Install Rust](https://www.rust-lang.org/tools/install)
|
||||
```
|
||||
# recommended approach from https://www.rust-lang.org/tools/install
|
||||
|
||||
@@ -203,13 +203,14 @@ fn main() -> Result<()> {
|
||||
if delay_exit {
|
||||
info!("giving control plane 30s to collect the error before shutdown");
|
||||
thread::sleep(Duration::from_secs(30));
|
||||
info!("shutting down");
|
||||
}
|
||||
|
||||
info!("shutting down tracing");
|
||||
// Shutdown trace pipeline gracefully, so that it has a chance to send any
|
||||
// pending traces before we exit.
|
||||
tracing_utils::shutdown_tracing();
|
||||
|
||||
info!("shutting down");
|
||||
exit(exit_code.unwrap_or(1))
|
||||
}
|
||||
|
||||
|
||||
@@ -74,18 +74,9 @@ impl GenericOption {
|
||||
/// Represent `GenericOption` as configuration option.
|
||||
pub fn to_pg_setting(&self) -> String {
|
||||
if let Some(val) = &self.value {
|
||||
// TODO: check in the console DB that we don't have these settings
|
||||
// set for any non-deleted project and drop this override.
|
||||
let name = match self.name.as_str() {
|
||||
"safekeepers" => "neon.safekeepers",
|
||||
"wal_acceptor_reconnect" => "neon.safekeeper_reconnect_timeout",
|
||||
"wal_acceptor_connection_timeout" => "neon.safekeeper_connection_timeout",
|
||||
it => it,
|
||||
};
|
||||
|
||||
match self.vartype.as_ref() {
|
||||
"string" => format!("{} = '{}'", name, escape_conf_value(val)),
|
||||
_ => format!("{} = {}", name, val),
|
||||
"string" => format!("{} = '{}'", self.name, escape_conf_value(val)),
|
||||
_ => format!("{} = {}", self.name, val),
|
||||
}
|
||||
} else {
|
||||
self.name.to_owned()
|
||||
|
||||
@@ -90,7 +90,6 @@ impl ComputeControlPlane {
|
||||
timeline_id,
|
||||
lsn,
|
||||
tenant_id,
|
||||
uses_wal_proposer: false,
|
||||
pg_version,
|
||||
});
|
||||
|
||||
@@ -115,7 +114,6 @@ pub struct PostgresNode {
|
||||
pub timeline_id: TimelineId,
|
||||
pub lsn: Option<Lsn>, // if it's a read-only node. None for primary
|
||||
pub tenant_id: TenantId,
|
||||
uses_wal_proposer: bool,
|
||||
pg_version: u32,
|
||||
}
|
||||
|
||||
@@ -149,7 +147,6 @@ impl PostgresNode {
|
||||
let port: u16 = conf.parse_field("port", &context)?;
|
||||
let timeline_id: TimelineId = conf.parse_field("neon.timeline_id", &context)?;
|
||||
let tenant_id: TenantId = conf.parse_field("neon.tenant_id", &context)?;
|
||||
let uses_wal_proposer = conf.get("neon.safekeepers").is_some();
|
||||
|
||||
// Read postgres version from PG_VERSION file to determine which postgres version binary to use.
|
||||
// If it doesn't exist, assume broken data directory and use default pg version.
|
||||
@@ -172,7 +169,6 @@ impl PostgresNode {
|
||||
timeline_id,
|
||||
lsn: recovery_target_lsn,
|
||||
tenant_id,
|
||||
uses_wal_proposer,
|
||||
pg_version,
|
||||
})
|
||||
}
|
||||
@@ -364,7 +360,7 @@ impl PostgresNode {
|
||||
fn load_basebackup(&self, auth_token: &Option<String>) -> Result<()> {
|
||||
let backup_lsn = if let Some(lsn) = self.lsn {
|
||||
Some(lsn)
|
||||
} else if self.uses_wal_proposer {
|
||||
} else if !self.env.safekeepers.is_empty() {
|
||||
// LSN 0 means that it is bootstrap and we need to download just
|
||||
// latest data from the pageserver. That is a bit clumsy but whole bootstrap
|
||||
// procedure evolves quite actively right now, so let's think about it again
|
||||
@@ -403,7 +399,7 @@ impl PostgresNode {
|
||||
|
||||
fn pg_ctl(&self, args: &[&str], auth_token: &Option<String>) -> Result<()> {
|
||||
let pg_ctl_path = self.env.pg_bin_dir(self.pg_version)?.join("pg_ctl");
|
||||
let mut cmd = Command::new(pg_ctl_path);
|
||||
let mut cmd = Command::new(&pg_ctl_path);
|
||||
cmd.args(
|
||||
[
|
||||
&[
|
||||
@@ -432,7 +428,9 @@ impl PostgresNode {
|
||||
cmd.env("NEON_AUTH_TOKEN", token);
|
||||
}
|
||||
|
||||
let pg_ctl = cmd.output().context("pg_ctl failed")?;
|
||||
let pg_ctl = cmd
|
||||
.output()
|
||||
.context(format!("{} failed", pg_ctl_path.display()))?;
|
||||
if !pg_ctl.status.success() {
|
||||
anyhow::bail!(
|
||||
"pg_ctl failed, exit code: {}, stdout: {}, stderr: {}",
|
||||
|
||||
@@ -156,7 +156,7 @@ impl SafekeeperNode {
|
||||
}
|
||||
|
||||
background_process::start_process(
|
||||
&format!("safekeeper {id}"),
|
||||
&format!("safekeeper-{id}"),
|
||||
&datadir,
|
||||
&self.env.safekeeper_bin(),
|
||||
&args,
|
||||
|
||||
@@ -936,35 +936,40 @@ impl<'a> BeMessage<'a> {
|
||||
}
|
||||
}
|
||||
|
||||
// Neon extension of postgres replication protocol
|
||||
// See NEON_STATUS_UPDATE_TAG_BYTE
|
||||
/// Feedback pageserver sends to safekeeper and safekeeper resends to compute.
|
||||
/// Serialized in custom flexible key/value format. In replication protocol, it
|
||||
/// is marked with NEON_STATUS_UPDATE_TAG_BYTE to differentiate from postgres
|
||||
/// Standby status update / Hot standby feedback messages.
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
||||
pub struct ReplicationFeedback {
|
||||
// Last known size of the timeline. Used to enforce timeline size limit.
|
||||
pub struct PageserverFeedback {
|
||||
/// Last known size of the timeline. Used to enforce timeline size limit.
|
||||
pub current_timeline_size: u64,
|
||||
// Parts of StandbyStatusUpdate we resend to compute via safekeeper
|
||||
pub ps_writelsn: u64,
|
||||
pub ps_applylsn: u64,
|
||||
pub ps_flushlsn: u64,
|
||||
pub ps_replytime: SystemTime,
|
||||
/// LSN last received and ingested by the pageserver.
|
||||
pub last_received_lsn: u64,
|
||||
/// LSN up to which data is persisted by the pageserver to its local disc.
|
||||
pub disk_consistent_lsn: u64,
|
||||
/// LSN up to which data is persisted by the pageserver on s3; safekeepers
|
||||
/// consider WAL before it can be removed.
|
||||
pub remote_consistent_lsn: u64,
|
||||
pub replytime: SystemTime,
|
||||
}
|
||||
|
||||
// NOTE: Do not forget to increment this number when adding new fields to ReplicationFeedback.
|
||||
// NOTE: Do not forget to increment this number when adding new fields to PageserverFeedback.
|
||||
// Do not remove previously available fields because this might be backwards incompatible.
|
||||
pub const REPLICATION_FEEDBACK_FIELDS_NUMBER: u8 = 5;
|
||||
pub const PAGESERVER_FEEDBACK_FIELDS_NUMBER: u8 = 5;
|
||||
|
||||
impl ReplicationFeedback {
|
||||
pub fn empty() -> ReplicationFeedback {
|
||||
ReplicationFeedback {
|
||||
impl PageserverFeedback {
|
||||
pub fn empty() -> PageserverFeedback {
|
||||
PageserverFeedback {
|
||||
current_timeline_size: 0,
|
||||
ps_writelsn: 0,
|
||||
ps_applylsn: 0,
|
||||
ps_flushlsn: 0,
|
||||
ps_replytime: SystemTime::now(),
|
||||
last_received_lsn: 0,
|
||||
remote_consistent_lsn: 0,
|
||||
disk_consistent_lsn: 0,
|
||||
replytime: SystemTime::now(),
|
||||
}
|
||||
}
|
||||
|
||||
// Serialize ReplicationFeedback using custom format
|
||||
// Serialize PageserverFeedback using custom format
|
||||
// to support protocol extensibility.
|
||||
//
|
||||
// Following layout is used:
|
||||
@@ -974,24 +979,26 @@ impl ReplicationFeedback {
|
||||
// null-terminated string - key,
|
||||
// uint32 - value length in bytes
|
||||
// value itself
|
||||
//
|
||||
// TODO: change serialized fields names once all computes migrate to rename.
|
||||
pub fn serialize(&self, buf: &mut BytesMut) {
|
||||
buf.put_u8(REPLICATION_FEEDBACK_FIELDS_NUMBER); // # of keys
|
||||
buf.put_u8(PAGESERVER_FEEDBACK_FIELDS_NUMBER); // # of keys
|
||||
buf.put_slice(b"current_timeline_size\0");
|
||||
buf.put_i32(8);
|
||||
buf.put_u64(self.current_timeline_size);
|
||||
|
||||
buf.put_slice(b"ps_writelsn\0");
|
||||
buf.put_i32(8);
|
||||
buf.put_u64(self.ps_writelsn);
|
||||
buf.put_u64(self.last_received_lsn);
|
||||
buf.put_slice(b"ps_flushlsn\0");
|
||||
buf.put_i32(8);
|
||||
buf.put_u64(self.ps_flushlsn);
|
||||
buf.put_u64(self.disk_consistent_lsn);
|
||||
buf.put_slice(b"ps_applylsn\0");
|
||||
buf.put_i32(8);
|
||||
buf.put_u64(self.ps_applylsn);
|
||||
buf.put_u64(self.remote_consistent_lsn);
|
||||
|
||||
let timestamp = self
|
||||
.ps_replytime
|
||||
.replytime
|
||||
.duration_since(*PG_EPOCH)
|
||||
.expect("failed to serialize pg_replytime earlier than PG_EPOCH")
|
||||
.as_micros() as i64;
|
||||
@@ -1001,9 +1008,10 @@ impl ReplicationFeedback {
|
||||
buf.put_i64(timestamp);
|
||||
}
|
||||
|
||||
// Deserialize ReplicationFeedback message
|
||||
pub fn parse(mut buf: Bytes) -> ReplicationFeedback {
|
||||
let mut rf = ReplicationFeedback::empty();
|
||||
// Deserialize PageserverFeedback message
|
||||
// TODO: change serialized fields names once all computes migrate to rename.
|
||||
pub fn parse(mut buf: Bytes) -> PageserverFeedback {
|
||||
let mut rf = PageserverFeedback::empty();
|
||||
let nfields = buf.get_u8();
|
||||
for _ in 0..nfields {
|
||||
let key = read_cstr(&mut buf).unwrap();
|
||||
@@ -1016,39 +1024,39 @@ impl ReplicationFeedback {
|
||||
b"ps_writelsn" => {
|
||||
let len = buf.get_i32();
|
||||
assert_eq!(len, 8);
|
||||
rf.ps_writelsn = buf.get_u64();
|
||||
rf.last_received_lsn = buf.get_u64();
|
||||
}
|
||||
b"ps_flushlsn" => {
|
||||
let len = buf.get_i32();
|
||||
assert_eq!(len, 8);
|
||||
rf.ps_flushlsn = buf.get_u64();
|
||||
rf.disk_consistent_lsn = buf.get_u64();
|
||||
}
|
||||
b"ps_applylsn" => {
|
||||
let len = buf.get_i32();
|
||||
assert_eq!(len, 8);
|
||||
rf.ps_applylsn = buf.get_u64();
|
||||
rf.remote_consistent_lsn = buf.get_u64();
|
||||
}
|
||||
b"ps_replytime" => {
|
||||
let len = buf.get_i32();
|
||||
assert_eq!(len, 8);
|
||||
let raw_time = buf.get_i64();
|
||||
if raw_time > 0 {
|
||||
rf.ps_replytime = *PG_EPOCH + Duration::from_micros(raw_time as u64);
|
||||
rf.replytime = *PG_EPOCH + Duration::from_micros(raw_time as u64);
|
||||
} else {
|
||||
rf.ps_replytime = *PG_EPOCH - Duration::from_micros(-raw_time as u64);
|
||||
rf.replytime = *PG_EPOCH - Duration::from_micros(-raw_time as u64);
|
||||
}
|
||||
}
|
||||
_ => {
|
||||
let len = buf.get_i32();
|
||||
warn!(
|
||||
"ReplicationFeedback parse. unknown key {} of len {len}. Skip it.",
|
||||
"PageserverFeedback parse. unknown key {} of len {len}. Skip it.",
|
||||
String::from_utf8_lossy(key.as_ref())
|
||||
);
|
||||
buf.advance(len as usize);
|
||||
}
|
||||
}
|
||||
}
|
||||
trace!("ReplicationFeedback parsed is {:?}", rf);
|
||||
trace!("PageserverFeedback parsed is {:?}", rf);
|
||||
rf
|
||||
}
|
||||
}
|
||||
@@ -1059,33 +1067,33 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_replication_feedback_serialization() {
|
||||
let mut rf = ReplicationFeedback::empty();
|
||||
let mut rf = PageserverFeedback::empty();
|
||||
// Fill rf with some values
|
||||
rf.current_timeline_size = 12345678;
|
||||
// Set rounded time to be able to compare it with deserialized value,
|
||||
// because it is rounded up to microseconds during serialization.
|
||||
rf.ps_replytime = *PG_EPOCH + Duration::from_secs(100_000_000);
|
||||
rf.replytime = *PG_EPOCH + Duration::from_secs(100_000_000);
|
||||
let mut data = BytesMut::new();
|
||||
rf.serialize(&mut data);
|
||||
|
||||
let rf_parsed = ReplicationFeedback::parse(data.freeze());
|
||||
let rf_parsed = PageserverFeedback::parse(data.freeze());
|
||||
assert_eq!(rf, rf_parsed);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_replication_feedback_unknown_key() {
|
||||
let mut rf = ReplicationFeedback::empty();
|
||||
let mut rf = PageserverFeedback::empty();
|
||||
// Fill rf with some values
|
||||
rf.current_timeline_size = 12345678;
|
||||
// Set rounded time to be able to compare it with deserialized value,
|
||||
// because it is rounded up to microseconds during serialization.
|
||||
rf.ps_replytime = *PG_EPOCH + Duration::from_secs(100_000_000);
|
||||
rf.replytime = *PG_EPOCH + Duration::from_secs(100_000_000);
|
||||
let mut data = BytesMut::new();
|
||||
rf.serialize(&mut data);
|
||||
|
||||
// Add an extra field to the buffer and adjust number of keys
|
||||
if let Some(first) = data.first_mut() {
|
||||
*first = REPLICATION_FEEDBACK_FIELDS_NUMBER + 1;
|
||||
*first = PAGESERVER_FEEDBACK_FIELDS_NUMBER + 1;
|
||||
}
|
||||
|
||||
data.put_slice(b"new_field_one\0");
|
||||
@@ -1093,7 +1101,7 @@ mod tests {
|
||||
data.put_u64(42);
|
||||
|
||||
// Parse serialized data and check that new field is not parsed
|
||||
let rf_parsed = ReplicationFeedback::parse(data.freeze());
|
||||
let rf_parsed = PageserverFeedback::parse(data.freeze());
|
||||
assert_eq!(rf, rf_parsed);
|
||||
}
|
||||
|
||||
|
||||
@@ -26,3 +26,4 @@ workspace_hack.workspace = true
|
||||
|
||||
[dev-dependencies]
|
||||
tempfile.workspace = true
|
||||
test-context.workspace = true
|
||||
|
||||
@@ -39,6 +39,9 @@ pub const DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS: u32 = 10;
|
||||
/// ~3500 PUT/COPY/POST/DELETE or 5500 GET/HEAD S3 requests
|
||||
/// https://aws.amazon.com/premiumsupport/knowledge-center/s3-request-limit-avoid-throttling/
|
||||
pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100;
|
||||
/// No limits on the client side, which currenltly means 1000 for AWS S3.
|
||||
/// https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax
|
||||
pub const DEFAULT_MAX_KEYS_PER_LIST_RESPONSE: Option<i32> = None;
|
||||
|
||||
const REMOTE_STORAGE_PREFIX_SEPARATOR: char = '/';
|
||||
|
||||
@@ -64,6 +67,10 @@ impl RemotePath {
|
||||
pub fn object_name(&self) -> Option<&str> {
|
||||
self.0.file_name().and_then(|os_str| os_str.to_str())
|
||||
}
|
||||
|
||||
pub fn join(&self, segment: &Path) -> Self {
|
||||
Self(self.0.join(segment))
|
||||
}
|
||||
}
|
||||
|
||||
/// Storage (potentially remote) API to manage its state.
|
||||
@@ -266,6 +273,7 @@ pub struct S3Config {
|
||||
/// AWS S3 has various limits on its API calls, we need not to exceed those.
|
||||
/// See [`DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT`] for more details.
|
||||
pub concurrency_limit: NonZeroUsize,
|
||||
pub max_keys_per_list_response: Option<i32>,
|
||||
}
|
||||
|
||||
impl Debug for S3Config {
|
||||
@@ -275,6 +283,10 @@ impl Debug for S3Config {
|
||||
.field("bucket_region", &self.bucket_region)
|
||||
.field("prefix_in_bucket", &self.prefix_in_bucket)
|
||||
.field("concurrency_limit", &self.concurrency_limit)
|
||||
.field(
|
||||
"max_keys_per_list_response",
|
||||
&self.max_keys_per_list_response,
|
||||
)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
@@ -303,6 +315,11 @@ impl RemoteStorageConfig {
|
||||
)
|
||||
.context("Failed to parse 'concurrency_limit' as a positive integer")?;
|
||||
|
||||
let max_keys_per_list_response =
|
||||
parse_optional_integer::<i32, _>("max_keys_per_list_response", toml)
|
||||
.context("Failed to parse 'max_keys_per_list_response' as a positive integer")?
|
||||
.or(DEFAULT_MAX_KEYS_PER_LIST_RESPONSE);
|
||||
|
||||
let storage = match (local_path, bucket_name, bucket_region) {
|
||||
// no 'local_path' nor 'bucket_name' options are provided, consider this remote storage disabled
|
||||
(None, None, None) => return Ok(None),
|
||||
@@ -324,6 +341,7 @@ impl RemoteStorageConfig {
|
||||
.map(|endpoint| parse_toml_string("endpoint", endpoint))
|
||||
.transpose()?,
|
||||
concurrency_limit,
|
||||
max_keys_per_list_response,
|
||||
}),
|
||||
(Some(local_path), None, None) => RemoteStorageKind::LocalFs(PathBuf::from(
|
||||
parse_toml_string("local_path", local_path)?,
|
||||
|
||||
@@ -102,6 +102,7 @@ pub struct S3Bucket {
|
||||
client: Client,
|
||||
bucket_name: String,
|
||||
prefix_in_bucket: Option<String>,
|
||||
max_keys_per_list_response: Option<i32>,
|
||||
// Every request to S3 can be throttled or cancelled, if a certain number of requests per second is exceeded.
|
||||
// Same goes to IAM, which is queried before every S3 request, if enabled. IAM has even lower RPS threshold.
|
||||
// The helps to ensure we don't exceed the thresholds.
|
||||
@@ -164,6 +165,7 @@ impl S3Bucket {
|
||||
Ok(Self {
|
||||
client,
|
||||
bucket_name: aws_config.bucket_name.clone(),
|
||||
max_keys_per_list_response: aws_config.max_keys_per_list_response,
|
||||
prefix_in_bucket,
|
||||
concurrency_limiter: Arc::new(Semaphore::new(aws_config.concurrency_limit.get())),
|
||||
})
|
||||
@@ -291,7 +293,9 @@ impl RemoteStorage for S3Bucket {
|
||||
.list_objects_v2()
|
||||
.bucket(self.bucket_name.clone())
|
||||
.set_prefix(self.prefix_in_bucket.clone())
|
||||
.delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string())
|
||||
.set_continuation_token(continuation_token)
|
||||
.set_max_keys(self.max_keys_per_list_response)
|
||||
.send()
|
||||
.await
|
||||
.map_err(|e| {
|
||||
@@ -306,7 +310,7 @@ impl RemoteStorage for S3Bucket {
|
||||
.filter_map(|o| Some(self.s3_object_to_relative_path(o.key()?))),
|
||||
);
|
||||
|
||||
match fetch_response.continuation_token {
|
||||
match fetch_response.next_continuation_token {
|
||||
Some(new_token) => continuation_token = Some(new_token),
|
||||
None => break,
|
||||
}
|
||||
@@ -354,6 +358,7 @@ impl RemoteStorage for S3Bucket {
|
||||
.set_prefix(list_prefix.clone())
|
||||
.set_continuation_token(continuation_token)
|
||||
.delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string())
|
||||
.set_max_keys(self.max_keys_per_list_response)
|
||||
.send()
|
||||
.await
|
||||
.map_err(|e| {
|
||||
@@ -371,7 +376,7 @@ impl RemoteStorage for S3Bucket {
|
||||
.filter_map(|o| Some(self.s3_object_to_relative_path(o.prefix()?))),
|
||||
);
|
||||
|
||||
match fetch_response.continuation_token {
|
||||
match fetch_response.next_continuation_token {
|
||||
Some(new_token) => continuation_token = Some(new_token),
|
||||
None => break,
|
||||
}
|
||||
|
||||
275
libs/remote_storage/tests/pagination_tests.rs
Normal file
275
libs/remote_storage/tests/pagination_tests.rs
Normal file
@@ -0,0 +1,275 @@
|
||||
use std::collections::HashSet;
|
||||
use std::env;
|
||||
use std::num::{NonZeroU32, NonZeroUsize};
|
||||
use std::ops::ControlFlow;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::Arc;
|
||||
use std::time::UNIX_EPOCH;
|
||||
|
||||
use anyhow::Context;
|
||||
use remote_storage::{
|
||||
GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, S3Config,
|
||||
};
|
||||
use test_context::{test_context, AsyncTestContext};
|
||||
use tokio::task::JoinSet;
|
||||
use tracing::{debug, error, info};
|
||||
|
||||
const ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_S3_REMOTE_STORAGE";
|
||||
|
||||
/// Tests that S3 client can list all prefixes, even if the response come paginated and requires multiple S3 queries.
|
||||
/// Uses real S3 and requires [`ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME`] and related S3 cred env vars specified.
|
||||
/// See the client creation in [`create_s3_client`] for details on the required env vars.
|
||||
/// If real S3 tests are disabled, the test passes, skipping any real test run: currently, there's no way to mark the test ignored in runtime with the
|
||||
/// deafult test framework, see https://github.com/rust-lang/rust/issues/68007 for details.
|
||||
///
|
||||
/// First, the test creates a set of S3 objects with keys `/${random_prefix_part}/${base_prefix_str}/sub_prefix_${i}/blob_${i}` in [`upload_s3_data`]
|
||||
/// where
|
||||
/// * `random_prefix_part` is set for the entire S3 client during the S3 client creation in [`create_s3_client`], to avoid multiple test runs interference
|
||||
/// * `base_prefix_str` is a common prefix to use in the client requests: we would want to ensure that the client is able to list nested prefixes inside the bucket
|
||||
///
|
||||
/// Then, verifies that the client does return correct prefixes when queried:
|
||||
/// * with no prefix, it lists everything after its `${random_prefix_part}/` — that should be `${base_prefix_str}` value only
|
||||
/// * with `${base_prefix_str}/` prefix, it lists every `sub_prefix_${i}`
|
||||
///
|
||||
/// With the real S3 enabled and `#[cfg(test)]` Rust configuration used, the S3 client test adds a `max-keys` param to limit the response keys.
|
||||
/// This way, we are able to test the pagination implicitly, by ensuring all results are returned from the remote storage and avoid uploading too many blobs to S3,
|
||||
/// since current default AWS S3 pagination limit is 1000.
|
||||
/// (see https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax)
|
||||
///
|
||||
/// Lastly, the test attempts to clean up and remove all uploaded S3 files.
|
||||
/// If any errors appear during the clean up, they get logged, but the test is not failed or stopped until clean up is finished.
|
||||
#[test_context(MaybeEnabledS3)]
|
||||
#[tokio::test]
|
||||
async fn s3_pagination_should_work(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()> {
|
||||
let ctx = match ctx {
|
||||
MaybeEnabledS3::Enabled(ctx) => ctx,
|
||||
MaybeEnabledS3::Disabled => return Ok(()),
|
||||
MaybeEnabledS3::UploadsFailed(e, _) => anyhow::bail!("S3 init failed: {e:?}"),
|
||||
};
|
||||
|
||||
let test_client = Arc::clone(&ctx.client_with_excessive_pagination);
|
||||
let expected_remote_prefixes = ctx.remote_prefixes.clone();
|
||||
|
||||
let base_prefix =
|
||||
RemotePath::new(Path::new(ctx.base_prefix_str)).context("common_prefix construction")?;
|
||||
let root_remote_prefixes = test_client
|
||||
.list_prefixes(None)
|
||||
.await
|
||||
.context("client list root prefixes failure")?
|
||||
.into_iter()
|
||||
.collect::<HashSet<_>>();
|
||||
assert_eq!(
|
||||
root_remote_prefixes, HashSet::from([base_prefix.clone()]),
|
||||
"remote storage root prefixes list mismatches with the uploads. Returned prefixes: {root_remote_prefixes:?}"
|
||||
);
|
||||
|
||||
let nested_remote_prefixes = test_client
|
||||
.list_prefixes(Some(&base_prefix))
|
||||
.await
|
||||
.context("client list nested prefixes failure")?
|
||||
.into_iter()
|
||||
.collect::<HashSet<_>>();
|
||||
let remote_only_prefixes = nested_remote_prefixes
|
||||
.difference(&expected_remote_prefixes)
|
||||
.collect::<HashSet<_>>();
|
||||
let missing_uploaded_prefixes = expected_remote_prefixes
|
||||
.difference(&nested_remote_prefixes)
|
||||
.collect::<HashSet<_>>();
|
||||
assert_eq!(
|
||||
remote_only_prefixes.len() + missing_uploaded_prefixes.len(), 0,
|
||||
"remote storage nested prefixes list mismatches with the uploads. Remote only prefixes: {remote_only_prefixes:?}, missing uploaded prefixes: {missing_uploaded_prefixes:?}",
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
enum MaybeEnabledS3 {
|
||||
Enabled(S3WithTestBlobs),
|
||||
Disabled,
|
||||
UploadsFailed(anyhow::Error, S3WithTestBlobs),
|
||||
}
|
||||
|
||||
struct S3WithTestBlobs {
|
||||
client_with_excessive_pagination: Arc<GenericRemoteStorage>,
|
||||
base_prefix_str: &'static str,
|
||||
remote_prefixes: HashSet<RemotePath>,
|
||||
remote_blobs: HashSet<RemotePath>,
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl AsyncTestContext for MaybeEnabledS3 {
|
||||
async fn setup() -> Self {
|
||||
utils::logging::init(utils::logging::LogFormat::Test).expect("logging init failed");
|
||||
if env::var(ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME).is_err() {
|
||||
info!(
|
||||
"`{}` env variable is not set, skipping the test",
|
||||
ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME
|
||||
);
|
||||
return Self::Disabled;
|
||||
}
|
||||
|
||||
let max_keys_in_list_response = 10;
|
||||
let upload_tasks_count = 1 + (2 * usize::try_from(max_keys_in_list_response).unwrap());
|
||||
|
||||
let client_with_excessive_pagination = create_s3_client(max_keys_in_list_response)
|
||||
.context("S3 client creation")
|
||||
.expect("S3 client creation failed");
|
||||
|
||||
let base_prefix_str = "test/";
|
||||
match upload_s3_data(
|
||||
&client_with_excessive_pagination,
|
||||
base_prefix_str,
|
||||
upload_tasks_count,
|
||||
)
|
||||
.await
|
||||
{
|
||||
ControlFlow::Continue(uploads) => {
|
||||
info!("Remote objects created successfully");
|
||||
Self::Enabled(S3WithTestBlobs {
|
||||
client_with_excessive_pagination,
|
||||
base_prefix_str,
|
||||
remote_prefixes: uploads.prefixes,
|
||||
remote_blobs: uploads.blobs,
|
||||
})
|
||||
}
|
||||
ControlFlow::Break(uploads) => Self::UploadsFailed(
|
||||
anyhow::anyhow!("One or multiple blobs failed to upload to S3"),
|
||||
S3WithTestBlobs {
|
||||
client_with_excessive_pagination,
|
||||
base_prefix_str,
|
||||
remote_prefixes: uploads.prefixes,
|
||||
remote_blobs: uploads.blobs,
|
||||
},
|
||||
),
|
||||
}
|
||||
}
|
||||
|
||||
async fn teardown(self) {
|
||||
match self {
|
||||
Self::Disabled => {}
|
||||
Self::Enabled(ctx) | Self::UploadsFailed(_, ctx) => {
|
||||
cleanup(&ctx.client_with_excessive_pagination, ctx.remote_blobs).await;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn create_s3_client(max_keys_per_list_response: i32) -> anyhow::Result<Arc<GenericRemoteStorage>> {
|
||||
let remote_storage_s3_bucket = env::var("REMOTE_STORAGE_S3_BUCKET")
|
||||
.context("`REMOTE_STORAGE_S3_BUCKET` env var is not set, but real S3 tests are enabled")?;
|
||||
let remote_storage_s3_region = env::var("REMOTE_STORAGE_S3_REGION")
|
||||
.context("`REMOTE_STORAGE_S3_REGION` env var is not set, but real S3 tests are enabled")?;
|
||||
let random_prefix_part = std::time::SystemTime::now()
|
||||
.duration_since(UNIX_EPOCH)
|
||||
.context("random s3 test prefix part calculation")?
|
||||
.as_millis();
|
||||
let remote_storage_config = RemoteStorageConfig {
|
||||
max_concurrent_syncs: NonZeroUsize::new(100).unwrap(),
|
||||
max_sync_errors: NonZeroU32::new(5).unwrap(),
|
||||
storage: RemoteStorageKind::AwsS3(S3Config {
|
||||
bucket_name: remote_storage_s3_bucket,
|
||||
bucket_region: remote_storage_s3_region,
|
||||
prefix_in_bucket: Some(format!("pagination_should_work_test_{random_prefix_part}/")),
|
||||
endpoint: None,
|
||||
concurrency_limit: NonZeroUsize::new(100).unwrap(),
|
||||
max_keys_per_list_response: Some(max_keys_per_list_response),
|
||||
}),
|
||||
};
|
||||
Ok(Arc::new(
|
||||
GenericRemoteStorage::from_config(&remote_storage_config).context("remote storage init")?,
|
||||
))
|
||||
}
|
||||
|
||||
struct Uploads {
|
||||
prefixes: HashSet<RemotePath>,
|
||||
blobs: HashSet<RemotePath>,
|
||||
}
|
||||
|
||||
async fn upload_s3_data(
|
||||
client: &Arc<GenericRemoteStorage>,
|
||||
base_prefix_str: &'static str,
|
||||
upload_tasks_count: usize,
|
||||
) -> ControlFlow<Uploads, Uploads> {
|
||||
info!("Creating {upload_tasks_count} S3 files");
|
||||
let mut upload_tasks = JoinSet::new();
|
||||
for i in 1..upload_tasks_count + 1 {
|
||||
let task_client = Arc::clone(client);
|
||||
upload_tasks.spawn(async move {
|
||||
let prefix = PathBuf::from(format!("{base_prefix_str}/sub_prefix_{i}/"));
|
||||
let blob_prefix = RemotePath::new(&prefix)
|
||||
.with_context(|| format!("{prefix:?} to RemotePath conversion"))?;
|
||||
let blob_path = blob_prefix.join(Path::new(&format!("blob_{i}")));
|
||||
debug!("Creating remote item {i} at path {blob_path:?}");
|
||||
|
||||
let data = format!("remote blob data {i}").into_bytes();
|
||||
let data_len = data.len();
|
||||
task_client
|
||||
.upload(
|
||||
Box::new(std::io::Cursor::new(data)),
|
||||
data_len,
|
||||
&blob_path,
|
||||
None,
|
||||
)
|
||||
.await?;
|
||||
|
||||
Ok::<_, anyhow::Error>((blob_prefix, blob_path))
|
||||
});
|
||||
}
|
||||
|
||||
let mut upload_tasks_failed = false;
|
||||
let mut uploaded_prefixes = HashSet::with_capacity(upload_tasks_count);
|
||||
let mut uploaded_blobs = HashSet::with_capacity(upload_tasks_count);
|
||||
while let Some(task_run_result) = upload_tasks.join_next().await {
|
||||
match task_run_result
|
||||
.context("task join failed")
|
||||
.and_then(|task_result| task_result.context("upload task failed"))
|
||||
{
|
||||
Ok((upload_prefix, upload_path)) => {
|
||||
uploaded_prefixes.insert(upload_prefix);
|
||||
uploaded_blobs.insert(upload_path);
|
||||
}
|
||||
Err(e) => {
|
||||
error!("Upload task failed: {e:?}");
|
||||
upload_tasks_failed = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let uploads = Uploads {
|
||||
prefixes: uploaded_prefixes,
|
||||
blobs: uploaded_blobs,
|
||||
};
|
||||
if upload_tasks_failed {
|
||||
ControlFlow::Break(uploads)
|
||||
} else {
|
||||
ControlFlow::Continue(uploads)
|
||||
}
|
||||
}
|
||||
|
||||
async fn cleanup(client: &Arc<GenericRemoteStorage>, objects_to_delete: HashSet<RemotePath>) {
|
||||
info!(
|
||||
"Removing {} objects from the remote storage during cleanup",
|
||||
objects_to_delete.len()
|
||||
);
|
||||
let mut delete_tasks = JoinSet::new();
|
||||
for object_to_delete in objects_to_delete {
|
||||
let task_client = Arc::clone(client);
|
||||
delete_tasks.spawn(async move {
|
||||
debug!("Deleting remote item at path {object_to_delete:?}");
|
||||
task_client
|
||||
.delete(&object_to_delete)
|
||||
.await
|
||||
.with_context(|| format!("{object_to_delete:?} removal"))
|
||||
});
|
||||
}
|
||||
|
||||
while let Some(task_run_result) = delete_tasks.join_next().await {
|
||||
match task_run_result {
|
||||
Ok(task_result) => match task_result {
|
||||
Ok(()) => {}
|
||||
Err(e) => error!("Delete task failed: {e:?}"),
|
||||
},
|
||||
Err(join_err) => error!("Delete task did not finish correctly: {join_err}"),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -20,6 +20,9 @@ pub enum ApiError {
|
||||
#[error("Conflict: {0}")]
|
||||
Conflict(String),
|
||||
|
||||
#[error("Precondition failed: {0}")]
|
||||
PreconditionFailed(&'static str),
|
||||
|
||||
#[error(transparent)]
|
||||
InternalServerError(anyhow::Error),
|
||||
}
|
||||
@@ -44,6 +47,10 @@ impl ApiError {
|
||||
ApiError::Conflict(_) => {
|
||||
HttpErrorBody::response_from_msg_and_status(self.to_string(), StatusCode::CONFLICT)
|
||||
}
|
||||
ApiError::PreconditionFailed(_) => HttpErrorBody::response_from_msg_and_status(
|
||||
self.to_string(),
|
||||
StatusCode::PRECONDITION_FAILED,
|
||||
),
|
||||
ApiError::InternalServerError(err) => HttpErrorBody::response_from_msg_and_status(
|
||||
err.to_string(),
|
||||
StatusCode::INTERNAL_SERVER_ERROR,
|
||||
|
||||
@@ -1,25 +1,7 @@
|
||||
use signal_hook::flag;
|
||||
use signal_hook::iterator::Signals;
|
||||
use std::sync::atomic::AtomicBool;
|
||||
use std::sync::Arc;
|
||||
|
||||
pub use signal_hook::consts::{signal::*, TERM_SIGNALS};
|
||||
|
||||
pub fn install_shutdown_handlers() -> anyhow::Result<ShutdownSignals> {
|
||||
let term_now = Arc::new(AtomicBool::new(false));
|
||||
for sig in TERM_SIGNALS {
|
||||
// When terminated by a second term signal, exit with exit code 1.
|
||||
// This will do nothing the first time (because term_now is false).
|
||||
flag::register_conditional_shutdown(*sig, 1, Arc::clone(&term_now))?;
|
||||
// But this will "arm" the above for the second time, by setting it to true.
|
||||
// The order of registering these is important, if you put this one first, it will
|
||||
// first arm and then terminate ‒ all in the first round.
|
||||
flag::register(*sig, Arc::clone(&term_now))?;
|
||||
}
|
||||
|
||||
Ok(ShutdownSignals)
|
||||
}
|
||||
|
||||
pub enum Signal {
|
||||
Quit,
|
||||
Interrupt,
|
||||
@@ -39,10 +21,7 @@ impl Signal {
|
||||
pub struct ShutdownSignals;
|
||||
|
||||
impl ShutdownSignals {
|
||||
pub fn handle(
|
||||
self,
|
||||
mut handler: impl FnMut(Signal) -> anyhow::Result<()>,
|
||||
) -> anyhow::Result<()> {
|
||||
pub fn handle(mut handler: impl FnMut(Signal) -> anyhow::Result<()>) -> anyhow::Result<()> {
|
||||
for raw_signal in Signals::new(TERM_SIGNALS)?.into_iter() {
|
||||
let signal = match raw_signal {
|
||||
SIGINT => Signal::Interrupt,
|
||||
|
||||
@@ -25,11 +25,9 @@ use pageserver::{
|
||||
virtual_file,
|
||||
};
|
||||
use postgres_backend::AuthType;
|
||||
use utils::signals::ShutdownSignals;
|
||||
use utils::{
|
||||
auth::JwtAuth,
|
||||
logging, project_git_version,
|
||||
sentry_init::init_sentry,
|
||||
signals::{self, Signal},
|
||||
auth::JwtAuth, logging, project_git_version, sentry_init::init_sentry, signals::Signal,
|
||||
tcp_listener,
|
||||
};
|
||||
|
||||
@@ -264,9 +262,6 @@ fn start_pageserver(
|
||||
info!("Starting pageserver pg protocol handler on {pg_addr}");
|
||||
let pageserver_listener = tcp_listener::bind(pg_addr)?;
|
||||
|
||||
// Install signal handlers
|
||||
let signals = signals::install_shutdown_handlers()?;
|
||||
|
||||
// Launch broker client
|
||||
WALRECEIVER_RUNTIME.block_on(pageserver::broker_client::init_broker_client(conf))?;
|
||||
|
||||
@@ -430,7 +425,7 @@ fn start_pageserver(
|
||||
}
|
||||
|
||||
// All started up! Now just sit and wait for shutdown signal.
|
||||
signals.handle(|signal| match signal {
|
||||
ShutdownSignals::handle(|signal| match signal {
|
||||
Signal::Quit => {
|
||||
info!(
|
||||
"Got {}. Terminating in immediate shutdown mode",
|
||||
|
||||
@@ -170,6 +170,10 @@ pub struct PageServerConf {
|
||||
|
||||
/// Number of concurrent [`Tenant::gather_size_inputs`] allowed.
|
||||
pub concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore,
|
||||
/// Limit of concurrent [`Tenant::gather_size_inputs`] issued by module `eviction_task`.
|
||||
/// The number of permits is the same as `concurrent_tenant_size_logical_size_queries`.
|
||||
/// See the comment in `eviction_task` for details.
|
||||
pub eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore,
|
||||
|
||||
// How often to collect metrics and send them to the metrics endpoint.
|
||||
pub metric_collection_interval: Duration,
|
||||
@@ -246,7 +250,7 @@ struct PageServerConfigBuilder {
|
||||
|
||||
log_format: BuilderValue<LogFormat>,
|
||||
|
||||
concurrent_tenant_size_logical_size_queries: BuilderValue<ConfigurableSemaphore>,
|
||||
concurrent_tenant_size_logical_size_queries: BuilderValue<NonZeroUsize>,
|
||||
|
||||
metric_collection_interval: BuilderValue<Duration>,
|
||||
cached_metric_collection_interval: BuilderValue<Duration>,
|
||||
@@ -295,7 +299,9 @@ impl Default for PageServerConfigBuilder {
|
||||
.expect("cannot parse default keepalive interval")),
|
||||
log_format: Set(LogFormat::from_str(DEFAULT_LOG_FORMAT).unwrap()),
|
||||
|
||||
concurrent_tenant_size_logical_size_queries: Set(ConfigurableSemaphore::default()),
|
||||
concurrent_tenant_size_logical_size_queries: Set(
|
||||
ConfigurableSemaphore::DEFAULT_INITIAL,
|
||||
),
|
||||
metric_collection_interval: Set(humantime::parse_duration(
|
||||
DEFAULT_METRIC_COLLECTION_INTERVAL,
|
||||
)
|
||||
@@ -400,7 +406,7 @@ impl PageServerConfigBuilder {
|
||||
self.log_format = BuilderValue::Set(log_format)
|
||||
}
|
||||
|
||||
pub fn concurrent_tenant_size_logical_size_queries(&mut self, u: ConfigurableSemaphore) {
|
||||
pub fn concurrent_tenant_size_logical_size_queries(&mut self, u: NonZeroUsize) {
|
||||
self.concurrent_tenant_size_logical_size_queries = BuilderValue::Set(u);
|
||||
}
|
||||
|
||||
@@ -449,6 +455,11 @@ impl PageServerConfigBuilder {
|
||||
}
|
||||
|
||||
pub fn build(self) -> anyhow::Result<PageServerConf> {
|
||||
let concurrent_tenant_size_logical_size_queries = self
|
||||
.concurrent_tenant_size_logical_size_queries
|
||||
.ok_or(anyhow!(
|
||||
"missing concurrent_tenant_size_logical_size_queries"
|
||||
))?;
|
||||
Ok(PageServerConf {
|
||||
listen_pg_addr: self
|
||||
.listen_pg_addr
|
||||
@@ -496,11 +507,12 @@ impl PageServerConfigBuilder {
|
||||
.broker_keepalive_interval
|
||||
.ok_or(anyhow!("No broker keepalive interval provided"))?,
|
||||
log_format: self.log_format.ok_or(anyhow!("missing log_format"))?,
|
||||
concurrent_tenant_size_logical_size_queries: self
|
||||
.concurrent_tenant_size_logical_size_queries
|
||||
.ok_or(anyhow!(
|
||||
"missing concurrent_tenant_size_logical_size_queries"
|
||||
))?,
|
||||
concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::new(
|
||||
concurrent_tenant_size_logical_size_queries,
|
||||
),
|
||||
eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore::new(
|
||||
concurrent_tenant_size_logical_size_queries,
|
||||
),
|
||||
metric_collection_interval: self
|
||||
.metric_collection_interval
|
||||
.ok_or(anyhow!("missing metric_collection_interval"))?,
|
||||
@@ -698,8 +710,7 @@ impl PageServerConf {
|
||||
"concurrent_tenant_size_logical_size_queries" => builder.concurrent_tenant_size_logical_size_queries({
|
||||
let input = parse_toml_string(key, item)?;
|
||||
let permits = input.parse::<usize>().context("expected a number of initial permits, not {s:?}")?;
|
||||
let permits = NonZeroUsize::new(permits).context("initial semaphore permits out of range: 0, use other configuration to disable a feature")?;
|
||||
ConfigurableSemaphore::new(permits)
|
||||
NonZeroUsize::new(permits).context("initial semaphore permits out of range: 0, use other configuration to disable a feature")?
|
||||
}),
|
||||
"metric_collection_interval" => builder.metric_collection_interval(parse_toml_duration(key, item)?),
|
||||
"cached_metric_collection_interval" => builder.cached_metric_collection_interval(parse_toml_duration(key, item)?),
|
||||
@@ -860,6 +871,8 @@ impl PageServerConf {
|
||||
broker_keepalive_interval: Duration::from_secs(5000),
|
||||
log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(),
|
||||
concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
|
||||
eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore::default(
|
||||
),
|
||||
metric_collection_interval: Duration::from_secs(60),
|
||||
cached_metric_collection_interval: Duration::from_secs(60 * 60),
|
||||
metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT,
|
||||
@@ -953,6 +966,11 @@ impl ConfigurableSemaphore {
|
||||
inner: std::sync::Arc::new(tokio::sync::Semaphore::new(initial_permits.get())),
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the configured amount of permits.
|
||||
pub fn initial_permits(&self) -> NonZeroUsize {
|
||||
self.initial_permits
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for ConfigurableSemaphore {
|
||||
@@ -1057,6 +1075,8 @@ log_format = 'json'
|
||||
)?,
|
||||
log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(),
|
||||
concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
|
||||
eviction_task_immitated_concurrent_logical_size_queries:
|
||||
ConfigurableSemaphore::default(),
|
||||
metric_collection_interval: humantime::parse_duration(
|
||||
defaults::DEFAULT_METRIC_COLLECTION_INTERVAL
|
||||
)?,
|
||||
@@ -1118,6 +1138,8 @@ log_format = 'json'
|
||||
broker_keepalive_interval: Duration::from_secs(5),
|
||||
log_format: LogFormat::Json,
|
||||
concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
|
||||
eviction_task_immitated_concurrent_logical_size_queries:
|
||||
ConfigurableSemaphore::default(),
|
||||
metric_collection_interval: Duration::from_secs(222),
|
||||
cached_metric_collection_interval: Duration::from_secs(22200),
|
||||
metric_collection_endpoint: Some(Url::parse("http://localhost:80/metrics")?),
|
||||
@@ -1250,6 +1272,7 @@ broker_endpoint = '{broker_endpoint}'
|
||||
prefix_in_bucket: Some(prefix_in_bucket.clone()),
|
||||
endpoint: Some(endpoint.clone()),
|
||||
concurrency_limit: s3_concurrency_limit,
|
||||
max_keys_per_list_response: None,
|
||||
}),
|
||||
},
|
||||
"Remote storage config should correctly parse the S3 config"
|
||||
|
||||
@@ -22,7 +22,8 @@
|
||||
//! If the actual usage is higher, the threshold is exceeded.
|
||||
//! `min_avail_bytes` is the absolute available space in bytes.
|
||||
//! If the actual usage is lower, the threshold is exceeded.
|
||||
//!
|
||||
//! If either of these thresholds is exceeded, the system is considered to have "disk pressure", and eviction
|
||||
//! is performed on the next iteration, to release disk space and bring the usage below the thresholds again.
|
||||
//! The iteration evicts layers in LRU fashion, but, with a weak reservation per tenant.
|
||||
//! The reservation is to keep the most recently accessed X bytes per tenant resident.
|
||||
//! If we cannot relieve pressure by evicting layers outside of the reservation, we
|
||||
@@ -34,7 +35,11 @@
|
||||
//! The idea is to allow at least one layer to be resident per tenant, to ensure it can make forward progress
|
||||
//! during page reconstruction.
|
||||
//! An alternative default for all tenants can be specified in the `tenant_config` section of the config.
|
||||
//! Lastly, each tenant can have an override in their respectice tenant config (`min_resident_size_override`).
|
||||
//! Lastly, each tenant can have an override in their respective tenant config (`min_resident_size_override`).
|
||||
|
||||
// Implementation notes:
|
||||
// - The `#[allow(dead_code)]` above various structs are to suppress warnings about only the Debug impl
|
||||
// reading these fields. We use the Debug impl for semi-structured logging, though.
|
||||
|
||||
use std::{
|
||||
collections::HashMap,
|
||||
@@ -224,6 +229,7 @@ pub enum IterationOutcome<U> {
|
||||
Finished(IterationOutcomeFinished<U>),
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
#[derive(Debug, Serialize)]
|
||||
pub struct IterationOutcomeFinished<U> {
|
||||
/// The actual usage observed before we started the iteration.
|
||||
@@ -238,6 +244,7 @@ pub struct IterationOutcomeFinished<U> {
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize)]
|
||||
#[allow(dead_code)]
|
||||
struct AssumedUsage<U> {
|
||||
/// The expected value for `after`, after phase 2.
|
||||
projected_after: U,
|
||||
@@ -245,12 +252,14 @@ struct AssumedUsage<U> {
|
||||
failed: LayerCount,
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
#[derive(Debug, Serialize)]
|
||||
struct PlannedUsage<U> {
|
||||
respecting_tenant_min_resident_size: U,
|
||||
fallback_to_global_lru: Option<U>,
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
#[derive(Debug, Default, Serialize)]
|
||||
struct LayerCount {
|
||||
file_sizes: u64,
|
||||
@@ -608,6 +617,7 @@ mod filesystem_level_usage {
|
||||
use super::DiskUsageEvictionTaskConfig;
|
||||
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
#[allow(dead_code)]
|
||||
pub struct Usage<'a> {
|
||||
config: &'a DiskUsageEvictionTaskConfig,
|
||||
|
||||
|
||||
@@ -214,6 +214,13 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/NotFoundError"
|
||||
"412":
|
||||
description: Tenant is missing
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/PreconditionFailedError"
|
||||
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
@@ -891,13 +898,9 @@ components:
|
||||
type: object
|
||||
properties:
|
||||
tenant_specific_overrides:
|
||||
type: object
|
||||
schema:
|
||||
$ref: "#/components/schemas/TenantConfigInfo"
|
||||
$ref: "#/components/schemas/TenantConfigInfo"
|
||||
effective_config:
|
||||
type: object
|
||||
schema:
|
||||
$ref: "#/components/schemas/TenantConfigInfo"
|
||||
$ref: "#/components/schemas/TenantConfigInfo"
|
||||
TimelineInfo:
|
||||
type: object
|
||||
required:
|
||||
@@ -983,6 +986,13 @@ components:
|
||||
properties:
|
||||
msg:
|
||||
type: string
|
||||
PreconditionFailedError:
|
||||
type: object
|
||||
required:
|
||||
- msg
|
||||
properties:
|
||||
msg:
|
||||
type: string
|
||||
|
||||
security:
|
||||
- JWT: []
|
||||
|
||||
@@ -152,6 +152,11 @@ impl From<crate::tenant::mgr::DeleteTimelineError> for ApiError {
|
||||
fn from(value: crate::tenant::mgr::DeleteTimelineError) -> Self {
|
||||
use crate::tenant::mgr::DeleteTimelineError::*;
|
||||
match value {
|
||||
// Report Precondition failed so client can distinguish between
|
||||
// "tenant is missing" case from "timeline is missing"
|
||||
Tenant(TenantStateError::NotFound(..)) => {
|
||||
ApiError::PreconditionFailed("Requested tenant is missing")
|
||||
}
|
||||
Tenant(t) => ApiError::from(t),
|
||||
Timeline(t) => ApiError::from(t),
|
||||
}
|
||||
|
||||
@@ -257,7 +257,7 @@ impl EvictionsWithLowResidenceDuration {
|
||||
}
|
||||
|
||||
pub fn observe(&self, observed_value: Duration) {
|
||||
if self.threshold < observed_value {
|
||||
if observed_value < self.threshold {
|
||||
self.counter
|
||||
.as_ref()
|
||||
.expect("nobody calls this function after `remove_from_vec`")
|
||||
|
||||
@@ -27,6 +27,7 @@ use pq_proto::FeStartupPacket;
|
||||
use pq_proto::{BeMessage, FeMessage, RowDescriptor};
|
||||
use std::io;
|
||||
use std::net::TcpListener;
|
||||
use std::pin::pin;
|
||||
use std::str;
|
||||
use std::str::FromStr;
|
||||
use std::sync::Arc;
|
||||
@@ -466,8 +467,7 @@ impl PageServerHandler {
|
||||
pgb.write_message_noflush(&BeMessage::CopyInResponse)?;
|
||||
pgb.flush().await?;
|
||||
|
||||
let copyin_reader = StreamReader::new(copyin_stream(pgb));
|
||||
tokio::pin!(copyin_reader);
|
||||
let mut copyin_reader = pin!(StreamReader::new(copyin_stream(pgb)));
|
||||
timeline
|
||||
.import_basebackup_from_tar(&mut copyin_reader, base_lsn, &ctx)
|
||||
.await?;
|
||||
@@ -512,8 +512,7 @@ impl PageServerHandler {
|
||||
info!("importing wal");
|
||||
pgb.write_message_noflush(&BeMessage::CopyInResponse)?;
|
||||
pgb.flush().await?;
|
||||
let copyin_reader = StreamReader::new(copyin_stream(pgb));
|
||||
tokio::pin!(copyin_reader);
|
||||
let mut copyin_reader = pin!(StreamReader::new(copyin_stream(pgb)));
|
||||
import_wal_from_tar(&timeline, &mut copyin_reader, start_lsn, end_lsn, &ctx).await?;
|
||||
info!("wal import complete");
|
||||
|
||||
|
||||
@@ -46,6 +46,7 @@ use std::time::{Duration, Instant};
|
||||
use self::config::TenantConf;
|
||||
use self::metadata::TimelineMetadata;
|
||||
use self::remote_timeline_client::RemoteTimelineClient;
|
||||
use self::timeline::EvictionTaskTenantState;
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::{DownloadBehavior, RequestContext};
|
||||
use crate::import_datadir;
|
||||
@@ -142,6 +143,8 @@ pub struct Tenant {
|
||||
/// Cached logical sizes updated updated on each [`Tenant::gather_size_inputs`].
|
||||
cached_logical_sizes: tokio::sync::Mutex<HashMap<(TimelineId, Lsn), u64>>,
|
||||
cached_synthetic_tenant_size: Arc<AtomicU64>,
|
||||
|
||||
eviction_task_tenant_state: tokio::sync::Mutex<EvictionTaskTenantState>,
|
||||
}
|
||||
|
||||
/// A timeline with some of its files on disk, being initialized.
|
||||
@@ -1788,6 +1791,7 @@ impl Tenant {
|
||||
state,
|
||||
cached_logical_sizes: tokio::sync::Mutex::new(HashMap::new()),
|
||||
cached_synthetic_tenant_size: Arc::new(AtomicU64::new(0)),
|
||||
eviction_task_tenant_state: tokio::sync::Mutex::new(EvictionTaskTenantState::default()),
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -6,6 +6,7 @@ use std::sync::Arc;
|
||||
use anyhow::{bail, Context};
|
||||
use tokio::sync::oneshot::error::RecvError;
|
||||
use tokio::sync::Semaphore;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
|
||||
use crate::context::RequestContext;
|
||||
use crate::pgdatadir_mapping::CalculateLogicalSizeError;
|
||||
@@ -352,6 +353,10 @@ async fn fill_logical_sizes(
|
||||
// our advantage with `?` error handling.
|
||||
let mut joinset = tokio::task::JoinSet::new();
|
||||
|
||||
let cancel = tokio_util::sync::CancellationToken::new();
|
||||
// be sure to cancel all spawned tasks if we are dropped
|
||||
let _dg = cancel.clone().drop_guard();
|
||||
|
||||
// For each point that would benefit from having a logical size available,
|
||||
// spawn a Task to fetch it, unless we have it cached already.
|
||||
for seg in segments.iter() {
|
||||
@@ -373,6 +378,7 @@ async fn fill_logical_sizes(
|
||||
timeline,
|
||||
lsn,
|
||||
ctx,
|
||||
cancel.child_token(),
|
||||
));
|
||||
}
|
||||
e.insert(cached_size);
|
||||
@@ -477,13 +483,14 @@ async fn calculate_logical_size(
|
||||
timeline: Arc<crate::tenant::Timeline>,
|
||||
lsn: utils::lsn::Lsn,
|
||||
ctx: RequestContext,
|
||||
cancel: CancellationToken,
|
||||
) -> Result<TimelineAtLsnSizeResult, RecvError> {
|
||||
let _permit = tokio::sync::Semaphore::acquire_owned(limit)
|
||||
.await
|
||||
.expect("global semaphore should not had been closed");
|
||||
|
||||
let size_res = timeline
|
||||
.spawn_ondemand_logical_size_calculation(lsn, ctx)
|
||||
.spawn_ondemand_logical_size_calculation(lsn, ctx, cancel)
|
||||
.instrument(info_span!("spawn_ondemand_logical_size_calculation"))
|
||||
.await?;
|
||||
Ok(TimelineAtLsnSizeResult(timeline, lsn, size_res))
|
||||
|
||||
@@ -25,6 +25,7 @@ use std::collections::HashMap;
|
||||
use std::fs;
|
||||
use std::ops::{Deref, Range};
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::pin::pin;
|
||||
use std::sync::atomic::{AtomicI64, Ordering as AtomicOrdering};
|
||||
use std::sync::{Arc, Mutex, MutexGuard, RwLock, Weak};
|
||||
use std::time::{Duration, Instant, SystemTime};
|
||||
@@ -72,6 +73,7 @@ use crate::ZERO_PAGE;
|
||||
use crate::{is_temporary, task_mgr};
|
||||
use walreceiver::spawn_connection_manager_task;
|
||||
|
||||
pub(super) use self::eviction_task::EvictionTaskTenantState;
|
||||
use self::eviction_task::EvictionTaskTimelineState;
|
||||
|
||||
use super::layer_map::BatchedUpdates;
|
||||
@@ -677,8 +679,7 @@ impl Timeline {
|
||||
|
||||
let mut failed = 0;
|
||||
|
||||
let cancelled = task_mgr::shutdown_watcher();
|
||||
tokio::pin!(cancelled);
|
||||
let mut cancelled = pin!(task_mgr::shutdown_watcher());
|
||||
|
||||
loop {
|
||||
tokio::select! {
|
||||
@@ -1126,6 +1127,9 @@ impl Timeline {
|
||||
self.metrics
|
||||
.evictions_with_low_residence_duration
|
||||
.observe(delta);
|
||||
info!(layer=%local_layer.short_id(), residence_millis=delta.as_millis(), "evicted layer after known residence period");
|
||||
} else {
|
||||
info!(layer=%local_layer.short_id(), "evicted layer after unknown residence period");
|
||||
}
|
||||
|
||||
true
|
||||
@@ -1768,8 +1772,11 @@ impl Timeline {
|
||||
false,
|
||||
// NB: don't log errors here, task_mgr will do that.
|
||||
async move {
|
||||
// no cancellation here, because nothing really waits for this to complete compared
|
||||
// to spawn_ondemand_logical_size_calculation.
|
||||
let cancel = CancellationToken::new();
|
||||
let calculated_size = match self_clone
|
||||
.logical_size_calculation_task(lsn, &background_ctx)
|
||||
.logical_size_calculation_task(lsn, &background_ctx, cancel)
|
||||
.await
|
||||
{
|
||||
Ok(s) => s,
|
||||
@@ -1824,6 +1831,7 @@ impl Timeline {
|
||||
self: &Arc<Self>,
|
||||
lsn: Lsn,
|
||||
ctx: RequestContext,
|
||||
cancel: CancellationToken,
|
||||
) -> oneshot::Receiver<Result<u64, CalculateLogicalSizeError>> {
|
||||
let (sender, receiver) = oneshot::channel();
|
||||
let self_clone = Arc::clone(self);
|
||||
@@ -1843,7 +1851,9 @@ impl Timeline {
|
||||
"ondemand logical size calculation",
|
||||
false,
|
||||
async move {
|
||||
let res = self_clone.logical_size_calculation_task(lsn, &ctx).await;
|
||||
let res = self_clone
|
||||
.logical_size_calculation_task(lsn, &ctx, cancel)
|
||||
.await;
|
||||
let _ = sender.send(res).ok();
|
||||
Ok(()) // Receiver is responsible for handling errors
|
||||
},
|
||||
@@ -1856,18 +1866,18 @@ impl Timeline {
|
||||
self: &Arc<Self>,
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
cancel: CancellationToken,
|
||||
) -> Result<u64, CalculateLogicalSizeError> {
|
||||
let mut timeline_state_updates = self.subscribe_for_state_updates();
|
||||
let self_calculation = Arc::clone(self);
|
||||
let cancel = CancellationToken::new();
|
||||
|
||||
let calculation = async {
|
||||
let mut calculation = pin!(async {
|
||||
let cancel = cancel.child_token();
|
||||
let ctx = ctx.attached_child();
|
||||
self_calculation
|
||||
.calculate_logical_size(lsn, cancel, &ctx)
|
||||
.await
|
||||
};
|
||||
});
|
||||
let timeline_state_cancellation = async {
|
||||
loop {
|
||||
match timeline_state_updates.changed().await {
|
||||
@@ -1896,7 +1906,6 @@ impl Timeline {
|
||||
"aborted because task_mgr shutdown requested".to_string()
|
||||
};
|
||||
|
||||
tokio::pin!(calculation);
|
||||
loop {
|
||||
tokio::select! {
|
||||
res = &mut calculation => { return res }
|
||||
|
||||
@@ -14,6 +14,7 @@
|
||||
//!
|
||||
//! See write-up on restart on-demand download spike: <https://gist.github.com/problame/2265bf7b8dc398be834abfead36c76b5>
|
||||
use std::{
|
||||
collections::HashMap,
|
||||
ops::ControlFlow,
|
||||
sync::Arc,
|
||||
time::{Duration, SystemTime},
|
||||
@@ -29,6 +30,7 @@ use crate::{
|
||||
tenant::{
|
||||
config::{EvictionPolicy, EvictionPolicyLayerAccessThreshold},
|
||||
storage_layer::PersistentLayer,
|
||||
Tenant,
|
||||
},
|
||||
};
|
||||
|
||||
@@ -36,7 +38,12 @@ use super::Timeline;
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct EvictionTaskTimelineState {
|
||||
last_refresh_required_in_restart: Option<tokio::time::Instant>,
|
||||
last_layer_access_imitation: Option<tokio::time::Instant>,
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct EvictionTaskTenantState {
|
||||
last_layer_access_imitation: Option<Instant>,
|
||||
}
|
||||
|
||||
impl Timeline {
|
||||
@@ -126,6 +133,35 @@ impl Timeline {
|
||||
) -> ControlFlow<()> {
|
||||
let now = SystemTime::now();
|
||||
|
||||
// If we evict layers but keep cached values derived from those layers, then
|
||||
// we face a storm of on-demand downloads after pageserver restart.
|
||||
// The reason is that the restart empties the caches, and so, the values
|
||||
// need to be re-computed by accessing layers, which we evicted while the
|
||||
// caches were filled.
|
||||
//
|
||||
// Solutions here would be one of the following:
|
||||
// 1. Have a persistent cache.
|
||||
// 2. Count every access to a cached value to the access stats of all layers
|
||||
// that were accessed to compute the value in the first place.
|
||||
// 3. Invalidate the caches at a period of < p.threshold/2, so that the values
|
||||
// get re-computed from layers, thereby counting towards layer access stats.
|
||||
// 4. Make the eviction task imitate the layer accesses that typically hit caches.
|
||||
//
|
||||
// We follow approach (4) here because in Neon prod deployment:
|
||||
// - page cache is quite small => high churn => low hit rate
|
||||
// => eviction gets correct access stats
|
||||
// - value-level caches such as logical size & repatition have a high hit rate,
|
||||
// especially for inactive tenants
|
||||
// => eviction sees zero accesses for these
|
||||
// => they cause the on-demand download storm on pageserver restart
|
||||
//
|
||||
// We should probably move to persistent caches in the future, or avoid
|
||||
// having inactive tenants attached to pageserver in the first place.
|
||||
match self.imitate_layer_accesses(p, cancel, ctx).await {
|
||||
ControlFlow::Break(()) => return ControlFlow::Break(()),
|
||||
ControlFlow::Continue(()) => (),
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
#[derive(Debug, Default)]
|
||||
struct EvictionStats {
|
||||
@@ -136,27 +172,6 @@ impl Timeline {
|
||||
skipped_for_shutdown: usize,
|
||||
}
|
||||
|
||||
// what we want is to invalidate any caches which haven't been accessed for `p.threshold`,
|
||||
// but we cannot actually do it for current limitations except by restarting pageserver. we
|
||||
// just recompute the values which would be recomputed on startup.
|
||||
//
|
||||
// for active tenants this will likely materialized page cache or in-memory layers. for
|
||||
// inactive tenants it will refresh the last_access timestamps so that we will not evict
|
||||
// and re-download on restart these layers.
|
||||
let mut state = self.eviction_task_timeline_state.lock().await;
|
||||
match state.last_refresh_required_in_restart {
|
||||
Some(ts) if ts.elapsed() < p.threshold => { /* no need to run */ }
|
||||
_ => {
|
||||
self.refresh_layers_required_in_restart(cancel, ctx).await;
|
||||
state.last_refresh_required_in_restart = Some(tokio::time::Instant::now())
|
||||
}
|
||||
}
|
||||
drop(state);
|
||||
|
||||
if cancel.is_cancelled() {
|
||||
return ControlFlow::Break(());
|
||||
}
|
||||
|
||||
let mut stats = EvictionStats::default();
|
||||
// Gather layers for eviction.
|
||||
// NB: all the checks can be invalidated as soon as we release the layer map lock.
|
||||
@@ -254,8 +269,55 @@ impl Timeline {
|
||||
ControlFlow::Continue(())
|
||||
}
|
||||
|
||||
async fn imitate_layer_accesses(
|
||||
&self,
|
||||
p: &EvictionPolicyLayerAccessThreshold,
|
||||
cancel: &CancellationToken,
|
||||
ctx: &RequestContext,
|
||||
) -> ControlFlow<()> {
|
||||
let mut state = self.eviction_task_timeline_state.lock().await;
|
||||
match state.last_layer_access_imitation {
|
||||
Some(ts) if ts.elapsed() < p.threshold => { /* no need to run */ }
|
||||
_ => {
|
||||
self.imitate_timeline_cached_layer_accesses(cancel, ctx)
|
||||
.await;
|
||||
state.last_layer_access_imitation = Some(tokio::time::Instant::now())
|
||||
}
|
||||
}
|
||||
drop(state);
|
||||
|
||||
if cancel.is_cancelled() {
|
||||
return ControlFlow::Break(());
|
||||
}
|
||||
|
||||
// This task is timeline-scoped, but the synthetic size calculation is tenant-scoped.
|
||||
// Make one of the tenant's timelines draw the short straw and run the calculation.
|
||||
// The others wait until the calculation is done so that they take into account the
|
||||
// imitated accesses that the winner made.
|
||||
let Ok(tenant) = crate::tenant::mgr::get_tenant(self.tenant_id, true).await else {
|
||||
// likely, we're shutting down
|
||||
return ControlFlow::Break(());
|
||||
};
|
||||
let mut state = tenant.eviction_task_tenant_state.lock().await;
|
||||
match state.last_layer_access_imitation {
|
||||
Some(ts) if ts.elapsed() < p.threshold => { /* no need to run */ }
|
||||
_ => {
|
||||
self.imitate_synthetic_size_calculation_worker(&tenant, ctx, cancel)
|
||||
.await;
|
||||
state.last_layer_access_imitation = Some(tokio::time::Instant::now());
|
||||
}
|
||||
}
|
||||
drop(state);
|
||||
|
||||
if cancel.is_cancelled() {
|
||||
return ControlFlow::Break(());
|
||||
}
|
||||
|
||||
ControlFlow::Continue(())
|
||||
}
|
||||
|
||||
/// Recompute the values which would cause on-demand downloads during restart.
|
||||
async fn refresh_layers_required_in_restart(
|
||||
async fn imitate_timeline_cached_layer_accesses(
|
||||
&self,
|
||||
cancel: &CancellationToken,
|
||||
ctx: &RequestContext,
|
||||
@@ -289,4 +351,61 @@ impl Timeline {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Imitate the synthetic size calculation done by the consumption_metrics module.
|
||||
async fn imitate_synthetic_size_calculation_worker(
|
||||
&self,
|
||||
tenant: &Arc<Tenant>,
|
||||
ctx: &RequestContext,
|
||||
cancel: &CancellationToken,
|
||||
) {
|
||||
if self.conf.metric_collection_endpoint.is_none() {
|
||||
// We don't start the consumption metrics task if this is not set in the config.
|
||||
// So, no need to imitate the accesses in that case.
|
||||
return;
|
||||
}
|
||||
|
||||
// The consumption metrics are collected on a per-tenant basis, by a single
|
||||
// global background loop.
|
||||
// It limits the number of synthetic size calculations using the global
|
||||
// `concurrent_tenant_size_logical_size_queries` semaphore to not overload
|
||||
// the pageserver. (size calculation is somewhat expensive in terms of CPU and IOs).
|
||||
//
|
||||
// If we used that same semaphore here, then we'd compete for the
|
||||
// same permits, which may impact timeliness of consumption metrics.
|
||||
// That is a no-go, as consumption metrics are much more important
|
||||
// than what we do here.
|
||||
//
|
||||
// So, we have a separate semaphore, initialized to the same
|
||||
// number of permits as the `concurrent_tenant_size_logical_size_queries`.
|
||||
// In the worst, we would have twice the amount of concurrenct size calculations.
|
||||
// But in practice, the `p.threshold` >> `consumption metric interval`, and
|
||||
// we spread out the eviction task using `random_init_delay`.
|
||||
// So, the chance of the worst case is quite low in practice.
|
||||
// It runs as a per-tenant task, but the eviction_task.rs is per-timeline.
|
||||
// So, we must coordinate with other with other eviction tasks of this tenant.
|
||||
let limit = self
|
||||
.conf
|
||||
.eviction_task_immitated_concurrent_logical_size_queries
|
||||
.inner();
|
||||
|
||||
let mut throwaway_cache = HashMap::new();
|
||||
let gather =
|
||||
crate::tenant::size::gather_inputs(tenant, limit, None, &mut throwaway_cache, ctx);
|
||||
|
||||
tokio::select! {
|
||||
_ = cancel.cancelled() => {}
|
||||
gather_result = gather => {
|
||||
match gather_result {
|
||||
Ok(_) => {},
|
||||
Err(e) => {
|
||||
// We don't care about the result, but, if it failed, we should log it,
|
||||
// since consumption metric might be hitting the cached value and
|
||||
// thus not encountering this error.
|
||||
warn!("failed to imitate synthetic size calculation accesses: {e:#}")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -237,11 +237,7 @@ async fn connection_manager_loop_step(
|
||||
if let Some(new_candidate) = walreceiver_state.next_connection_candidate() {
|
||||
info!("Switching to new connection candidate: {new_candidate:?}");
|
||||
walreceiver_state
|
||||
.change_connection(
|
||||
new_candidate.safekeeper_id,
|
||||
new_candidate.wal_source_connconf,
|
||||
ctx,
|
||||
)
|
||||
.change_connection(new_candidate, ctx)
|
||||
.await
|
||||
}
|
||||
}
|
||||
@@ -346,6 +342,8 @@ struct WalConnection {
|
||||
started_at: NaiveDateTime,
|
||||
/// Current safekeeper pageserver is connected to for WAL streaming.
|
||||
sk_id: NodeId,
|
||||
/// Availability zone of the safekeeper.
|
||||
availability_zone: Option<String>,
|
||||
/// Status of the connection.
|
||||
status: WalConnectionStatus,
|
||||
/// WAL streaming task handle.
|
||||
@@ -405,12 +403,7 @@ impl WalreceiverState {
|
||||
}
|
||||
|
||||
/// Shuts down the current connection (if any) and immediately starts another one with the given connection string.
|
||||
async fn change_connection(
|
||||
&mut self,
|
||||
new_sk_id: NodeId,
|
||||
new_wal_source_connconf: PgConnectionConfig,
|
||||
ctx: &RequestContext,
|
||||
) {
|
||||
async fn change_connection(&mut self, new_sk: NewWalConnectionCandidate, ctx: &RequestContext) {
|
||||
self.drop_old_connection(true).await;
|
||||
|
||||
let id = self.id;
|
||||
@@ -424,7 +417,7 @@ impl WalreceiverState {
|
||||
async move {
|
||||
super::walreceiver_connection::handle_walreceiver_connection(
|
||||
timeline,
|
||||
new_wal_source_connconf,
|
||||
new_sk.wal_source_connconf,
|
||||
events_sender,
|
||||
cancellation,
|
||||
connect_timeout,
|
||||
@@ -433,13 +426,16 @@ impl WalreceiverState {
|
||||
.await
|
||||
.context("walreceiver connection handling failure")
|
||||
}
|
||||
.instrument(info_span!("walreceiver_connection", id = %id, node_id = %new_sk_id))
|
||||
.instrument(
|
||||
info_span!("walreceiver_connection", id = %id, node_id = %new_sk.safekeeper_id),
|
||||
)
|
||||
});
|
||||
|
||||
let now = Utc::now().naive_utc();
|
||||
self.wal_connection = Some(WalConnection {
|
||||
started_at: now,
|
||||
sk_id: new_sk_id,
|
||||
sk_id: new_sk.safekeeper_id,
|
||||
availability_zone: new_sk.availability_zone,
|
||||
status: WalConnectionStatus {
|
||||
is_connected: false,
|
||||
has_processed_wal: false,
|
||||
@@ -546,6 +542,7 @@ impl WalreceiverState {
|
||||
/// * if connected safekeeper is not present, pick the candidate
|
||||
/// * if we haven't received any updates for some time, pick the candidate
|
||||
/// * if the candidate commit_lsn is much higher than the current one, pick the candidate
|
||||
/// * if the candidate commit_lsn is same, but candidate is located in the same AZ as the pageserver, pick the candidate
|
||||
/// * if connected safekeeper stopped sending us new WAL which is available on other safekeeper, pick the candidate
|
||||
///
|
||||
/// This way we ensure to keep up with the most up-to-date safekeeper and don't try to jump from one safekeeper to another too frequently.
|
||||
@@ -559,6 +556,7 @@ impl WalreceiverState {
|
||||
|
||||
let (new_sk_id, new_safekeeper_broker_data, new_wal_source_connconf) =
|
||||
self.select_connection_candidate(Some(connected_sk_node))?;
|
||||
let new_availability_zone = new_safekeeper_broker_data.availability_zone.clone();
|
||||
|
||||
let now = Utc::now().naive_utc();
|
||||
if let Ok(latest_interaciton) =
|
||||
@@ -569,6 +567,7 @@ impl WalreceiverState {
|
||||
return Some(NewWalConnectionCandidate {
|
||||
safekeeper_id: new_sk_id,
|
||||
wal_source_connconf: new_wal_source_connconf,
|
||||
availability_zone: new_availability_zone,
|
||||
reason: ReconnectReason::NoKeepAlives {
|
||||
last_keep_alive: Some(
|
||||
existing_wal_connection.status.latest_connection_update,
|
||||
@@ -594,6 +593,7 @@ impl WalreceiverState {
|
||||
return Some(NewWalConnectionCandidate {
|
||||
safekeeper_id: new_sk_id,
|
||||
wal_source_connconf: new_wal_source_connconf,
|
||||
availability_zone: new_availability_zone,
|
||||
reason: ReconnectReason::LaggingWal {
|
||||
current_commit_lsn,
|
||||
new_commit_lsn,
|
||||
@@ -601,6 +601,20 @@ impl WalreceiverState {
|
||||
},
|
||||
});
|
||||
}
|
||||
// If we have a candidate with the same commit_lsn as the current one, which is in the same AZ as pageserver,
|
||||
// and the current one is not, switch to the new one.
|
||||
if self.availability_zone.is_some()
|
||||
&& existing_wal_connection.availability_zone
|
||||
!= self.availability_zone
|
||||
&& self.availability_zone == new_availability_zone
|
||||
{
|
||||
return Some(NewWalConnectionCandidate {
|
||||
safekeeper_id: new_sk_id,
|
||||
availability_zone: new_availability_zone,
|
||||
wal_source_connconf: new_wal_source_connconf,
|
||||
reason: ReconnectReason::SwitchAvailabilityZone,
|
||||
});
|
||||
}
|
||||
}
|
||||
None => debug!(
|
||||
"Best SK candidate has its commit_lsn behind connected SK's commit_lsn"
|
||||
@@ -668,6 +682,7 @@ impl WalreceiverState {
|
||||
return Some(NewWalConnectionCandidate {
|
||||
safekeeper_id: new_sk_id,
|
||||
wal_source_connconf: new_wal_source_connconf,
|
||||
availability_zone: new_availability_zone,
|
||||
reason: ReconnectReason::NoWalTimeout {
|
||||
current_lsn,
|
||||
current_commit_lsn,
|
||||
@@ -686,10 +701,11 @@ impl WalreceiverState {
|
||||
self.wal_connection.as_mut().unwrap().discovered_new_wal = discovered_new_wal;
|
||||
}
|
||||
None => {
|
||||
let (new_sk_id, _, new_wal_source_connconf) =
|
||||
let (new_sk_id, new_safekeeper_broker_data, new_wal_source_connconf) =
|
||||
self.select_connection_candidate(None)?;
|
||||
return Some(NewWalConnectionCandidate {
|
||||
safekeeper_id: new_sk_id,
|
||||
availability_zone: new_safekeeper_broker_data.availability_zone.clone(),
|
||||
wal_source_connconf: new_wal_source_connconf,
|
||||
reason: ReconnectReason::NoExistingConnection,
|
||||
});
|
||||
@@ -794,6 +810,7 @@ impl WalreceiverState {
|
||||
struct NewWalConnectionCandidate {
|
||||
safekeeper_id: NodeId,
|
||||
wal_source_connconf: PgConnectionConfig,
|
||||
availability_zone: Option<String>,
|
||||
// This field is used in `derive(Debug)` only.
|
||||
#[allow(dead_code)]
|
||||
reason: ReconnectReason,
|
||||
@@ -808,6 +825,7 @@ enum ReconnectReason {
|
||||
new_commit_lsn: Lsn,
|
||||
threshold: NonZeroU64,
|
||||
},
|
||||
SwitchAvailabilityZone,
|
||||
NoWalTimeout {
|
||||
current_lsn: Lsn,
|
||||
current_commit_lsn: Lsn,
|
||||
@@ -873,6 +891,7 @@ mod tests {
|
||||
peer_horizon_lsn: 0,
|
||||
local_start_lsn: 0,
|
||||
safekeeper_connstr: safekeeper_connstr.to_owned(),
|
||||
availability_zone: None,
|
||||
},
|
||||
latest_update,
|
||||
}
|
||||
@@ -933,6 +952,7 @@ mod tests {
|
||||
state.wal_connection = Some(WalConnection {
|
||||
started_at: now,
|
||||
sk_id: connected_sk_id,
|
||||
availability_zone: None,
|
||||
status: connection_status,
|
||||
connection_task: TaskHandle::spawn(move |sender, _| async move {
|
||||
sender
|
||||
@@ -1095,6 +1115,7 @@ mod tests {
|
||||
state.wal_connection = Some(WalConnection {
|
||||
started_at: now,
|
||||
sk_id: connected_sk_id,
|
||||
availability_zone: None,
|
||||
status: connection_status,
|
||||
connection_task: TaskHandle::spawn(move |sender, _| async move {
|
||||
sender
|
||||
@@ -1160,6 +1181,7 @@ mod tests {
|
||||
state.wal_connection = Some(WalConnection {
|
||||
started_at: now,
|
||||
sk_id: NodeId(1),
|
||||
availability_zone: None,
|
||||
status: connection_status,
|
||||
connection_task: TaskHandle::spawn(move |sender, _| async move {
|
||||
sender
|
||||
@@ -1222,6 +1244,7 @@ mod tests {
|
||||
state.wal_connection = Some(WalConnection {
|
||||
started_at: now,
|
||||
sk_id: NodeId(1),
|
||||
availability_zone: None,
|
||||
status: connection_status,
|
||||
connection_task: TaskHandle::spawn(move |_, _| async move { Ok(()) }),
|
||||
discovered_new_wal: Some(NewCommittedWAL {
|
||||
@@ -1289,4 +1312,74 @@ mod tests {
|
||||
availability_zone: None,
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn switch_to_same_availability_zone() -> anyhow::Result<()> {
|
||||
// Pageserver and one of safekeepers will be in the same availability zone
|
||||
// and pageserver should prefer to connect to it.
|
||||
let test_az = Some("test_az".to_owned());
|
||||
|
||||
let harness = TenantHarness::create("switch_to_same_availability_zone")?;
|
||||
let mut state = dummy_state(&harness).await;
|
||||
state.availability_zone = test_az.clone();
|
||||
let current_lsn = Lsn(100_000).align();
|
||||
let now = Utc::now().naive_utc();
|
||||
|
||||
let connected_sk_id = NodeId(0);
|
||||
|
||||
let connection_status = WalConnectionStatus {
|
||||
is_connected: true,
|
||||
has_processed_wal: true,
|
||||
latest_connection_update: now,
|
||||
latest_wal_update: now,
|
||||
commit_lsn: Some(current_lsn),
|
||||
streaming_lsn: Some(current_lsn),
|
||||
};
|
||||
|
||||
state.wal_connection = Some(WalConnection {
|
||||
started_at: now,
|
||||
sk_id: connected_sk_id,
|
||||
availability_zone: None,
|
||||
status: connection_status,
|
||||
connection_task: TaskHandle::spawn(move |sender, _| async move {
|
||||
sender
|
||||
.send(TaskStateUpdate::Progress(connection_status))
|
||||
.ok();
|
||||
Ok(())
|
||||
}),
|
||||
discovered_new_wal: None,
|
||||
});
|
||||
|
||||
// We have another safekeeper with the same commit_lsn, and it have the same availability zone as
|
||||
// the current pageserver.
|
||||
let mut same_az_sk = dummy_broker_sk_timeline(current_lsn.0, "same_az", now);
|
||||
same_az_sk.timeline.availability_zone = test_az.clone();
|
||||
|
||||
state.wal_stream_candidates = HashMap::from([
|
||||
(
|
||||
connected_sk_id,
|
||||
dummy_broker_sk_timeline(current_lsn.0, DUMMY_SAFEKEEPER_HOST, now),
|
||||
),
|
||||
(NodeId(1), same_az_sk),
|
||||
]);
|
||||
|
||||
// We expect that pageserver will switch to the safekeeper in the same availability zone,
|
||||
// even if it has the same commit_lsn.
|
||||
let next_candidate = state.next_connection_candidate().expect(
|
||||
"Expected one candidate selected out of multiple valid data options, but got none",
|
||||
);
|
||||
|
||||
assert_eq!(next_candidate.safekeeper_id, NodeId(1));
|
||||
assert_eq!(
|
||||
next_candidate.reason,
|
||||
ReconnectReason::SwitchAvailabilityZone,
|
||||
"Should switch to the safekeeper in the same availability zone, if it has the same commit_lsn"
|
||||
);
|
||||
assert_eq!(
|
||||
next_candidate.wal_source_connconf.host(),
|
||||
&Host::Domain("same_az".to_owned())
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
|
||||
use std::{
|
||||
error::Error,
|
||||
pin::pin,
|
||||
str::FromStr,
|
||||
sync::Arc,
|
||||
time::{Duration, SystemTime},
|
||||
@@ -17,7 +18,7 @@ use postgres_ffi::v14::xlog_utils::normalize_lsn;
|
||||
use postgres_ffi::WAL_SEGMENT_SIZE;
|
||||
use postgres_protocol::message::backend::ReplicationMessage;
|
||||
use postgres_types::PgLsn;
|
||||
use tokio::{pin, select, sync::watch, time};
|
||||
use tokio::{select, sync::watch, time};
|
||||
use tokio_postgres::{replication::ReplicationStream, Client};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{debug, error, info, trace, warn};
|
||||
@@ -36,7 +37,7 @@ use crate::{
|
||||
use postgres_backend::is_expected_io_error;
|
||||
use postgres_connection::PgConnectionConfig;
|
||||
use postgres_ffi::waldecoder::WalStreamDecoder;
|
||||
use pq_proto::ReplicationFeedback;
|
||||
use pq_proto::PageserverFeedback;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
/// Status of the connection.
|
||||
@@ -187,8 +188,7 @@ pub async fn handle_walreceiver_connection(
|
||||
let query = format!("START_REPLICATION PHYSICAL {startpoint}");
|
||||
|
||||
let copy_stream = replication_client.copy_both_simple(&query).await?;
|
||||
let physical_stream = ReplicationStream::new(copy_stream);
|
||||
pin!(physical_stream);
|
||||
let mut physical_stream = pin!(ReplicationStream::new(copy_stream));
|
||||
|
||||
let mut waldecoder = WalStreamDecoder::new(startpoint, timeline.pg_version);
|
||||
|
||||
@@ -319,12 +319,12 @@ pub async fn handle_walreceiver_connection(
|
||||
timeline.get_remote_consistent_lsn().unwrap_or(Lsn(0));
|
||||
|
||||
// The last LSN we processed. It is not guaranteed to survive pageserver crash.
|
||||
let write_lsn = u64::from(last_lsn);
|
||||
let last_received_lsn = u64::from(last_lsn);
|
||||
// `disk_consistent_lsn` is the LSN at which page server guarantees local persistence of all received data
|
||||
let flush_lsn = u64::from(timeline.get_disk_consistent_lsn());
|
||||
let disk_consistent_lsn = u64::from(timeline.get_disk_consistent_lsn());
|
||||
// The last LSN that is synced to remote storage and is guaranteed to survive pageserver crash
|
||||
// Used by safekeepers to remove WAL preceding `remote_consistent_lsn`.
|
||||
let apply_lsn = u64::from(timeline_remote_consistent_lsn);
|
||||
let remote_consistent_lsn = u64::from(timeline_remote_consistent_lsn);
|
||||
let ts = SystemTime::now();
|
||||
|
||||
// Update the status about what we just received. This is shown in the mgmt API.
|
||||
@@ -343,12 +343,12 @@ pub async fn handle_walreceiver_connection(
|
||||
let (timeline_logical_size, _) = timeline
|
||||
.get_current_logical_size(&ctx)
|
||||
.context("Status update creation failed to get current logical size")?;
|
||||
let status_update = ReplicationFeedback {
|
||||
let status_update = PageserverFeedback {
|
||||
current_timeline_size: timeline_logical_size,
|
||||
ps_writelsn: write_lsn,
|
||||
ps_flushlsn: flush_lsn,
|
||||
ps_applylsn: apply_lsn,
|
||||
ps_replytime: ts,
|
||||
last_received_lsn,
|
||||
disk_consistent_lsn,
|
||||
remote_consistent_lsn,
|
||||
replytime: ts,
|
||||
};
|
||||
|
||||
debug!("neon_status_update {status_update:?}");
|
||||
|
||||
@@ -14,6 +14,7 @@
|
||||
*/
|
||||
|
||||
#include <sys/file.h>
|
||||
#include <sys/statvfs.h>
|
||||
#include <unistd.h>
|
||||
#include <fcntl.h>
|
||||
|
||||
@@ -34,6 +35,9 @@
|
||||
#include "storage/fd.h"
|
||||
#include "storage/pg_shmem.h"
|
||||
#include "storage/buf_internals.h"
|
||||
#include "storage/procsignal.h"
|
||||
#include "postmaster/bgworker.h"
|
||||
#include "postmaster/interrupt.h"
|
||||
|
||||
/*
|
||||
* Local file cache is used to temporary store relations pages in local file system.
|
||||
@@ -59,6 +63,9 @@
|
||||
|
||||
#define SIZE_MB_TO_CHUNKS(size) ((uint32)((size) * MB / BLCKSZ / BLOCKS_PER_CHUNK))
|
||||
|
||||
#define MAX_MONITOR_INTERVAL_USEC 1000000 /* 1 second */
|
||||
#define MAX_DISK_WRITE_RATE 1000 /* MB/sec */
|
||||
|
||||
typedef struct FileCacheEntry
|
||||
{
|
||||
BufferTag key;
|
||||
@@ -71,6 +78,7 @@ typedef struct FileCacheEntry
|
||||
typedef struct FileCacheControl
|
||||
{
|
||||
uint32 size; /* size of cache file in chunks */
|
||||
uint32 used; /* number of used chunks */
|
||||
dlist_head lru; /* double linked list for LRU replacement algorithm */
|
||||
} FileCacheControl;
|
||||
|
||||
@@ -79,12 +87,14 @@ static int lfc_desc;
|
||||
static LWLockId lfc_lock;
|
||||
static int lfc_max_size;
|
||||
static int lfc_size_limit;
|
||||
static int lfc_free_space_watermark;
|
||||
static char* lfc_path;
|
||||
static FileCacheControl* lfc_ctl;
|
||||
static shmem_startup_hook_type prev_shmem_startup_hook;
|
||||
#if PG_VERSION_NUM>=150000
|
||||
static shmem_request_hook_type prev_shmem_request_hook;
|
||||
#endif
|
||||
static int lfc_shrinking_factor; /* power of two by which local cache size will be shrinked when lfc_free_space_watermark is reached */
|
||||
|
||||
static void
|
||||
lfc_shmem_startup(void)
|
||||
@@ -112,6 +122,7 @@ lfc_shmem_startup(void)
|
||||
&info,
|
||||
HASH_ELEM | HASH_BLOBS);
|
||||
lfc_ctl->size = 0;
|
||||
lfc_ctl->used = 0;
|
||||
dlist_init(&lfc_ctl->lru);
|
||||
|
||||
/* Remove file cache on restart */
|
||||
@@ -165,7 +176,7 @@ lfc_change_limit_hook(int newval, void *extra)
|
||||
}
|
||||
}
|
||||
LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
|
||||
while (new_size < lfc_ctl->size && !dlist_is_empty(&lfc_ctl->lru))
|
||||
while (new_size < lfc_ctl->used && !dlist_is_empty(&lfc_ctl->lru))
|
||||
{
|
||||
/* Shrink cache by throwing away least recently accessed chunks and returning their space to file system */
|
||||
FileCacheEntry* victim = dlist_container(FileCacheEntry, lru_node, dlist_pop_head_node(&lfc_ctl->lru));
|
||||
@@ -175,12 +186,86 @@ lfc_change_limit_hook(int newval, void *extra)
|
||||
elog(LOG, "Failed to punch hole in file: %m");
|
||||
#endif
|
||||
hash_search(lfc_hash, &victim->key, HASH_REMOVE, NULL);
|
||||
lfc_ctl->size -= 1;
|
||||
lfc_ctl->used -= 1;
|
||||
}
|
||||
elog(LOG, "set local file cache limit to %d", new_size);
|
||||
LWLockRelease(lfc_lock);
|
||||
}
|
||||
|
||||
/*
|
||||
* Local file system state monitor check available free space.
|
||||
* If it is lower than lfc_free_space_watermark then we shrink size of local cache
|
||||
* but throwing away least recently accessed chunks.
|
||||
* First time low space watermark is reached cache size is divided by two,
|
||||
* second time by four,... Finally we remove all chunks from local cache.
|
||||
*
|
||||
* Please notice that we are not changing lfc_cache_size: it is used to be adjusted by autoscaler.
|
||||
* We only throw away cached chunks but do not prevent from filling cache by new chunks.
|
||||
*
|
||||
* Interval of poooling cache state is calculated as minimal time needed to consume lfc_free_space_watermark
|
||||
* disk space with maximal possible disk write speed (1Gb/sec). But not larger than 1 second.
|
||||
* Calling statvfs each second should not add any noticeable overhead.
|
||||
*/
|
||||
void
|
||||
FileCacheMonitorMain(Datum main_arg)
|
||||
{
|
||||
/*
|
||||
* Choose file system state monitor interval so that space can not be exosted
|
||||
* during this period but not longer than MAX_MONITOR_INTERVAL (10 sec)
|
||||
*/
|
||||
uint64 monitor_interval = Min(MAX_MONITOR_INTERVAL_USEC, lfc_free_space_watermark*MB/MAX_DISK_WRITE_RATE);
|
||||
|
||||
/* Establish signal handlers. */
|
||||
pqsignal(SIGUSR1, procsignal_sigusr1_handler);
|
||||
pqsignal(SIGHUP, SignalHandlerForConfigReload);
|
||||
pqsignal(SIGTERM, SignalHandlerForShutdownRequest);
|
||||
BackgroundWorkerUnblockSignals();
|
||||
|
||||
/* Periodically dump buffers until terminated. */
|
||||
while (!ShutdownRequestPending)
|
||||
{
|
||||
if (lfc_size_limit != 0)
|
||||
{
|
||||
struct statvfs sfs;
|
||||
if (statvfs(lfc_path, &sfs) < 0)
|
||||
{
|
||||
elog(WARNING, "Failed to obtain status of %s: %m", lfc_path);
|
||||
}
|
||||
else
|
||||
{
|
||||
if (sfs.f_bavail*sfs.f_bsize < lfc_free_space_watermark*MB)
|
||||
{
|
||||
if (lfc_shrinking_factor < 31) {
|
||||
lfc_shrinking_factor += 1;
|
||||
}
|
||||
lfc_change_limit_hook(lfc_size_limit >> lfc_shrinking_factor, NULL);
|
||||
}
|
||||
else
|
||||
lfc_shrinking_factor = 0; /* reset to initial value */
|
||||
}
|
||||
}
|
||||
pg_usleep(monitor_interval);
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
lfc_register_free_space_monitor(void)
|
||||
{
|
||||
BackgroundWorker bgw;
|
||||
memset(&bgw, 0, sizeof(bgw));
|
||||
bgw.bgw_flags = BGWORKER_SHMEM_ACCESS;
|
||||
bgw.bgw_start_time = BgWorkerStart_RecoveryFinished;
|
||||
snprintf(bgw.bgw_library_name, BGW_MAXLEN, "neon");
|
||||
snprintf(bgw.bgw_function_name, BGW_MAXLEN, "FileCacheMonitorMain");
|
||||
snprintf(bgw.bgw_name, BGW_MAXLEN, "Local free space monitor");
|
||||
snprintf(bgw.bgw_type, BGW_MAXLEN, "Local free space monitor");
|
||||
bgw.bgw_restart_time = 5;
|
||||
bgw.bgw_notify_pid = 0;
|
||||
bgw.bgw_main_arg = (Datum) 0;
|
||||
|
||||
RegisterBackgroundWorker(&bgw);
|
||||
}
|
||||
|
||||
void
|
||||
lfc_init(void)
|
||||
{
|
||||
@@ -217,6 +302,19 @@ lfc_init(void)
|
||||
lfc_change_limit_hook,
|
||||
NULL);
|
||||
|
||||
DefineCustomIntVariable("neon.free_space_watermark",
|
||||
"Minimal free space in local file system after reaching which local file cache will be truncated",
|
||||
NULL,
|
||||
&lfc_free_space_watermark,
|
||||
1024, /* 1GB */
|
||||
0,
|
||||
INT_MAX,
|
||||
PGC_SIGHUP,
|
||||
GUC_UNIT_MB,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL);
|
||||
|
||||
DefineCustomStringVariable("neon.file_cache_path",
|
||||
"Path to local file cache (can be raw device)",
|
||||
NULL,
|
||||
@@ -231,6 +329,9 @@ lfc_init(void)
|
||||
if (lfc_max_size == 0)
|
||||
return;
|
||||
|
||||
if (lfc_free_space_watermark != 0)
|
||||
lfc_register_free_space_monitor();
|
||||
|
||||
prev_shmem_startup_hook = shmem_startup_hook;
|
||||
shmem_startup_hook = lfc_shmem_startup;
|
||||
#if PG_VERSION_NUM>=150000
|
||||
@@ -380,7 +481,7 @@ lfc_write(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
|
||||
* there are should be very large number of concurrent IO operations and them are limited by max_connections,
|
||||
* we prefer not to complicate code and use second approach.
|
||||
*/
|
||||
if (lfc_ctl->size >= SIZE_MB_TO_CHUNKS(lfc_size_limit) && !dlist_is_empty(&lfc_ctl->lru))
|
||||
if (lfc_ctl->used >= SIZE_MB_TO_CHUNKS(lfc_size_limit) && !dlist_is_empty(&lfc_ctl->lru))
|
||||
{
|
||||
/* Cache overflow: evict least recently used chunk */
|
||||
FileCacheEntry* victim = dlist_container(FileCacheEntry, lru_node, dlist_pop_head_node(&lfc_ctl->lru));
|
||||
@@ -390,7 +491,10 @@ lfc_write(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
|
||||
elog(LOG, "Swap file cache page");
|
||||
}
|
||||
else
|
||||
{
|
||||
lfc_ctl->used += 1;
|
||||
entry->offset = lfc_ctl->size++; /* allocate new chunk at end of file */
|
||||
}
|
||||
entry->access_count = 1;
|
||||
memset(entry->bitmap, 0, sizeof entry->bitmap);
|
||||
}
|
||||
|
||||
@@ -1872,9 +1872,9 @@ RecvAppendResponses(Safekeeper *sk)
|
||||
return sk->state == SS_ACTIVE;
|
||||
}
|
||||
|
||||
/* Parse a ReplicationFeedback message, or the ReplicationFeedback part of an AppendResponse */
|
||||
/* Parse a PageserverFeedback message, or the PageserverFeedback part of an AppendResponse */
|
||||
void
|
||||
ParseReplicationFeedbackMessage(StringInfo reply_message, ReplicationFeedback * rf)
|
||||
ParsePageserverFeedbackMessage(StringInfo reply_message, PageserverFeedback * rf)
|
||||
{
|
||||
uint8 nkeys;
|
||||
int i;
|
||||
@@ -1892,45 +1892,45 @@ ParseReplicationFeedbackMessage(StringInfo reply_message, ReplicationFeedback *
|
||||
pq_getmsgint(reply_message, sizeof(int32));
|
||||
/* read value length */
|
||||
rf->currentClusterSize = pq_getmsgint64(reply_message);
|
||||
elog(DEBUG2, "ParseReplicationFeedbackMessage: current_timeline_size %lu",
|
||||
elog(DEBUG2, "ParsePageserverFeedbackMessage: current_timeline_size %lu",
|
||||
rf->currentClusterSize);
|
||||
}
|
||||
else if (strcmp(key, "ps_writelsn") == 0)
|
||||
else if ((strcmp(key, "ps_writelsn") == 0) || (strcmp(key, "last_received_lsn") == 0))
|
||||
{
|
||||
pq_getmsgint(reply_message, sizeof(int32));
|
||||
/* read value length */
|
||||
rf->ps_writelsn = pq_getmsgint64(reply_message);
|
||||
elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_writelsn %X/%X",
|
||||
LSN_FORMAT_ARGS(rf->ps_writelsn));
|
||||
rf->last_received_lsn = pq_getmsgint64(reply_message);
|
||||
elog(DEBUG2, "ParsePageserverFeedbackMessage: last_received_lsn %X/%X",
|
||||
LSN_FORMAT_ARGS(rf->last_received_lsn));
|
||||
}
|
||||
else if (strcmp(key, "ps_flushlsn") == 0)
|
||||
else if ((strcmp(key, "ps_flushlsn") == 0) || (strcmp(key, "disk_consistent_lsn") == 0))
|
||||
{
|
||||
pq_getmsgint(reply_message, sizeof(int32));
|
||||
/* read value length */
|
||||
rf->ps_flushlsn = pq_getmsgint64(reply_message);
|
||||
elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_flushlsn %X/%X",
|
||||
LSN_FORMAT_ARGS(rf->ps_flushlsn));
|
||||
rf->disk_consistent_lsn = pq_getmsgint64(reply_message);
|
||||
elog(DEBUG2, "ParsePageserverFeedbackMessage: disk_consistent_lsn %X/%X",
|
||||
LSN_FORMAT_ARGS(rf->disk_consistent_lsn));
|
||||
}
|
||||
else if (strcmp(key, "ps_applylsn") == 0)
|
||||
else if ((strcmp(key, "ps_applylsn") == 0) || (strcmp(key, "remote_consistent_lsn") == 0))
|
||||
{
|
||||
pq_getmsgint(reply_message, sizeof(int32));
|
||||
/* read value length */
|
||||
rf->ps_applylsn = pq_getmsgint64(reply_message);
|
||||
elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_applylsn %X/%X",
|
||||
LSN_FORMAT_ARGS(rf->ps_applylsn));
|
||||
rf->remote_consistent_lsn = pq_getmsgint64(reply_message);
|
||||
elog(DEBUG2, "ParsePageserverFeedbackMessage: remote_consistent_lsn %X/%X",
|
||||
LSN_FORMAT_ARGS(rf->remote_consistent_lsn));
|
||||
}
|
||||
else if (strcmp(key, "ps_replytime") == 0)
|
||||
else if ((strcmp(key, "ps_replytime") == 0) || (strcmp(key, "replytime") == 0))
|
||||
{
|
||||
pq_getmsgint(reply_message, sizeof(int32));
|
||||
/* read value length */
|
||||
rf->ps_replytime = pq_getmsgint64(reply_message);
|
||||
rf->replytime = pq_getmsgint64(reply_message);
|
||||
{
|
||||
char *replyTimeStr;
|
||||
|
||||
/* Copy because timestamptz_to_str returns a static buffer */
|
||||
replyTimeStr = pstrdup(timestamptz_to_str(rf->ps_replytime));
|
||||
elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_replytime %lu reply_time: %s",
|
||||
rf->ps_replytime, replyTimeStr);
|
||||
replyTimeStr = pstrdup(timestamptz_to_str(rf->replytime));
|
||||
elog(DEBUG2, "ParsePageserverFeedbackMessage: replytime %lu reply_time: %s",
|
||||
rf->replytime, replyTimeStr);
|
||||
|
||||
pfree(replyTimeStr);
|
||||
}
|
||||
@@ -1944,7 +1944,7 @@ ParseReplicationFeedbackMessage(StringInfo reply_message, ReplicationFeedback *
|
||||
* Skip unknown keys to support backward compatibile protocol
|
||||
* changes
|
||||
*/
|
||||
elog(LOG, "ParseReplicationFeedbackMessage: unknown key: %s len %d", key, len);
|
||||
elog(LOG, "ParsePageserverFeedbackMessage: unknown key: %s len %d", key, len);
|
||||
pq_getmsgbytes(reply_message, len);
|
||||
};
|
||||
}
|
||||
@@ -2024,7 +2024,7 @@ GetAcknowledgedByQuorumWALPosition(void)
|
||||
}
|
||||
|
||||
/*
|
||||
* ReplicationFeedbackShmemSize --- report amount of shared memory space needed
|
||||
* WalproposerShmemSize --- report amount of shared memory space needed
|
||||
*/
|
||||
Size
|
||||
WalproposerShmemSize(void)
|
||||
@@ -2054,10 +2054,10 @@ WalproposerShmemInit(void)
|
||||
}
|
||||
|
||||
void
|
||||
replication_feedback_set(ReplicationFeedback * rf)
|
||||
replication_feedback_set(PageserverFeedback * rf)
|
||||
{
|
||||
SpinLockAcquire(&walprop_shared->mutex);
|
||||
memcpy(&walprop_shared->feedback, rf, sizeof(ReplicationFeedback));
|
||||
memcpy(&walprop_shared->feedback, rf, sizeof(PageserverFeedback));
|
||||
SpinLockRelease(&walprop_shared->mutex);
|
||||
}
|
||||
|
||||
@@ -2065,43 +2065,43 @@ void
|
||||
replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn)
|
||||
{
|
||||
SpinLockAcquire(&walprop_shared->mutex);
|
||||
*writeLsn = walprop_shared->feedback.ps_writelsn;
|
||||
*flushLsn = walprop_shared->feedback.ps_flushlsn;
|
||||
*applyLsn = walprop_shared->feedback.ps_applylsn;
|
||||
*writeLsn = walprop_shared->feedback.last_received_lsn;
|
||||
*flushLsn = walprop_shared->feedback.disk_consistent_lsn;
|
||||
*applyLsn = walprop_shared->feedback.remote_consistent_lsn;
|
||||
SpinLockRelease(&walprop_shared->mutex);
|
||||
}
|
||||
|
||||
/*
|
||||
* Get ReplicationFeedback fields from the most advanced safekeeper
|
||||
* Get PageserverFeedback fields from the most advanced safekeeper
|
||||
*/
|
||||
static void
|
||||
GetLatestNeonFeedback(ReplicationFeedback * rf)
|
||||
GetLatestNeonFeedback(PageserverFeedback * rf)
|
||||
{
|
||||
int latest_safekeeper = 0;
|
||||
XLogRecPtr ps_writelsn = InvalidXLogRecPtr;
|
||||
XLogRecPtr last_received_lsn = InvalidXLogRecPtr;
|
||||
|
||||
for (int i = 0; i < n_safekeepers; i++)
|
||||
{
|
||||
if (safekeeper[i].appendResponse.rf.ps_writelsn > ps_writelsn)
|
||||
if (safekeeper[i].appendResponse.rf.last_received_lsn > last_received_lsn)
|
||||
{
|
||||
latest_safekeeper = i;
|
||||
ps_writelsn = safekeeper[i].appendResponse.rf.ps_writelsn;
|
||||
last_received_lsn = safekeeper[i].appendResponse.rf.last_received_lsn;
|
||||
}
|
||||
}
|
||||
|
||||
rf->currentClusterSize = safekeeper[latest_safekeeper].appendResponse.rf.currentClusterSize;
|
||||
rf->ps_writelsn = safekeeper[latest_safekeeper].appendResponse.rf.ps_writelsn;
|
||||
rf->ps_flushlsn = safekeeper[latest_safekeeper].appendResponse.rf.ps_flushlsn;
|
||||
rf->ps_applylsn = safekeeper[latest_safekeeper].appendResponse.rf.ps_applylsn;
|
||||
rf->ps_replytime = safekeeper[latest_safekeeper].appendResponse.rf.ps_replytime;
|
||||
rf->last_received_lsn = safekeeper[latest_safekeeper].appendResponse.rf.last_received_lsn;
|
||||
rf->disk_consistent_lsn = safekeeper[latest_safekeeper].appendResponse.rf.disk_consistent_lsn;
|
||||
rf->remote_consistent_lsn = safekeeper[latest_safekeeper].appendResponse.rf.remote_consistent_lsn;
|
||||
rf->replytime = safekeeper[latest_safekeeper].appendResponse.rf.replytime;
|
||||
|
||||
elog(DEBUG2, "GetLatestNeonFeedback: currentClusterSize %lu,"
|
||||
" ps_writelsn %X/%X, ps_flushlsn %X/%X, ps_applylsn %X/%X, ps_replytime %lu",
|
||||
" last_received_lsn %X/%X, disk_consistent_lsn %X/%X, remote_consistent_lsn %X/%X, replytime %lu",
|
||||
rf->currentClusterSize,
|
||||
LSN_FORMAT_ARGS(rf->ps_writelsn),
|
||||
LSN_FORMAT_ARGS(rf->ps_flushlsn),
|
||||
LSN_FORMAT_ARGS(rf->ps_applylsn),
|
||||
rf->ps_replytime);
|
||||
LSN_FORMAT_ARGS(rf->last_received_lsn),
|
||||
LSN_FORMAT_ARGS(rf->disk_consistent_lsn),
|
||||
LSN_FORMAT_ARGS(rf->remote_consistent_lsn),
|
||||
rf->replytime);
|
||||
|
||||
replication_feedback_set(rf);
|
||||
}
|
||||
@@ -2115,16 +2115,16 @@ HandleSafekeeperResponse(void)
|
||||
XLogRecPtr minFlushLsn;
|
||||
|
||||
minQuorumLsn = GetAcknowledgedByQuorumWALPosition();
|
||||
diskConsistentLsn = quorumFeedback.rf.ps_flushlsn;
|
||||
diskConsistentLsn = quorumFeedback.rf.disk_consistent_lsn;
|
||||
|
||||
if (!syncSafekeepers)
|
||||
{
|
||||
/* Get ReplicationFeedback fields from the most advanced safekeeper */
|
||||
/* Get PageserverFeedback fields from the most advanced safekeeper */
|
||||
GetLatestNeonFeedback(&quorumFeedback.rf);
|
||||
SetZenithCurrentClusterSize(quorumFeedback.rf.currentClusterSize);
|
||||
}
|
||||
|
||||
if (minQuorumLsn > quorumFeedback.flushLsn || diskConsistentLsn != quorumFeedback.rf.ps_flushlsn)
|
||||
if (minQuorumLsn > quorumFeedback.flushLsn || diskConsistentLsn != quorumFeedback.rf.disk_consistent_lsn)
|
||||
{
|
||||
|
||||
if (minQuorumLsn > quorumFeedback.flushLsn)
|
||||
@@ -2142,7 +2142,7 @@ HandleSafekeeperResponse(void)
|
||||
* apply_lsn - This is what processed and durably saved at*
|
||||
* pageserver.
|
||||
*/
|
||||
quorumFeedback.rf.ps_flushlsn,
|
||||
quorumFeedback.rf.disk_consistent_lsn,
|
||||
GetCurrentTimestamp(), false);
|
||||
}
|
||||
|
||||
@@ -2326,7 +2326,7 @@ AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage * anymsg)
|
||||
msg->hs.xmin.value = pq_getmsgint64_le(&s);
|
||||
msg->hs.catalog_xmin.value = pq_getmsgint64_le(&s);
|
||||
if (buf_size > APPENDRESPONSE_FIXEDPART_SIZE)
|
||||
ParseReplicationFeedbackMessage(&s, &msg->rf);
|
||||
ParsePageserverFeedbackMessage(&s, &msg->rf);
|
||||
pq_getmsgend(&s);
|
||||
return true;
|
||||
}
|
||||
@@ -2462,7 +2462,7 @@ backpressure_lag_impl(void)
|
||||
replication_feedback_get_lsns(&writePtr, &flushPtr, &applyPtr);
|
||||
#define MB ((XLogRecPtr)1024 * 1024)
|
||||
|
||||
elog(DEBUG2, "current flushLsn %X/%X ReplicationFeedback: write %X/%X flush %X/%X apply %X/%X",
|
||||
elog(DEBUG2, "current flushLsn %X/%X PageserverFeedback: write %X/%X flush %X/%X apply %X/%X",
|
||||
LSN_FORMAT_ARGS(myFlushLsn),
|
||||
LSN_FORMAT_ARGS(writePtr),
|
||||
LSN_FORMAT_ARGS(flushPtr),
|
||||
|
||||
@@ -280,21 +280,21 @@ typedef struct HotStandbyFeedback
|
||||
FullTransactionId catalog_xmin;
|
||||
} HotStandbyFeedback;
|
||||
|
||||
typedef struct ReplicationFeedback
|
||||
typedef struct PageserverFeedback
|
||||
{
|
||||
/* current size of the timeline on pageserver */
|
||||
uint64 currentClusterSize;
|
||||
/* standby_status_update fields that safekeeper received from pageserver */
|
||||
XLogRecPtr ps_writelsn;
|
||||
XLogRecPtr ps_flushlsn;
|
||||
XLogRecPtr ps_applylsn;
|
||||
TimestampTz ps_replytime;
|
||||
} ReplicationFeedback;
|
||||
XLogRecPtr last_received_lsn;
|
||||
XLogRecPtr disk_consistent_lsn;
|
||||
XLogRecPtr remote_consistent_lsn;
|
||||
TimestampTz replytime;
|
||||
} PageserverFeedback;
|
||||
|
||||
typedef struct WalproposerShmemState
|
||||
{
|
||||
slock_t mutex;
|
||||
ReplicationFeedback feedback;
|
||||
PageserverFeedback feedback;
|
||||
term_t mineLastElectedTerm;
|
||||
pg_atomic_uint64 backpressureThrottlingTime;
|
||||
} WalproposerShmemState;
|
||||
@@ -320,10 +320,10 @@ typedef struct AppendResponse
|
||||
/* Feedback recieved from pageserver includes standby_status_update fields */
|
||||
/* and custom neon feedback. */
|
||||
/* This part of the message is extensible. */
|
||||
ReplicationFeedback rf;
|
||||
PageserverFeedback rf;
|
||||
} AppendResponse;
|
||||
|
||||
/* ReplicationFeedback is extensible part of the message that is parsed separately */
|
||||
/* PageserverFeedback is extensible part of the message that is parsed separately */
|
||||
/* Other fields are fixed part */
|
||||
#define APPENDRESPONSE_FIXEDPART_SIZE offsetof(AppendResponse, rf)
|
||||
|
||||
@@ -383,13 +383,13 @@ extern void WalProposerSync(int argc, char *argv[]);
|
||||
extern void WalProposerMain(Datum main_arg);
|
||||
extern void WalProposerBroadcast(XLogRecPtr startpos, XLogRecPtr endpos);
|
||||
extern void WalProposerPoll(void);
|
||||
extern void ParseReplicationFeedbackMessage(StringInfo reply_message,
|
||||
ReplicationFeedback *rf);
|
||||
extern void ParsePageserverFeedbackMessage(StringInfo reply_message,
|
||||
PageserverFeedback *rf);
|
||||
extern void StartProposerReplication(StartReplicationCmd *cmd);
|
||||
|
||||
extern Size WalproposerShmemSize(void);
|
||||
extern bool WalproposerShmemInit(void);
|
||||
extern void replication_feedback_set(ReplicationFeedback *rf);
|
||||
extern void replication_feedback_set(PageserverFeedback *rf);
|
||||
extern void replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn);
|
||||
|
||||
/* libpqwalproposer hooks & helper type */
|
||||
|
||||
@@ -140,7 +140,7 @@ async fn auth_quirks(
|
||||
|
||||
impl BackendType<'_, ClientCredentials<'_>> {
|
||||
/// Authenticate the client via the requested backend, possibly using credentials.
|
||||
#[tracing::instrument(fields(allow_cleartext), skip_all)]
|
||||
#[tracing::instrument(fields(allow_cleartext = allow_cleartext), skip_all)]
|
||||
pub async fn authenticate(
|
||||
&mut self,
|
||||
extra: &ConsoleReqExtra<'_>,
|
||||
|
||||
@@ -98,7 +98,7 @@ pub async fn task_main(
|
||||
}
|
||||
|
||||
// TODO(tech debt): unite this with its twin below.
|
||||
#[tracing::instrument(fields(session_id), skip_all)]
|
||||
#[tracing::instrument(fields(session_id = ?session_id), skip_all)]
|
||||
pub async fn handle_ws_client(
|
||||
config: &'static ProxyConfig,
|
||||
cancel_map: &CancelMap,
|
||||
@@ -140,7 +140,7 @@ pub async fn handle_ws_client(
|
||||
.await
|
||||
}
|
||||
|
||||
#[tracing::instrument(fields(session_id), skip_all)]
|
||||
#[tracing::instrument(fields(session_id = ?session_id), skip_all)]
|
||||
async fn handle_client(
|
||||
config: &'static ProxyConfig,
|
||||
cancel_map: &CancelMap,
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
[toolchain]
|
||||
channel = "1.66.1"
|
||||
channel = "1.68.2"
|
||||
profile = "default"
|
||||
# The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy.
|
||||
# https://rust-lang.github.io/rustup/concepts/profiles.html
|
||||
|
||||
@@ -5,6 +5,7 @@ use anyhow::{bail, Context, Result};
|
||||
use clap::Parser;
|
||||
use remote_storage::RemoteStorageConfig;
|
||||
use toml_edit::Document;
|
||||
use utils::signals::ShutdownSignals;
|
||||
|
||||
use std::fs::{self, File};
|
||||
use std::io::{ErrorKind, Write};
|
||||
@@ -39,7 +40,7 @@ use utils::{
|
||||
logging::{self, LogFormat},
|
||||
project_git_version,
|
||||
sentry_init::init_sentry,
|
||||
signals, tcp_listener,
|
||||
tcp_listener,
|
||||
};
|
||||
|
||||
const PID_FILE_NAME: &str = "safekeeper.pid";
|
||||
@@ -216,7 +217,6 @@ fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
|
||||
let timeline_collector = safekeeper::metrics::TimelineCollector::new();
|
||||
metrics::register_internal(Box::new(timeline_collector))?;
|
||||
|
||||
let signals = signals::install_shutdown_handlers()?;
|
||||
let mut threads = vec![];
|
||||
let (wal_backup_launcher_tx, wal_backup_launcher_rx) = mpsc::channel(100);
|
||||
|
||||
@@ -274,15 +274,12 @@ fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
|
||||
|
||||
set_build_info_metric(GIT_VERSION);
|
||||
// TODO: put more thoughts into handling of failed threads
|
||||
// We probably should restart them.
|
||||
// We should catch & die if they are in trouble.
|
||||
|
||||
// NOTE: we still have to handle signals like SIGQUIT to prevent coredumps
|
||||
signals.handle(|signal| {
|
||||
// TODO: implement graceful shutdown with joining threads etc
|
||||
info!(
|
||||
"received {}, terminating in immediate shutdown mode",
|
||||
signal.name()
|
||||
);
|
||||
// On any shutdown signal, log receival and exit. Additionally, handling
|
||||
// SIGQUIT prevents coredump.
|
||||
ShutdownSignals::handle(|signal| {
|
||||
info!("received {}, terminating", signal.name());
|
||||
std::process::exit(0);
|
||||
})
|
||||
}
|
||||
|
||||
@@ -242,6 +242,7 @@ async fn record_safekeeper_info(mut request: Request<Body>) -> Result<Response<B
|
||||
safekeeper_connstr: sk_info.safekeeper_connstr.unwrap_or_else(|| "".to_owned()),
|
||||
backup_lsn: sk_info.backup_lsn.0,
|
||||
local_start_lsn: sk_info.local_start_lsn.0,
|
||||
availability_zone: None,
|
||||
};
|
||||
|
||||
let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?;
|
||||
|
||||
@@ -255,7 +255,7 @@ pub struct TimelineCollector {
|
||||
epoch_start_lsn: GenericGaugeVec<AtomicU64>,
|
||||
peer_horizon_lsn: GenericGaugeVec<AtomicU64>,
|
||||
remote_consistent_lsn: GenericGaugeVec<AtomicU64>,
|
||||
feedback_ps_write_lsn: GenericGaugeVec<AtomicU64>,
|
||||
ps_last_received_lsn: GenericGaugeVec<AtomicU64>,
|
||||
feedback_last_time_seconds: GenericGaugeVec<AtomicU64>,
|
||||
timeline_active: GenericGaugeVec<AtomicU64>,
|
||||
wal_backup_active: GenericGaugeVec<AtomicU64>,
|
||||
@@ -339,15 +339,15 @@ impl TimelineCollector {
|
||||
.unwrap();
|
||||
descs.extend(remote_consistent_lsn.desc().into_iter().cloned());
|
||||
|
||||
let feedback_ps_write_lsn = GenericGaugeVec::new(
|
||||
let ps_last_received_lsn = GenericGaugeVec::new(
|
||||
Opts::new(
|
||||
"safekeeper_feedback_ps_write_lsn",
|
||||
"safekeeper_ps_last_received_lsn",
|
||||
"Last LSN received by the pageserver, acknowledged in the feedback",
|
||||
),
|
||||
&["tenant_id", "timeline_id"],
|
||||
)
|
||||
.unwrap();
|
||||
descs.extend(feedback_ps_write_lsn.desc().into_iter().cloned());
|
||||
descs.extend(ps_last_received_lsn.desc().into_iter().cloned());
|
||||
|
||||
let feedback_last_time_seconds = GenericGaugeVec::new(
|
||||
Opts::new(
|
||||
@@ -458,7 +458,7 @@ impl TimelineCollector {
|
||||
epoch_start_lsn,
|
||||
peer_horizon_lsn,
|
||||
remote_consistent_lsn,
|
||||
feedback_ps_write_lsn,
|
||||
ps_last_received_lsn,
|
||||
feedback_last_time_seconds,
|
||||
timeline_active,
|
||||
wal_backup_active,
|
||||
@@ -489,7 +489,7 @@ impl Collector for TimelineCollector {
|
||||
self.epoch_start_lsn.reset();
|
||||
self.peer_horizon_lsn.reset();
|
||||
self.remote_consistent_lsn.reset();
|
||||
self.feedback_ps_write_lsn.reset();
|
||||
self.ps_last_received_lsn.reset();
|
||||
self.feedback_last_time_seconds.reset();
|
||||
self.timeline_active.reset();
|
||||
self.wal_backup_active.reset();
|
||||
@@ -514,11 +514,11 @@ impl Collector for TimelineCollector {
|
||||
let timeline_id = tli.ttid.timeline_id.to_string();
|
||||
let labels = &[tenant_id.as_str(), timeline_id.as_str()];
|
||||
|
||||
let mut most_advanced: Option<pq_proto::ReplicationFeedback> = None;
|
||||
let mut most_advanced: Option<pq_proto::PageserverFeedback> = None;
|
||||
for replica in tli.replicas.iter() {
|
||||
if let Some(replica_feedback) = replica.pageserver_feedback {
|
||||
if let Some(current) = most_advanced {
|
||||
if current.ps_writelsn < replica_feedback.ps_writelsn {
|
||||
if current.last_received_lsn < replica_feedback.last_received_lsn {
|
||||
most_advanced = Some(replica_feedback);
|
||||
}
|
||||
} else {
|
||||
@@ -568,11 +568,10 @@ impl Collector for TimelineCollector {
|
||||
.set(tli.wal_storage.flush_wal_seconds);
|
||||
|
||||
if let Some(feedback) = most_advanced {
|
||||
self.feedback_ps_write_lsn
|
||||
self.ps_last_received_lsn
|
||||
.with_label_values(labels)
|
||||
.set(feedback.ps_writelsn);
|
||||
if let Ok(unix_time) = feedback.ps_replytime.duration_since(SystemTime::UNIX_EPOCH)
|
||||
{
|
||||
.set(feedback.last_received_lsn);
|
||||
if let Ok(unix_time) = feedback.replytime.duration_since(SystemTime::UNIX_EPOCH) {
|
||||
self.feedback_last_time_seconds
|
||||
.with_label_values(labels)
|
||||
.set(unix_time.as_secs());
|
||||
@@ -599,7 +598,7 @@ impl Collector for TimelineCollector {
|
||||
mfs.extend(self.epoch_start_lsn.collect());
|
||||
mfs.extend(self.peer_horizon_lsn.collect());
|
||||
mfs.extend(self.remote_consistent_lsn.collect());
|
||||
mfs.extend(self.feedback_ps_write_lsn.collect());
|
||||
mfs.extend(self.ps_last_received_lsn.collect());
|
||||
mfs.extend(self.feedback_last_time_seconds.collect());
|
||||
mfs.extend(self.timeline_active.collect());
|
||||
mfs.extend(self.wal_backup_active.collect());
|
||||
|
||||
@@ -18,7 +18,7 @@ use crate::control_file;
|
||||
use crate::send_wal::HotStandbyFeedback;
|
||||
|
||||
use crate::wal_storage;
|
||||
use pq_proto::{ReplicationFeedback, SystemId};
|
||||
use pq_proto::{PageserverFeedback, SystemId};
|
||||
use utils::{
|
||||
bin_ser::LeSer,
|
||||
id::{NodeId, TenantId, TenantTimelineId, TimelineId},
|
||||
@@ -360,7 +360,7 @@ pub struct AppendResponse {
|
||||
// a criterion for walproposer --sync mode exit
|
||||
pub commit_lsn: Lsn,
|
||||
pub hs_feedback: HotStandbyFeedback,
|
||||
pub pageserver_feedback: ReplicationFeedback,
|
||||
pub pageserver_feedback: PageserverFeedback,
|
||||
}
|
||||
|
||||
impl AppendResponse {
|
||||
@@ -370,7 +370,7 @@ impl AppendResponse {
|
||||
flush_lsn: Lsn(0),
|
||||
commit_lsn: Lsn(0),
|
||||
hs_feedback: HotStandbyFeedback::empty(),
|
||||
pageserver_feedback: ReplicationFeedback::empty(),
|
||||
pageserver_feedback: PageserverFeedback::empty(),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -708,7 +708,7 @@ where
|
||||
commit_lsn: self.state.commit_lsn,
|
||||
// will be filled by the upper code to avoid bothering safekeeper
|
||||
hs_feedback: HotStandbyFeedback::empty(),
|
||||
pageserver_feedback: ReplicationFeedback::empty(),
|
||||
pageserver_feedback: PageserverFeedback::empty(),
|
||||
};
|
||||
trace!("formed AppendResponse {:?}", ar);
|
||||
ar
|
||||
|
||||
@@ -11,7 +11,7 @@ use postgres_backend::PostgresBackend;
|
||||
use postgres_backend::{CopyStreamHandlerEnd, PostgresBackendReader, QueryError};
|
||||
use postgres_ffi::get_current_timestamp;
|
||||
use postgres_ffi::{TimestampTz, MAX_SEND_SIZE};
|
||||
use pq_proto::{BeMessage, ReplicationFeedback, WalSndKeepAlive, XLogDataBody};
|
||||
use pq_proto::{BeMessage, PageserverFeedback, WalSndKeepAlive, XLogDataBody};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
|
||||
@@ -319,11 +319,9 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> ReplyReader<IO> {
|
||||
// pageserver sends this.
|
||||
// Note: deserializing is on m[9..] because we skip the tag byte and len bytes.
|
||||
let buf = Bytes::copy_from_slice(&msg[9..]);
|
||||
let reply = ReplicationFeedback::parse(buf);
|
||||
let reply = PageserverFeedback::parse(buf);
|
||||
|
||||
trace!("ReplicationFeedback is {:?}", reply);
|
||||
// Only pageserver sends ReplicationFeedback, so set the flag.
|
||||
// This replica is the source of information to resend to compute.
|
||||
trace!("PageserverFeedback is {:?}", reply);
|
||||
self.feedback.pageserver_feedback = Some(reply);
|
||||
|
||||
self.tli
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
use anyhow::{anyhow, bail, Result};
|
||||
use parking_lot::{Mutex, MutexGuard};
|
||||
use postgres_ffi::XLogSegNo;
|
||||
use pq_proto::ReplicationFeedback;
|
||||
use pq_proto::PageserverFeedback;
|
||||
use serde::Serialize;
|
||||
use std::cmp::{max, min};
|
||||
use std::path::PathBuf;
|
||||
@@ -91,7 +91,7 @@ pub struct ReplicaState {
|
||||
/// combined hot standby feedback from all replicas
|
||||
pub hs_feedback: HotStandbyFeedback,
|
||||
/// Replication specific feedback received from pageserver, if any
|
||||
pub pageserver_feedback: Option<ReplicationFeedback>,
|
||||
pub pageserver_feedback: Option<PageserverFeedback>,
|
||||
}
|
||||
|
||||
impl Default for ReplicaState {
|
||||
@@ -276,7 +276,7 @@ impl SharedState {
|
||||
//
|
||||
if let Some(pageserver_feedback) = state.pageserver_feedback {
|
||||
if let Some(acc_feedback) = acc.pageserver_feedback {
|
||||
if acc_feedback.ps_writelsn < pageserver_feedback.ps_writelsn {
|
||||
if acc_feedback.last_received_lsn < pageserver_feedback.last_received_lsn {
|
||||
warn!("More than one pageserver is streaming WAL for the timeline. Feedback resolving is not fully supported yet.");
|
||||
acc.pageserver_feedback = Some(pageserver_feedback);
|
||||
}
|
||||
@@ -287,12 +287,12 @@ impl SharedState {
|
||||
// last lsn received by pageserver
|
||||
// FIXME if multiple pageservers are streaming WAL, last_received_lsn must be tracked per pageserver.
|
||||
// See https://github.com/neondatabase/neon/issues/1171
|
||||
acc.last_received_lsn = Lsn::from(pageserver_feedback.ps_writelsn);
|
||||
acc.last_received_lsn = Lsn::from(pageserver_feedback.last_received_lsn);
|
||||
|
||||
// When at least one pageserver has preserved data up to remote_consistent_lsn,
|
||||
// safekeeper is free to delete it, so choose max of all pageservers.
|
||||
acc.remote_consistent_lsn = max(
|
||||
Lsn::from(pageserver_feedback.ps_applylsn),
|
||||
Lsn::from(pageserver_feedback.remote_consistent_lsn),
|
||||
acc.remote_consistent_lsn,
|
||||
);
|
||||
}
|
||||
@@ -337,6 +337,7 @@ impl SharedState {
|
||||
safekeeper_connstr: conf.listen_pg_addr.clone(),
|
||||
backup_lsn: self.sk.inmem.backup_lsn.0,
|
||||
local_start_lsn: self.sk.state.local_start_lsn.0,
|
||||
availability_zone: conf.availability_zone.clone(),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -584,7 +585,7 @@ impl Timeline {
|
||||
let replica_state = shared_state.replicas[replica_id].unwrap();
|
||||
let reported_remote_consistent_lsn = replica_state
|
||||
.pageserver_feedback
|
||||
.map(|f| Lsn(f.ps_applylsn))
|
||||
.map(|f| Lsn(f.remote_consistent_lsn))
|
||||
.unwrap_or(Lsn::INVALID);
|
||||
let stop = shared_state.sk.inmem.commit_lsn == Lsn(0) || // no data at all yet
|
||||
(reported_remote_consistent_lsn!= Lsn::MAX && // Lsn::MAX means that we don't know the latest LSN yet.
|
||||
|
||||
2
scripts/sk_collect_dumps/.gitignore
vendored
Normal file
2
scripts/sk_collect_dumps/.gitignore
vendored
Normal file
@@ -0,0 +1,2 @@
|
||||
result
|
||||
*.json
|
||||
25
scripts/sk_collect_dumps/readme.md
Normal file
25
scripts/sk_collect_dumps/readme.md
Normal file
@@ -0,0 +1,25 @@
|
||||
# Collect /v1/debug_dump from all safekeeper nodes
|
||||
|
||||
1. Run ansible playbooks to collect .json dumps from all safekeepers and store them in `./result` directory.
|
||||
2. Run `DB_CONNSTR=... ./upload.sh prod_feb30` to upload dumps to `prod_feb30` table in specified postgres database.
|
||||
|
||||
## How to use ansible (staging)
|
||||
|
||||
```
|
||||
AWS_DEFAULT_PROFILE=dev ansible-playbook -i ../../.github/ansible/staging.us-east-2.hosts.yaml -e @../../.github/ansible/ssm_config remote.yaml
|
||||
|
||||
AWS_DEFAULT_PROFILE=dev ansible-playbook -i ../../.github/ansible/staging.eu-west-1.hosts.yaml -e @../../.github/ansible/ssm_config remote.yaml
|
||||
```
|
||||
|
||||
## How to use ansible (prod)
|
||||
|
||||
```
|
||||
AWS_DEFAULT_PROFILE=prod ansible-playbook -i ../../.github/ansible/prod.us-west-2.hosts.yaml -e @../../.github/ansible/ssm_config remote.yaml
|
||||
|
||||
AWS_DEFAULT_PROFILE=prod ansible-playbook -i ../../.github/ansible/prod.us-east-2.hosts.yaml -e @../../.github/ansible/ssm_config remote.yaml
|
||||
|
||||
AWS_DEFAULT_PROFILE=prod ansible-playbook -i ../../.github/ansible/prod.eu-central-1.hosts.yaml -e @../../.github/ansible/ssm_config remote.yaml
|
||||
|
||||
AWS_DEFAULT_PROFILE=prod ansible-playbook -i ../../.github/ansible/prod.ap-southeast-1.hosts.yaml -e @../../.github/ansible/ssm_config remote.yaml
|
||||
```
|
||||
|
||||
18
scripts/sk_collect_dumps/remote.yaml
Normal file
18
scripts/sk_collect_dumps/remote.yaml
Normal file
@@ -0,0 +1,18 @@
|
||||
- name: Fetch state dumps from safekeepers
|
||||
hosts: safekeepers
|
||||
gather_facts: False
|
||||
remote_user: "{{ remote_user }}"
|
||||
|
||||
tasks:
|
||||
- name: Download file
|
||||
get_url:
|
||||
url: "http://{{ inventory_hostname }}:7676/v1/debug_dump?dump_all=true&dump_disk_content=false"
|
||||
dest: "/tmp/{{ inventory_hostname }}.json"
|
||||
|
||||
- name: Fetch file from remote hosts
|
||||
fetch:
|
||||
src: "/tmp/{{ inventory_hostname }}.json"
|
||||
dest: "./result/{{ inventory_hostname }}.json"
|
||||
flat: yes
|
||||
fail_on_missing: no
|
||||
|
||||
52
scripts/sk_collect_dumps/upload.sh
Executable file
52
scripts/sk_collect_dumps/upload.sh
Executable file
@@ -0,0 +1,52 @@
|
||||
#!/bin/bash
|
||||
|
||||
if [ -z "$DB_CONNSTR" ]; then
|
||||
echo "DB_CONNSTR is not set"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Create a temporary table for JSON data
|
||||
psql $DB_CONNSTR -c 'DROP TABLE IF EXISTS tmp_json'
|
||||
psql $DB_CONNSTR -c 'CREATE TABLE tmp_json (data jsonb)'
|
||||
|
||||
for file in ./result/*.json; do
|
||||
echo "$file"
|
||||
SK_ID=$(jq '.config.id' $file)
|
||||
echo "SK_ID: $SK_ID"
|
||||
jq -c ".timelines[] | . + {\"sk_id\": $SK_ID}" $file | psql $DB_CONNSTR -c "\\COPY tmp_json (data) FROM STDIN"
|
||||
done
|
||||
|
||||
TABLE_NAME=$1
|
||||
|
||||
if [ -z "$TABLE_NAME" ]; then
|
||||
echo "TABLE_NAME is not set, skipping conversion to table with typed columns"
|
||||
echo "Usage: ./upload.sh TABLE_NAME"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
psql $DB_CONNSTR <<EOF
|
||||
CREATE TABLE $TABLE_NAME AS
|
||||
SELECT
|
||||
(data->>'sk_id')::bigint AS sk_id,
|
||||
(data->>'tenant_id') AS tenant_id,
|
||||
(data->>'timeline_id') AS timeline_id,
|
||||
(data->'memory'->>'active')::bool AS active,
|
||||
(data->'memory'->>'flush_lsn')::bigint AS flush_lsn,
|
||||
(data->'memory'->'mem_state'->>'backup_lsn')::bigint AS backup_lsn,
|
||||
(data->'memory'->'mem_state'->>'commit_lsn')::bigint AS commit_lsn,
|
||||
(data->'memory'->'mem_state'->>'peer_horizon_lsn')::bigint AS peer_horizon_lsn,
|
||||
(data->'memory'->'mem_state'->>'remote_consistent_lsn')::bigint AS remote_consistent_lsn,
|
||||
(data->'memory'->>'write_lsn')::bigint AS write_lsn,
|
||||
(data->'memory'->>'num_computes')::bigint AS num_computes,
|
||||
(data->'memory'->>'epoch_start_lsn')::bigint AS epoch_start_lsn,
|
||||
(data->'memory'->>'last_removed_segno')::bigint AS last_removed_segno,
|
||||
(data->'memory'->>'is_cancelled')::bool AS is_cancelled,
|
||||
(data->'control_file'->>'backup_lsn')::bigint AS disk_backup_lsn,
|
||||
(data->'control_file'->>'commit_lsn')::bigint AS disk_commit_lsn,
|
||||
(data->'control_file'->'acceptor_state'->>'term')::bigint AS disk_term,
|
||||
(data->'control_file'->>'local_start_lsn')::bigint AS local_start_lsn,
|
||||
(data->'control_file'->>'peer_horizon_lsn')::bigint AS disk_peer_horizon_lsn,
|
||||
(data->'control_file'->>'timeline_start_lsn')::bigint AS timeline_start_lsn,
|
||||
(data->'control_file'->>'remote_consistent_lsn')::bigint AS disk_remote_consistent_lsn
|
||||
FROM tmp_json
|
||||
EOF
|
||||
@@ -133,6 +133,7 @@ async fn publish(client: Option<BrokerClientChannel>, n_keys: u64) {
|
||||
peer_horizon_lsn: 5,
|
||||
safekeeper_connstr: "zenith-1-sk-1.local:7676".to_owned(),
|
||||
local_start_lsn: 0,
|
||||
availability_zone: None,
|
||||
};
|
||||
counter += 1;
|
||||
yield info;
|
||||
|
||||
@@ -36,9 +36,11 @@ message SafekeeperTimelineInfo {
|
||||
uint64 local_start_lsn = 9;
|
||||
// A connection string to use for WAL receiving.
|
||||
string safekeeper_connstr = 10;
|
||||
// Availability zone of a safekeeper.
|
||||
optional string availability_zone = 11;
|
||||
}
|
||||
|
||||
message TenantTimelineId {
|
||||
bytes tenant_id = 1;
|
||||
bytes timeline_id = 2;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -33,6 +33,7 @@ use tonic::transport::server::Connected;
|
||||
use tonic::Code;
|
||||
use tonic::{Request, Response, Status};
|
||||
use tracing::*;
|
||||
use utils::signals::ShutdownSignals;
|
||||
|
||||
use metrics::{Encoder, TextEncoder};
|
||||
use storage_broker::metrics::{NUM_PUBS, NUM_SUBS_ALL, NUM_SUBS_TIMELINE};
|
||||
@@ -437,6 +438,14 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||
info!("version: {GIT_VERSION}");
|
||||
::metrics::set_build_info_metric(GIT_VERSION);
|
||||
|
||||
// On any shutdown signal, log receival and exit.
|
||||
std::thread::spawn(move || {
|
||||
ShutdownSignals::handle(|signal| {
|
||||
info!("received {}, terminating", signal.name());
|
||||
std::process::exit(0);
|
||||
})
|
||||
});
|
||||
|
||||
let registry = Registry {
|
||||
shared_state: Arc::new(RwLock::new(SharedState::new(args.all_keys_chan_size))),
|
||||
timeline_chan_size: args.timeline_chan_size,
|
||||
@@ -516,6 +525,7 @@ mod tests {
|
||||
peer_horizon_lsn: 5,
|
||||
safekeeper_connstr: "neon-1-sk-1.local:7676".to_owned(),
|
||||
local_start_lsn: 0,
|
||||
availability_zone: None,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -10,7 +10,7 @@ def test_timeline_delete(neon_simple_env: NeonEnv):
|
||||
env.pageserver.allowed_errors.append(".*Timeline .* was not found.*")
|
||||
env.pageserver.allowed_errors.append(".*timeline not found.*")
|
||||
env.pageserver.allowed_errors.append(".*Cannot delete timeline which has child timelines.*")
|
||||
env.pageserver.allowed_errors.append(".*NotFound: tenant .*")
|
||||
env.pageserver.allowed_errors.append(".*Precondition failed: Requested tenant is missing.*")
|
||||
|
||||
ps_http = env.pageserver.http_client()
|
||||
|
||||
@@ -24,11 +24,11 @@ def test_timeline_delete(neon_simple_env: NeonEnv):
|
||||
invalid_tenant_id = TenantId.generate()
|
||||
with pytest.raises(
|
||||
PageserverApiException,
|
||||
match=f"NotFound: tenant {invalid_tenant_id}",
|
||||
match="Precondition failed: Requested tenant is missing",
|
||||
) as exc:
|
||||
ps_http.timeline_delete(tenant_id=invalid_tenant_id, timeline_id=invalid_timeline_id)
|
||||
|
||||
assert exc.value.status_code == 404
|
||||
assert exc.value.status_code == 412
|
||||
|
||||
# construct pair of branches to validate that pageserver prohibits
|
||||
# deletion of ancestor timelines when they have child branches
|
||||
|
||||
2
vendor/postgres-v14
vendored
2
vendor/postgres-v14
vendored
Submodule vendor/postgres-v14 updated: 9fd9794436...757df1dab8
2
vendor/postgres-v15
vendored
2
vendor/postgres-v15
vendored
Submodule vendor/postgres-v15 updated: 257aaefb25...f8a650e49b
Reference in New Issue
Block a user