mirror of
https://github.com/neondatabase/neon.git
synced 2026-06-04 22:10:39 +00:00
@@ -4,7 +4,7 @@
|
||||
hakari-package = "workspace_hack"
|
||||
|
||||
# Format for `workspace-hack = ...` lines in other Cargo.tomls. Requires cargo-hakari 0.9.8 or above.
|
||||
dep-format-version = "2"
|
||||
dep-format-version = "3"
|
||||
|
||||
# Setting workspace.resolver = "2" in the root Cargo.toml is HIGHLY recommended.
|
||||
# Hakari works much better with the new feature resolver.
|
||||
|
||||
@@ -6,6 +6,8 @@ storage:
|
||||
broker_endpoint: http://storage-broker-lb.epsilon.ap-southeast-1.internal.aws.neon.tech:50051
|
||||
pageserver_config_stub:
|
||||
pg_distrib_dir: /usr/local
|
||||
metric_collection_endpoint: http://console-release.local/billing/api/v1/usage_events
|
||||
metric_collection_interval: 10min
|
||||
remote_storage:
|
||||
bucket_name: "{{ bucket_name }}"
|
||||
bucket_region: "{{ bucket_region }}"
|
||||
|
||||
2
.github/ansible/prod.eu-central-1.hosts.yaml
vendored
2
.github/ansible/prod.eu-central-1.hosts.yaml
vendored
@@ -6,6 +6,8 @@ storage:
|
||||
broker_endpoint: http://storage-broker-lb.gamma.eu-central-1.internal.aws.neon.tech:50051
|
||||
pageserver_config_stub:
|
||||
pg_distrib_dir: /usr/local
|
||||
metric_collection_endpoint: http://console-release.local/billing/api/v1/usage_events
|
||||
metric_collection_interval: 10min
|
||||
remote_storage:
|
||||
bucket_name: "{{ bucket_name }}"
|
||||
bucket_region: "{{ bucket_region }}"
|
||||
|
||||
4
.github/ansible/prod.us-east-2.hosts.yaml
vendored
4
.github/ansible/prod.us-east-2.hosts.yaml
vendored
@@ -6,6 +6,8 @@ storage:
|
||||
broker_endpoint: http://storage-broker-lb.delta.us-east-2.internal.aws.neon.tech:50051
|
||||
pageserver_config_stub:
|
||||
pg_distrib_dir: /usr/local
|
||||
metric_collection_endpoint: http://console-release.local/billing/api/v1/usage_events
|
||||
metric_collection_interval: 10min
|
||||
remote_storage:
|
||||
bucket_name: "{{ bucket_name }}"
|
||||
bucket_region: "{{ bucket_region }}"
|
||||
@@ -34,4 +36,4 @@ storage:
|
||||
ansible_host: i-06d113fb73bfddeb0
|
||||
safekeeper-2.us-east-2.aws.neon.tech:
|
||||
ansible_host: i-09f66c8e04afff2e8
|
||||
|
||||
|
||||
|
||||
2
.github/ansible/prod.us-west-2.hosts.yaml
vendored
2
.github/ansible/prod.us-west-2.hosts.yaml
vendored
@@ -6,6 +6,8 @@ storage:
|
||||
broker_endpoint: http://storage-broker-lb.eta.us-west-2.internal.aws.neon.tech:50051
|
||||
pageserver_config_stub:
|
||||
pg_distrib_dir: /usr/local
|
||||
metric_collection_endpoint: http://console-release.local/billing/api/v1/usage_events
|
||||
metric_collection_interval: 10min
|
||||
remote_storage:
|
||||
bucket_name: "{{ bucket_name }}"
|
||||
bucket_region: "{{ bucket_region }}"
|
||||
|
||||
2
.github/ansible/production.hosts.yaml
vendored
2
.github/ansible/production.hosts.yaml
vendored
@@ -7,6 +7,8 @@ storage:
|
||||
broker_endpoint: http://storage-broker.prod.local:50051
|
||||
pageserver_config_stub:
|
||||
pg_distrib_dir: /usr/local
|
||||
metric_collection_endpoint: http://console-staging.local/billing/api/v1/usage_events
|
||||
metric_collection_interval: 10min
|
||||
remote_storage:
|
||||
bucket_name: "{{ bucket_name }}"
|
||||
bucket_region: "{{ bucket_region }}"
|
||||
|
||||
@@ -10,6 +10,8 @@ settings:
|
||||
domain: "*.eu-west-1.aws.neon.build"
|
||||
sentryEnvironment: "development"
|
||||
wssPort: 8443
|
||||
metricCollectionEndpoint: "http://console-staging.local/billing/api/v1/usage_events"
|
||||
metricCollectionInterval: "1min"
|
||||
|
||||
# -- Additional labels for neon-proxy pods
|
||||
podLabels:
|
||||
|
||||
@@ -9,6 +9,8 @@ settings:
|
||||
authEndpoint: "https://console.stage.neon.tech/authenticate_proxy_request/"
|
||||
uri: "https://console.stage.neon.tech/psql_session/"
|
||||
sentryEnvironment: "development"
|
||||
metricCollectionEndpoint: "http://console-staging.local/billing/api/v1/usage_events"
|
||||
metricCollectionInterval: "1min"
|
||||
|
||||
# -- Additional labels for neon-proxy-link pods
|
||||
podLabels:
|
||||
|
||||
@@ -10,6 +10,8 @@ settings:
|
||||
domain: "*.cloud.stage.neon.tech"
|
||||
sentryEnvironment: "development"
|
||||
wssPort: 8443
|
||||
metricCollectionEndpoint: "http://console-staging.local/billing/api/v1/usage_events"
|
||||
metricCollectionInterval: "1min"
|
||||
|
||||
# -- Additional labels for neon-proxy pods
|
||||
podLabels:
|
||||
|
||||
@@ -10,6 +10,8 @@ settings:
|
||||
domain: "*.us-east-2.aws.neon.build"
|
||||
sentryEnvironment: "development"
|
||||
wssPort: 8443
|
||||
metricCollectionEndpoint: "http://console-staging.local/billing/api/v1/usage_events"
|
||||
metricCollectionInterval: "1min"
|
||||
|
||||
# -- Additional labels for neon-proxy pods
|
||||
podLabels:
|
||||
|
||||
@@ -10,6 +10,8 @@ settings:
|
||||
domain: "*.ap-southeast-1.aws.neon.tech"
|
||||
sentryEnvironment: "production"
|
||||
wssPort: 8443
|
||||
metricCollectionEndpoint: "http://console-release.local/billing/api/v1/usage_events"
|
||||
metricCollectionInterval: "10min"
|
||||
|
||||
# -- Additional labels for neon-proxy pods
|
||||
podLabels:
|
||||
|
||||
@@ -10,6 +10,8 @@ settings:
|
||||
domain: "*.eu-central-1.aws.neon.tech"
|
||||
sentryEnvironment: "production"
|
||||
wssPort: 8443
|
||||
metricCollectionEndpoint: "http://console-release.local/billing/api/v1/usage_events"
|
||||
metricCollectionInterval: "10min"
|
||||
|
||||
# -- Additional labels for neon-proxy pods
|
||||
podLabels:
|
||||
|
||||
@@ -10,6 +10,8 @@ settings:
|
||||
domain: "*.us-east-2.aws.neon.tech"
|
||||
sentryEnvironment: "production"
|
||||
wssPort: 8443
|
||||
metricCollectionEndpoint: "http://console-release.local/billing/api/v1/usage_events"
|
||||
metricCollectionInterval: "10min"
|
||||
|
||||
# -- Additional labels for neon-proxy pods
|
||||
podLabels:
|
||||
|
||||
@@ -10,6 +10,8 @@ settings:
|
||||
domain: "*.us-west-2.aws.neon.tech"
|
||||
sentryEnvironment: "production"
|
||||
wssPort: 8443
|
||||
metricCollectionEndpoint: "http://console-release.local/billing/api/v1/usage_events"
|
||||
metricCollectionInterval: "10min"
|
||||
|
||||
# -- Additional labels for neon-proxy pods
|
||||
podLabels:
|
||||
|
||||
@@ -4,6 +4,8 @@ settings:
|
||||
domain: "*.cloud.neon.tech"
|
||||
sentryEnvironment: "production"
|
||||
wssPort: 8443
|
||||
metricCollectionEndpoint: "http://console-release.local/billing/api/v1/usage_events"
|
||||
metricCollectionInterval: "10min"
|
||||
|
||||
podLabels:
|
||||
zenith_service: proxy-scram
|
||||
|
||||
105
.github/workflows/benchmarking.yml
vendored
105
.github/workflows/benchmarking.yml
vendored
@@ -489,3 +489,108 @@ jobs:
|
||||
slack-message: "Periodic TPC-H perf testing ${{ matrix.platform }}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
|
||||
env:
|
||||
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
|
||||
|
||||
user-examples-compare:
|
||||
if: success() || failure()
|
||||
needs: [ tpch-compare ]
|
||||
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
# neon-captest-prefetch: We have pre-created projects with prefetch enabled
|
||||
# rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs
|
||||
# rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage
|
||||
platform: [ neon-captest-prefetch, rds-postgres, rds-aurora ]
|
||||
|
||||
env:
|
||||
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
|
||||
DEFAULT_PG_VERSION: 14
|
||||
TEST_OUTPUT: /tmp/test_output
|
||||
BUILD_TYPE: remote
|
||||
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref == 'refs/heads/main' ) }}
|
||||
PLATFORM: ${{ matrix.platform }}
|
||||
|
||||
runs-on: [ self-hosted, us-east-2, x64 ]
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
||||
options: --init
|
||||
|
||||
timeout-minutes: 360 # 6h
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
|
||||
- name: Download Neon artifact
|
||||
uses: ./.github/actions/download
|
||||
with:
|
||||
name: neon-${{ runner.os }}-release-artifact
|
||||
path: /tmp/neon/
|
||||
prefix: latest
|
||||
|
||||
- name: Add Postgres binaries to PATH
|
||||
run: |
|
||||
${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version
|
||||
echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH
|
||||
|
||||
- name: Set up Connection String
|
||||
id: set-up-connstr
|
||||
run: |
|
||||
case "${PLATFORM}" in
|
||||
neon-captest-prefetch)
|
||||
CONNSTR=${{ secrets.BENCHMARK_USER_EXAMPLE_CAPTEST_CONNSTR }}
|
||||
;;
|
||||
rds-aurora)
|
||||
CONNSTR=${{ secrets.BENCHMARK_USER_EXAMPLE_RDS_AURORA_CONNSTR }}
|
||||
;;
|
||||
rds-postgres)
|
||||
CONNSTR=${{ secrets.BENCHMARK_USER_EXAMPLE_RDS_POSTGRES_CONNSTR }}
|
||||
;;
|
||||
*)
|
||||
# Report the unknown platform on stderr (>&2) so it is not mixed into stdout.
echo >&2 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-prefetch', 'rds-aurora', or 'rds-postgres'"
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
|
||||
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
|
||||
|
||||
psql ${CONNSTR} -c "SELECT version();"
|
||||
|
||||
- name: Set database options
|
||||
if: matrix.platform == 'neon-captest-prefetch'
|
||||
run: |
|
||||
DB_NAME=$(psql ${BENCHMARK_CONNSTR} --no-align --quiet -t -c "SELECT current_database()")
|
||||
|
||||
psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET enable_seqscan_prefetch=on"
|
||||
psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET effective_io_concurrency=32"
|
||||
psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE ${DB_NAME} SET maintenance_io_concurrency=32"
|
||||
env:
|
||||
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
|
||||
|
||||
- name: Run user examples
|
||||
uses: ./.github/actions/run-python-test-set
|
||||
with:
|
||||
build_type: ${{ env.BUILD_TYPE }}
|
||||
test_selection: performance/test_perf_olap.py
|
||||
run_in_parallel: false
|
||||
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
|
||||
extra_params: -m remote_cluster --timeout 21600 -k test_user_examples
|
||||
env:
|
||||
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
|
||||
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
|
||||
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
|
||||
|
||||
- name: Create Allure report
|
||||
if: success() || failure()
|
||||
uses: ./.github/actions/allure-report
|
||||
with:
|
||||
action: generate
|
||||
build_type: ${{ env.BUILD_TYPE }}
|
||||
|
||||
- name: Post to a Slack channel
|
||||
if: ${{ github.event.schedule && failure() }}
|
||||
uses: slackapi/slack-github-action@v1
|
||||
with:
|
||||
channel-id: "C033QLM5P7D" # dev-staging-stream
|
||||
# This job runs the user-examples benchmark (test_user_examples), not TPC-H.
slack-message: "Periodic User example perf testing ${{ matrix.platform }}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
|
||||
env:
|
||||
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
|
||||
|
||||
17
.github/workflows/build_and_test.yml
vendored
17
.github/workflows/build_and_test.yml
vendored
@@ -595,6 +595,8 @@ jobs:
|
||||
defaults:
|
||||
run:
|
||||
shell: sh -eu {0}
|
||||
env:
|
||||
VM_INFORMANT_VERSION: 0.1.1
|
||||
|
||||
steps:
|
||||
- name: Downloading latest vm-builder
|
||||
@@ -606,9 +608,22 @@ jobs:
|
||||
run: |
|
||||
docker pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
|
||||
|
||||
- name: Downloading VM informant version ${{ env.VM_INFORMANT_VERSION }}
|
||||
run: |
|
||||
curl -fL https://github.com/neondatabase/autoscaling/releases/download/${{ env.VM_INFORMANT_VERSION }}/vm-informant -o vm-informant
|
||||
chmod +x vm-informant
|
||||
|
||||
- name: Adding VM informant to compute-node image
|
||||
run: |
|
||||
ID=$(docker create 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}})
|
||||
docker cp vm-informant $ID:/bin/vm-informant
|
||||
docker commit $ID temp-vm-compute-node
|
||||
docker rm -f $ID
|
||||
|
||||
- name: Build vm image
|
||||
run: |
|
||||
./vm-builder -src=369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} -dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
|
||||
# note: as of 2023-01-12, vm-builder requires a trailing ":latest" for local images
|
||||
./vm-builder -src=temp-vm-compute-node:latest -dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
|
||||
|
||||
- name: Pushing vm-compute-node image
|
||||
run: |
|
||||
|
||||
531
Cargo.lock
generated
531
Cargo.lock
generated
File diff suppressed because it is too large
Load Diff
154
Cargo.toml
154
Cargo.toml
@@ -1,14 +1,3 @@
|
||||
# 'named-profiles' feature was stabilized in cargo 1.57. This line makes the
|
||||
# build work with older cargo versions.
|
||||
#
|
||||
# We have this because as of this writing, the latest cargo Debian package
|
||||
# that's available is 1.56. (Confusingly, the Debian package version number
|
||||
# is 0.57, whereas 'cargo --version' says 1.56.)
|
||||
#
|
||||
# See https://tracker.debian.org/pkg/cargo for the current status of the
|
||||
# package. When that gets updated, we can remove this.
|
||||
cargo-features = ["named-profiles"]
|
||||
|
||||
[workspace]
|
||||
members = [
|
||||
"compute_tools",
|
||||
@@ -21,6 +10,143 @@ members = [
|
||||
"libs/*",
|
||||
]
|
||||
|
||||
[workspace.package]
|
||||
edition = "2021"
|
||||
license = "Apache-2.0"
|
||||
|
||||
## All dependency versions, used in the project
|
||||
[workspace.dependencies]
|
||||
anyhow = { version = "1.0", features = ["backtrace"] }
|
||||
async-stream = "0.3"
|
||||
async-trait = "0.1"
|
||||
atty = "0.2.14"
|
||||
aws-config = { version = "0.51.0", default-features = false, features=["rustls"] }
|
||||
aws-sdk-s3 = "0.21.0"
|
||||
aws-smithy-http = "0.51.0"
|
||||
aws-types = "0.51.0"
|
||||
base64 = "0.13.0"
|
||||
bincode = "1.3"
|
||||
bindgen = "0.61"
|
||||
bstr = "1.0"
|
||||
byteorder = "1.4"
|
||||
bytes = "1.0"
|
||||
chrono = { version = "0.4", default-features = false, features = ["clock"] }
|
||||
clap = "4.0"
|
||||
close_fds = "0.3.2"
|
||||
comfy-table = "6.1"
|
||||
const_format = "0.2"
|
||||
crc32c = "0.6"
|
||||
crossbeam-utils = "0.8.5"
|
||||
fail = "0.5.0"
|
||||
fs2 = "0.4.3"
|
||||
futures = "0.3"
|
||||
futures-core = "0.3"
|
||||
futures-util = "0.3"
|
||||
git-version = "0.3"
|
||||
hashbrown = "0.13"
|
||||
hex = "0.4"
|
||||
hex-literal = "0.3"
|
||||
hmac = "0.12.1"
|
||||
hostname = "0.3.1"
|
||||
humantime = "2.1"
|
||||
humantime-serde = "1.1.1"
|
||||
hyper = "0.14"
|
||||
hyper-tungstenite = "0.9"
|
||||
itertools = "0.10"
|
||||
jsonwebtoken = "8"
|
||||
libc = "0.2"
|
||||
md5 = "0.7.0"
|
||||
memoffset = "0.8"
|
||||
nix = "0.26"
|
||||
notify = "5.0.0"
|
||||
num-traits = "0.2.15"
|
||||
once_cell = "1.13"
|
||||
parking_lot = "0.12"
|
||||
pin-project-lite = "0.2"
|
||||
prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency
|
||||
prost = "0.11"
|
||||
rand = "0.8"
|
||||
regex = "1.4"
|
||||
reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] }
|
||||
routerify = "3"
|
||||
rstar = "0.9.3"
|
||||
rustls = "0.20"
|
||||
rustls-pemfile = "1"
|
||||
rustls-split = "0.3"
|
||||
scopeguard = "1.1"
|
||||
sentry = { version = "0.29", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] }
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
serde_json = "1"
|
||||
serde_with = "2.0"
|
||||
sha2 = "0.10.2"
|
||||
signal-hook = "0.3"
|
||||
socket2 = "0.4.4"
|
||||
strum = "0.24"
|
||||
strum_macros = "0.24"
|
||||
svg_fmt = "0.4.1"
|
||||
tar = "0.4"
|
||||
thiserror = "1.0"
|
||||
tls-listener = { version = "0.6", features = ["rustls", "hyper-h1"] }
|
||||
tokio = { version = "1.17", features = ["macros"] }
|
||||
tokio-postgres-rustls = "0.9.0"
|
||||
tokio-rustls = "0.23"
|
||||
tokio-stream = "0.1"
|
||||
tokio-util = { version = "0.7", features = ["io"] }
|
||||
toml = "0.5"
|
||||
toml_edit = { version = "0.17", features = ["easy"] }
|
||||
tonic = {version = "0.8", features = ["tls", "tls-roots"]}
|
||||
tracing = "0.1"
|
||||
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
|
||||
url = "2.2"
|
||||
uuid = { version = "1.2", features = ["v4", "serde"] }
|
||||
walkdir = "2.3.2"
|
||||
webpki-roots = "0.22.5"
|
||||
x509-parser = "0.14"
|
||||
|
||||
## TODO replace this with tracing
|
||||
env_logger = "0.10"
|
||||
log = "0.4"
|
||||
|
||||
## TODO switch when the new release is made
|
||||
amplify_num = { git = "https://github.com/rust-amplify/rust-amplify.git", tag = "v4.0.0-beta.1" }
|
||||
|
||||
## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
|
||||
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
|
||||
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
|
||||
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
|
||||
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
|
||||
tokio-tar = { git = "https://github.com/neondatabase/tokio-tar.git", rev="404df61437de0feef49ba2ccdbdd94eb8ad6e142" }
|
||||
|
||||
## Local libraries
|
||||
consumption_metrics = { version = "0.1", path = "./libs/consumption_metrics/" }
|
||||
metrics = { version = "0.1", path = "./libs/metrics/" }
|
||||
pageserver_api = { version = "0.1", path = "./libs/pageserver_api/" }
|
||||
postgres_connection = { version = "0.1", path = "./libs/postgres_connection/" }
|
||||
postgres_ffi = { version = "0.1", path = "./libs/postgres_ffi/" }
|
||||
pq_proto = { version = "0.1", path = "./libs/pq_proto/" }
|
||||
remote_storage = { version = "0.1", path = "./libs/remote_storage/" }
|
||||
safekeeper_api = { version = "0.1", path = "./libs/safekeeper_api" }
|
||||
storage_broker = { version = "0.1", path = "./storage_broker/" } # Note: main broker code is inside the binary crate, so linking with the library shouldn't be heavy.
|
||||
tenant_size_model = { version = "0.1", path = "./libs/tenant_size_model/" }
|
||||
utils = { version = "0.1", path = "./libs/utils/" }
|
||||
|
||||
## Common library dependency
|
||||
workspace_hack = { version = "0.1", path = "./workspace_hack/" }
|
||||
|
||||
## Build dependencies
|
||||
criterion = "0.4"
|
||||
rcgen = "0.10"
|
||||
rstest = "0.16"
|
||||
tempfile = "3.2"
|
||||
tonic-build = "0.8"
|
||||
|
||||
# This is only needed for proxy's tests.
|
||||
# TODO: we should probably fork `tokio-postgres-rustls` instead.
|
||||
[patch.crates-io]
|
||||
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
|
||||
|
||||
################# Binary contents sections
|
||||
|
||||
[profile.release]
|
||||
# This is useful for profiling and, to some extent, debug.
|
||||
# Besides, debug info should not affect the performance.
|
||||
@@ -81,9 +207,3 @@ inherits = "release"
|
||||
debug = false # true = 2 = all symbols, 1 = line only
|
||||
opt-level = "z"
|
||||
lto = true
|
||||
|
||||
|
||||
# This is only needed for proxy's tests.
|
||||
# TODO: we should probably fork `tokio-postgres-rustls` instead.
|
||||
[patch.crates-io]
|
||||
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
|
||||
|
||||
@@ -29,7 +29,13 @@ RUN cd postgres && \
|
||||
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \
|
||||
# Install headers
|
||||
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/include install && \
|
||||
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/interfaces/libpq install
|
||||
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/interfaces/libpq install && \
|
||||
# Enable some of contrib extensions
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/bloom.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrowlocks.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/intagg.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgstattuple.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/earthdistance.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
@@ -55,7 +61,9 @@ RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.1.tar.gz && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_raster.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_tiger_geocoder.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_topology.control
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_topology.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/address_standardizer.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/address_standardizer_data_us.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
|
||||
@@ -29,7 +29,13 @@ RUN cd postgres && \
|
||||
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \
|
||||
# Install headers
|
||||
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/include install && \
|
||||
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/interfaces/libpq install
|
||||
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/interfaces/libpq install && \
|
||||
# Enable some of contrib extensions
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/bloom.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrowlocks.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/intagg.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgstattuple.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/earthdistance.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
@@ -55,7 +61,9 @@ RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.1.tar.gz && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_raster.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_tiger_geocoder.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_topology.control
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_topology.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/address_standardizer.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/address_standardizer_data_us.control
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
|
||||
@@ -1,24 +1,25 @@
|
||||
[package]
|
||||
name = "compute_tools"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
license = "Apache-2.0"
|
||||
edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
anyhow = "1.0"
|
||||
chrono = { version = "0.4", default-features = false, features = ["clock"] }
|
||||
clap = "4.0"
|
||||
env_logger = "0.9"
|
||||
futures = "0.3.13"
|
||||
hyper = { version = "0.14", features = ["full"] }
|
||||
log = { version = "0.4", features = ["std", "serde"] }
|
||||
notify = "5.0.0"
|
||||
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
|
||||
regex = "1"
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
serde_json = "1"
|
||||
tar = "0.4"
|
||||
tokio = { version = "1.17", features = ["macros", "rt", "rt-multi-thread"] }
|
||||
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
|
||||
url = "2.2.2"
|
||||
workspace_hack = { version = "0.1", path = "../workspace_hack" }
|
||||
anyhow.workspace = true
|
||||
chrono.workspace = true
|
||||
clap.workspace = true
|
||||
env_logger.workspace = true
|
||||
futures.workspace = true
|
||||
hyper = { workspace = true, features = ["full"] }
|
||||
log = { workspace = true, features = ["std", "serde"] }
|
||||
notify.workspace = true
|
||||
postgres.workspace = true
|
||||
regex.workspace = true
|
||||
serde.workspace = true
|
||||
serde_json.workspace = true
|
||||
tar.workspace = true
|
||||
tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
|
||||
tokio-postgres.workspace = true
|
||||
url.workspace = true
|
||||
|
||||
workspace_hack.workspace = true
|
||||
|
||||
@@ -19,6 +19,10 @@ Also `compute_ctl` spawns two separate service threads:
|
||||
- `http-endpoint` runs a Hyper HTTP API server, which serves readiness and the
|
||||
last activity requests.
|
||||
|
||||
If the `vm-informant` binary is present at `/bin/vm-informant`, it will also be started. For VM
|
||||
compute nodes, `vm-informant` communicates with the VM autoscaling system. It coordinates
|
||||
downscaling and (eventually) will request immediate upscaling under resource pressure.
|
||||
|
||||
Usage example:
|
||||
```sh
|
||||
compute_ctl -D /var/db/postgres/compute \
|
||||
|
||||
@@ -18,6 +18,10 @@
|
||||
//! - `http-endpoint` runs a Hyper HTTP API server, which serves readiness and the
|
||||
//! last activity requests.
|
||||
//!
|
||||
//! If the `vm-informant` binary is present at `/bin/vm-informant`, it will also be started. For VM
|
||||
//! compute nodes, `vm-informant` communicates with the VM autoscaling system. It coordinates
|
||||
//! downscaling and (eventually) will request immediate upscaling under resource pressure.
|
||||
//!
|
||||
//! Usage example:
|
||||
//! ```sh
|
||||
//! compute_ctl -D /var/db/postgres/compute \
|
||||
@@ -40,6 +44,7 @@ use log::{error, info};
|
||||
|
||||
use compute_tools::compute::{ComputeMetrics, ComputeNode, ComputeState, ComputeStatus};
|
||||
use compute_tools::http::api::launch_http_server;
|
||||
use compute_tools::informant::spawn_vm_informant_if_present;
|
||||
use compute_tools::logger::*;
|
||||
use compute_tools::monitor::launch_monitor;
|
||||
use compute_tools::params::*;
|
||||
@@ -114,6 +119,8 @@ fn main() -> Result<()> {
|
||||
// requests, while configuration is still in progress.
|
||||
let _http_handle = launch_http_server(&compute).expect("cannot launch http endpoint thread");
|
||||
let _monitor_handle = launch_monitor(&compute).expect("cannot launch compute monitor thread");
|
||||
// Also spawn the thread responsible for handling the VM informant -- if it's present
|
||||
let _vm_informant_handle = spawn_vm_informant_if_present().expect("cannot launch VM informant");
|
||||
|
||||
// Run compute (Postgres) and hang waiting on it.
|
||||
match compute.prepare_and_run() {
|
||||
|
||||
50
compute_tools/src/informant.rs
Normal file
50
compute_tools/src/informant.rs
Normal file
@@ -0,0 +1,50 @@
|
||||
use log::{info, warn};
|
||||
use std::path::Path;
|
||||
use std::process;
|
||||
use std::thread;
|
||||
use std::time::Duration;
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
|
||||
/// Filesystem path that is checked for, and used to launch, the VM informant binary.
const VM_INFORMANT_PATH: &str = "/bin/vm-informant";
|
||||
/// Delay, in milliseconds, between the informant process ending and the next launch attempt.
const RESTART_INFORMANT_AFTER_MILLIS: u64 = 5000;
|
||||
|
||||
/// Launch a thread to start the VM informant if it's present (and restart, on failure)
///
/// Returns `Ok(None)` when nothing exists at `VM_INFORMANT_PATH`, and
/// `Ok(Some(handle))` with the handle of the spawned `run-vm-informant` thread otherwise.
///
/// # Errors
///
/// Fails if the existence check on `VM_INFORMANT_PATH` itself errors, or if the
/// thread cannot be spawned.
|
||||
pub fn spawn_vm_informant_if_present() -> Result<Option<thread::JoinHandle<()>>> {
|
||||
// `try_exists` reports an I/O failure of the check as an error, which is propagated here.
let exists = Path::new(VM_INFORMANT_PATH)
|
||||
.try_exists()
|
||||
.context("could not check if path exists")?;
|
||||
|
||||
// No informant binary: nothing to run, and this is not treated as an error.
if !exists {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
// `run_informant` loops forever, so the spawned thread is not expected to finish.
Ok(Some(
|
||||
thread::Builder::new()
|
||||
.name("run-vm-informant".into())
|
||||
.spawn(move || run_informant())?,
|
||||
))
|
||||
}
|
||||
|
||||
/// Run the VM informant binary in an endless restart loop; never returns.
///
/// Each iteration launches `VM_INFORMANT_PATH`, waits for it to end, logs the
/// outcome, then sleeps for `RESTART_INFORMANT_AFTER_MILLIS` milliseconds before
/// launching it again.
fn run_informant() -> ! {
|
||||
let restart_wait = Duration::from_millis(RESTART_INFORMANT_AFTER_MILLIS);
|
||||
|
||||
// Logged once, before the loop; restarts are reported by the messages below.
info!("starting VM informant");
|
||||
|
||||
loop {
|
||||
// A fresh `Command` is built on every iteration; no arguments are passed.
let mut cmd = process::Command::new(VM_INFORMANT_PATH);
|
||||
// Block on subprocess:
|
||||
let result = cmd.status();
|
||||
|
||||
// Every outcome leads to a retry; the arms differ only in what is logged.
match result {
|
||||
// `status()` itself returned an error.
Err(e) => warn!("failed to run VM informant at {VM_INFORMANT_PATH:?}: {e}"),
|
||||
// The process ran but reported an unsuccessful exit status.
Ok(status) if !status.success() => {
|
||||
warn!("{VM_INFORMANT_PATH} exited with code {status:?}, retrying")
|
||||
}
|
||||
// A successful exit is still unexpected, so it is logged and retried too.
Ok(_) => info!("{VM_INFORMANT_PATH} ended gracefully (unexpectedly). Retrying"),
|
||||
}
|
||||
|
||||
// Wait before retrying
|
||||
thread::sleep(restart_wait);
|
||||
}
|
||||
}
|
||||
@@ -8,6 +8,7 @@ pub mod http;
|
||||
#[macro_use]
|
||||
pub mod logger;
|
||||
pub mod compute;
|
||||
pub mod informant;
|
||||
pub mod monitor;
|
||||
pub mod params;
|
||||
pub mod pg_helpers;
|
||||
|
||||
@@ -1,32 +1,31 @@
|
||||
[package]
|
||||
name = "control_plane"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
license = "Apache-2.0"
|
||||
edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
anyhow = "1.0"
|
||||
clap = "4.0"
|
||||
comfy-table = "6.1"
|
||||
git-version = "0.3.5"
|
||||
nix = "0.25"
|
||||
once_cell = "1.13.0"
|
||||
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "43e6db254a97fdecbce33d8bc0890accfd74495e" }
|
||||
regex = "1"
|
||||
reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] }
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
serde_with = "2.0"
|
||||
tar = "0.4.38"
|
||||
thiserror = "1"
|
||||
toml = "0.5"
|
||||
url = "2.2.2"
|
||||
|
||||
anyhow.workspace = true
|
||||
clap.workspace = true
|
||||
comfy-table.workspace = true
|
||||
git-version.workspace = true
|
||||
nix.workspace = true
|
||||
once_cell.workspace = true
|
||||
postgres.workspace = true
|
||||
regex.workspace = true
|
||||
reqwest = { workspace = true, features = ["blocking", "json"] }
|
||||
serde.workspace = true
|
||||
serde_with.workspace = true
|
||||
tar.workspace = true
|
||||
thiserror.workspace = true
|
||||
toml.workspace = true
|
||||
url.workspace = true
|
||||
# Note: Do not directly depend on pageserver or safekeeper; use pageserver_api or safekeeper_api
|
||||
# instead, so that recompile times are better.
|
||||
pageserver_api = { path = "../libs/pageserver_api" }
|
||||
postgres_connection = { path = "../libs/postgres_connection" }
|
||||
safekeeper_api = { path = "../libs/safekeeper_api" }
|
||||
# Note: main broker code is inside the binary crate, so linking with the library shouldn't be heavy.
|
||||
storage_broker = { version = "0.1", path = "../storage_broker" }
|
||||
utils = { path = "../libs/utils" }
|
||||
workspace_hack = { version = "0.1", path = "../workspace_hack" }
|
||||
pageserver_api.workspace = true
|
||||
safekeeper_api.workspace = true
|
||||
postgres_connection.workspace = true
|
||||
storage_broker.workspace = true
|
||||
utils.workspace = true
|
||||
|
||||
workspace_hack.workspace = true
|
||||
|
||||
@@ -52,7 +52,7 @@ name = "ring"
|
||||
version = "*"
|
||||
expression = "MIT AND ISC AND OpenSSL"
|
||||
license-files = [
|
||||
{ path = "LICENSE", hash = 0xbd0eed23 },
|
||||
{ path = "LICENSE", hash = 0xbd0eed23 }
|
||||
]
|
||||
|
||||
[licenses.private]
|
||||
|
||||
115
docs/consumption_metrics.md
Normal file
115
docs/consumption_metrics.md
Normal file
@@ -0,0 +1,115 @@
|
||||
### Overview
|
||||
Pageserver and proxy periodically collect consumption metrics and push them to a HTTP endpoint.
|
||||
|
||||
This doc describes current implementation details.
|
||||
For design details see [the RFC](./rfcs/021-metering.md) and [the discussion on Github](https://github.com/neondatabase/neon/pull/2884).
|
||||
|
||||
- The metrics are collected in a separate thread, and the collection interval and endpoint are configurable.
|
||||
|
||||
- Metrics are cached, so that we don't send unchanged metrics on every iteration.
|
||||
|
||||
- Metrics are sent in batches of 1000 (see CHUNK_SIZE const) metrics max with no particular grouping guarantees.
|
||||
|
||||
batch format is
|
||||
```json
|
||||
|
||||
{ "events" : [metric1, metric2, ...] }
|
||||
|
||||
```
|
||||
See metric format examples below.
|
||||
|
||||
- All metrics values are in bytes, unless otherwise specified.
|
||||
|
||||
- Currently no retries are implemented.
|
||||
|
||||
### Pageserver metrics
|
||||
|
||||
#### Configuration
|
||||
The endpoint and the collection interval are specified in the pageserver config file (or can be passed as command line arguments):
|
||||
`metric_collection_endpoint` defaults to None, which means that metric collection is disabled by default.
|
||||
`metric_collection_interval` defaults to 10min
|
||||
|
||||
#### Metrics
|
||||
|
||||
Currently, the following metrics are collected:
|
||||
|
||||
- `written_size`
|
||||
|
||||
Amount of WAL produced by a timeline, i.e. last_record_lsn
|
||||
This is an absolute, per-timeline metric.
|
||||
|
||||
- `resident_size`
|
||||
|
||||
Size of all the layer files in the tenant's directory on disk on the pageserver.
|
||||
This is an absolute, per-tenant metric.
|
||||
|
||||
- `remote_storage_size`
|
||||
|
||||
Size of the remote storage (S3) directory.
|
||||
This is an absolute, per-tenant metric.
|
||||
|
||||
- `timeline_logical_size`
|
||||
Logical size of the data in the timeline
|
||||
This is an absolute, per-timeline metric.
|
||||
|
||||
- `synthetic_storage_size`
|
||||
Size of all tenant's branches including WAL
|
||||
This is the same metric that `tenant/{tenant_id}/size` endpoint returns.
|
||||
This is an absolute, per-tenant metric.
|
||||
|
||||
Synthetic storage size is calculated in a separate thread, so it might be slightly outdated.
|
||||
|
||||
#### Format example
|
||||
|
||||
```json
|
||||
{
|
||||
"metric": "remote_storage_size",
|
||||
"type": "absolute",
|
||||
"time": "2022-12-28T11:07:19.317310284Z",
|
||||
"idempotency_key": "2022-12-28 11:07:19.317310324 UTC-1-4019",
|
||||
"value": 12345454,
|
||||
"tenant_id": "5d07d9ce9237c4cd845ea7918c0afa7d",
|
||||
"timeline_id": "a03ebb4f5922a1c56ff7485cc8854143",
|
||||
}
|
||||
```
|
||||
|
||||
`idempotency_key` is a unique key for each metric, so that we can deduplicate metrics.
|
||||
It is a combination of the time, node_id and a random number.
|
||||
|
||||
### Proxy consumption metrics
|
||||
|
||||
#### Configuration
|
||||
The endpoint and the collection interval can be passed as command line arguments for proxy:
|
||||
`metric_collection_endpoint` no default, which means that metric collection is disabled by default.
|
||||
`metric_collection_interval` no default
|
||||
|
||||
#### Metrics
|
||||
|
||||
Currently, only one proxy metric is collected:
|
||||
|
||||
- `proxy_io_bytes_per_client`
|
||||
Outbound traffic per client.
|
||||
This is an incremental, per-endpoint metric.
|
||||
|
||||
#### Format example
|
||||
|
||||
```json
|
||||
{
|
||||
"metric": "proxy_io_bytes_per_client",
|
||||
"type": "incremental",
|
||||
"start_time": "2022-12-28T11:07:19.317310284Z",
|
||||
"stop_time": "2022-12-28T11:07:19.317310284Z",
|
||||
"idempotency_key": "2022-12-28 11:07:19.317310324 UTC-1-4019",
|
||||
"value": 12345454,
|
||||
"endpoint_id": "5d07d9ce9237c4cd845ea7918c0afa7d",
|
||||
}
|
||||
```
|
||||
|
||||
The metric is incremental, so the value is the difference between the current and the previous value.
|
||||
If there is no previous value, the value is the current value and the `start_time` equals `stop_time`.
|
||||
|
||||
### TODO
|
||||
|
||||
- [ ] Handle errors better: currently if one tenant fails to gather metrics, the whole iteration fails and metrics are not sent for any tenant.
|
||||
- [ ] Add retries
|
||||
- [ ] Tune the interval
|
||||
186
docs/rfcs/021-metering.md
Normal file
186
docs/rfcs/021-metering.md
Normal file
@@ -0,0 +1,186 @@
|
||||
# Consumption tracking
|
||||
|
||||
|
||||
# Goals
|
||||
|
||||
This proposal is made with two mostly but not entirely overlapping goals:
|
||||
|
||||
* Collect info that is needed for consumption-based billing
|
||||
* Cross-check AWS bills
|
||||
|
||||
|
||||
# Metrics
|
||||
|
||||
There are six metrics to collect:
|
||||
|
||||
* CPU time. Wall clock seconds * the current number of cores. We have a fixed ratio of memory to cores, so the current memory size is the function of the number of cores. Measured per each `endpoint`.
|
||||
|
||||
* Traffic. In/out traffic on the proxy. Measured per each `endpoint`.
|
||||
|
||||
* Written size. Amount of data we write. That is different from both traffic and storage size, as only during the writing we
|
||||
|
||||
a) occupy some disk bandwidth on safekeepers
|
||||
|
||||
b) necessarily cross AZ boundaries delivering WAL to all safekeepers
|
||||
|
||||
Each timeline/branch has at most one writer, so the data is collected per branch.
|
||||
|
||||
* Synthetic storage size. That is what is exposed now with pageserver's `/v1/tenant/{}/size`. Looks like now it is per-tenant. (Side note: can we make it per branch to show as branch physical size in UI?)
|
||||
|
||||
* Real storage size. That is the size of the tenant directory on the pageserver's disk. Per-tenant.
|
||||
|
||||
* S3 storage size. That is the size of the tenant data on S3. Per-tenant.
|
||||
|
||||
That info should be enough to build an internal model that predicts AWS price (hence tracking `written data` and `real storage size`). As for the billing model we probably can get away with mentioning only `CPU time`, `synthetic storage size`, and `traffic` consumption.
|
||||
|
||||
# Services participating in metrics collection
|
||||
|
||||
## Proxy
|
||||
|
||||
For actual implementation details check `/docs/consumption_metrics.md`
|
||||
|
||||
Proxy is the only place that knows about traffic flow, so it tracks it and reports it with quite a small interval, let's say 1 minute. A small interval is needed here since the proxy is stateless, and any restart will reset accumulated consumption. Also proxy should report deltas since the last report, not an absolute value of the counter. Such kind of events is easier to integrate over a period of time to get the amount of traffic during some time interval.
|
||||
|
||||
Example event:
|
||||
|
||||
```json
|
||||
{
|
||||
"metric": "proxy_io_bytes_per_client",
|
||||
"type": "incremental",
|
||||
"start_time": "2022-12-28T11:07:19.317310284Z",
|
||||
"stop_time": "2022-12-28T11:07:19.317310284Z",
|
||||
"idempotency_key": "2022-12-28 11:07:19.317310324 UTC-1-4019",
|
||||
"value": 12345454,
|
||||
"endpoint_id": "5d07d9ce9237c4cd845ea7918c0afa7d",
|
||||
}
|
||||
```
|
||||
|
||||
Since we report deltas over some period of time, it makes sense to include `event_start_time`/`event_stop_time` where `event_start_time` is the time of the previous report. That will allow us to identify metering gaps better (e.g., failed send/delivery).
|
||||
|
||||
When there is no active connection proxy can avoid reporting anything. Also, deltas are additive, so several console instances serving the same user and endpoint can report traffic without coordination.
|
||||
|
||||
## Console
|
||||
|
||||
The console knows about start/stop events, so it knows the amount of CPU time allocated to each endpoint. It also knows about operation successes and failures and can avoid billing clients after unsuccessful 'suspend' events. The console doesn't know the current compute size within the allowed limits on the endpoint. So with CPU time, we do the following:
|
||||
|
||||
* While we don't yet have the autoscaling console can report `cpu time` as the number of seconds since the last `start_compute` event.
|
||||
|
||||
* When we have autoscaling, `autoscaler-agent` can report `cpu time`*`compute_units_count` in the same increments as the proxy reports traffic.
|
||||
|
||||
Example event:
|
||||
|
||||
```json
|
||||
{
|
||||
"metric": "effective_compute_seconds",
|
||||
"type": "incremental",
|
||||
"endpoint_id": "blazing-warrior-34",
|
||||
"event_start_time": ...,
|
||||
"event_stop_time": ...,
|
||||
"value": 12345454,
|
||||
}
|
||||
```
|
||||
|
||||
I'd also suggest reporting one value, `cpu time`*`compute_units_count`, instead of two separate fields as it makes event schema simpler (it is possible to treat it the same way as traffic) and preserves additivity.
|
||||
|
||||
## Pageserver
|
||||
|
||||
For actual implementation details check `/docs/consumption_metrics.md`
|
||||
|
||||
Pageserver knows / has access to / can calculate the rest of the metrics:
|
||||
|
||||
* Written size -- that is basically `last_received_lsn`,
|
||||
* Synthetic storage size -- there is a way to calculate it, albeit a costly one,
|
||||
* Real storage size -- there is a way to calculate it using a layer map or filesystem,
|
||||
* S3 storage size -- can calculate it by S3 API calls
|
||||
|
||||
Some of those metrics are expensive to calculate, so the reporting period here is driven mainly by implementation details. We can set it to, for example, once per hour. Not a big deal since the pageserver is stateful, and all metrics can be reported as an absolute value, not increments. At the same time, a smaller reporting period improves UX, so it would be good to have something more real-time.
|
||||
|
||||
`written size` is primarily a safekeeper-related metric, but since it is available on both pageserver and safekeeper, we can avoid reporting anything from the safekeeper.
|
||||
|
||||
Example event:
|
||||
|
||||
```json
|
||||
{
|
||||
"metric": "remote_storage_size",
|
||||
"type": "absolute",
|
||||
"time": "2022-12-28T11:07:19.317310284Z",
|
||||
"idempotency_key": "2022-12-28 11:07:19.317310324 UTC-1-4019",
|
||||
"value": 12345454,
|
||||
"tenant_id": "5d07d9ce9237c4cd845ea7918c0afa7d",
|
||||
"timeline_id": "a03ebb4f5922a1c56ff7485cc8854143",
|
||||
}
|
||||
```
|
||||
|
||||
# Data collection
|
||||
|
||||
## Push vs. pull
|
||||
|
||||
We already have pull-based Prometheus metrics, so it is tempting to use them here too. However, in our setup, it is hard to tell when some metric changes. For example, garbage collection will constantly free some disk space over a week, even if the project is down for that week. We could also iterate through all existing tenants/branches/endpoints, but that means some amount of code to do that properly and most likely we will end up with some per-metric hacks in the collector to cut out some of the tenants that are surely not changing that metric.
|
||||
|
||||
With the push model, it is easier to publish data only about actively changing metrics -- pageserver knows when it performs s3 offloads, garbage collection and starts/stops consuming data from the safekeeper; proxy knows about connected clients; console / autoscaler-agent knows about active cpu time.
|
||||
|
||||
Hence, let's go with a push-based model.
|
||||
|
||||
## Common bus vs. proxying through the console
|
||||
|
||||
We can implement such push systems in a few ways:
|
||||
|
||||
a. Each component pushes its metrics to the "common bus", namely segment, Kafka, or something similar. That approach scales well, but it would be harder to test it locally, will introduce new dependencies, we will have to distribute secrets for that connection to all of the components, etc. We would also have to loop back some of the events and their aggregates to the console, as we want to show some of those metrics to the user in real time.
|
||||
|
||||
b. Each component can call HTTP `POST` with its events to the console, and the console can forward them to the segment for later integration with metronome / orb / onebill / etc. With that approach, only the console has to speak with segment. Also since that data passes through the console, the console can save the latest metrics values, so there is no need for constant feedback of those events back from the segment.
|
||||
|
||||
# Implementation
|
||||
|
||||
Each (proxy|pageserver|autoscaler-agent) sends consumption events to the single endpoint in the console:
|
||||
|
||||
```json
|
||||
POST /usage_events HTTP/1.1
|
||||
Content-Type: application/json
|
||||
|
||||
[
|
||||
{
|
||||
"metric": "remote_storage_size",
|
||||
"type": "absolute",
|
||||
"time": "2022-12-28T11:07:19.317310284Z",
|
||||
"idempotency_key": "2022-12-28 11:07:19.317310324 UTC-1-4019",
|
||||
"value": 12345454,
|
||||
"tenant_id": "5d07d9ce9237c4cd845ea7918c0afa7d",
|
||||
"timeline_id": "a03ebb4f5922a1c56ff7485cc8854143",
|
||||
},
|
||||
...
|
||||
]
|
||||
```
|
||||
|
||||

|
||||
|
||||
Events could be either:
|
||||
* `incremental` -- change in consumption since the previous event or service restart. That is `effective_cpu_seconds`, `traffic_in_bytes`, and `traffic_out_bytes`.
|
||||
* `absolute` -- that is the current value of a metric. All of the size-related metrics are absolute.
|
||||
|
||||
Each service can post events at its own pace and bundle together data from different tenants/endpoints.
|
||||
|
||||
The console algorithm upon receipt of events could be the following:
|
||||
|
||||
1. Create and send a segment event with the same content (possibly enriching it with tenant/timeline data for endpoint-based events).
|
||||
2. Update the latest state of per-tenant and per-endpoint metrics in the database.
|
||||
3. Check whether any of those metrics is above the allowed threshold and stop the project if necessary.
|
||||
|
||||
Since all the data comes in batches, we can do the batch update to reduce the number of queries in the database. Proxy traffic is probably the most frequent metric, so with batching, we will have extra `number_of_proxies` requests to the database each minute. This is most likely fine for now but will generate many dead tuples in the console database. If that is the case, we can change step 2 to the following:
|
||||
|
||||
2.1. Check if there is a $tenant_$metric / $endpoint_$metric key in Redis
|
||||
|
||||
2.2. If no stored value is found and the metric is incremental, then fetch the current value from DWH (which keeps aggregated value for all the events) and publish it.
|
||||
|
||||
2.3. Publish a new value (absolute metric) or add an increment to the stored value (incremental metric)
|
||||
|
||||
## Consumption watchdog
|
||||
|
||||
Since all the data goes through the console, we don't have to run any background thread/coroutines to check whether consumption is within the allowed limits. We only change consumption with `POST /usage_events`, so limit checks could be applied in the same handler.
|
||||
|
||||
## Extensibility
|
||||
|
||||
If we need to add a new metric (e.g. s3 traffic or something else), the console code should, by default, process it and publish segment event, even if the metric name is unknown to the console.
|
||||
|
||||
## Naming & schema
|
||||
|
||||
Each metric name should end up with units -- now `_seconds` and `_bytes`, and segment event should always have `tenant_id` and `timeline_id`/`endpoint_id` where applicable.
|
||||
BIN
docs/rfcs/images/metering.jpg
Normal file
BIN
docs/rfcs/images/metering.jpg
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 232 KiB |
@@ -18,10 +18,6 @@ Intended to be used in integration tests and in CLI tools for local installation
|
||||
Documentation of the Neon features and concepts.
|
||||
Now it is mostly dev documentation.
|
||||
|
||||
`/monitoring`:
|
||||
|
||||
TODO
|
||||
|
||||
`/pageserver`:
|
||||
|
||||
Neon storage service.
|
||||
@@ -98,6 +94,13 @@ cargo hakari manage-deps
|
||||
|
||||
If you don't have hakari installed (`error: no such subcommand: hakari`), install it by running `cargo install cargo-hakari`.
|
||||
|
||||
### Checking Rust 3rd-parties
|
||||
[Cargo deny](https://embarkstudios.github.io/cargo-deny/index.html) is a cargo plugin that lets us lint project's dependency graph to ensure all dependencies conform to requirements. It detects security issues, matches licenses, and ensures crates only come from trusted sources.
|
||||
|
||||
```bash
|
||||
cargo deny check
|
||||
```
|
||||
|
||||
## Using Python
|
||||
Note that Debian/Ubuntu Python packages are stale, as it commonly happens,
|
||||
so manual installation of dependencies is not recommended.
|
||||
|
||||
16
libs/consumption_metrics/Cargo.toml
Normal file
16
libs/consumption_metrics/Cargo.toml
Normal file
@@ -0,0 +1,16 @@
|
||||
[package]
|
||||
name = "consumption_metrics"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
license = "Apache-2.0"
|
||||
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
|
||||
[dependencies]
|
||||
anyhow = "1.0.68"
|
||||
chrono = { version = "0.4", default-features = false, features = ["clock", "serde"] }
|
||||
rand = "0.8.3"
|
||||
serde = "1.0.152"
|
||||
serde_with = "2.1.0"
|
||||
utils = { version = "0.1.0", path = "../utils" }
|
||||
workspace_hack = { version = "0.1.0", path = "../../workspace_hack" }
|
||||
50
libs/consumption_metrics/src/lib.rs
Normal file
50
libs/consumption_metrics/src/lib.rs
Normal file
@@ -0,0 +1,50 @@
|
||||
//!
|
||||
//! Shared code for consumption metrics collection
|
||||
//!
|
||||
use chrono::{DateTime, Utc};
|
||||
use rand::Rng;
|
||||
use serde::Serialize;
|
||||
|
||||
#[derive(Serialize, Debug, Clone, Eq, PartialEq, Ord, PartialOrd)]
|
||||
#[serde(tag = "type")]
|
||||
pub enum EventType {
|
||||
#[serde(rename = "absolute")]
|
||||
Absolute { time: DateTime<Utc> },
|
||||
#[serde(rename = "incremental")]
|
||||
Incremental {
|
||||
start_time: DateTime<Utc>,
|
||||
stop_time: DateTime<Utc>,
|
||||
},
|
||||
}
|
||||
|
||||
#[derive(Serialize, Debug, Clone, Eq, PartialEq, Ord, PartialOrd)]
|
||||
pub struct Event<Extra> {
|
||||
#[serde(flatten)]
|
||||
#[serde(rename = "type")]
|
||||
pub kind: EventType,
|
||||
|
||||
pub metric: &'static str,
|
||||
pub idempotency_key: String,
|
||||
pub value: u64,
|
||||
|
||||
#[serde(flatten)]
|
||||
pub extra: Extra,
|
||||
}
|
||||
|
||||
pub fn idempotency_key(node_id: String) -> String {
|
||||
format!(
|
||||
"{}-{}-{:04}",
|
||||
Utc::now(),
|
||||
node_id,
|
||||
rand::thread_rng().gen_range(0..=9999)
|
||||
)
|
||||
}
|
||||
|
||||
pub const CHUNK_SIZE: usize = 1000;
|
||||
|
||||
// Just a wrapper around a slice of events
|
||||
// to serialize it as `{"events" : [ ] }`
|
||||
#[derive(serde::Serialize)]
|
||||
pub struct EventChunk<'a, T> {
|
||||
pub events: &'a [T],
|
||||
}
|
||||
@@ -1,11 +1,12 @@
|
||||
[package]
|
||||
name = "metrics"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
license = "Apache-2.0"
|
||||
edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency
|
||||
libc = "0.2"
|
||||
once_cell = "1.13.0"
|
||||
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
|
||||
prometheus.workspace = true
|
||||
libc.workspace = true
|
||||
once_cell.workspace = true
|
||||
|
||||
workspace_hack.workspace = true
|
||||
|
||||
@@ -1,17 +1,17 @@
|
||||
[package]
|
||||
name = "pageserver_api"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
license = "Apache-2.0"
|
||||
edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
serde_with = "2.0"
|
||||
const_format = "0.2.21"
|
||||
anyhow = { version = "1.0", features = ["backtrace"] }
|
||||
bytes = "1.0.1"
|
||||
byteorder = "1.4.3"
|
||||
serde.workspace = true
|
||||
serde_with.workspace = true
|
||||
const_format.workspace = true
|
||||
anyhow.workspace = true
|
||||
bytes.workspace = true
|
||||
byteorder.workspace = true
|
||||
utils.workspace = true
|
||||
postgres_ffi.workspace = true
|
||||
|
||||
utils = { path = "../utils" }
|
||||
postgres_ffi = { path = "../postgres_ffi" }
|
||||
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
|
||||
workspace_hack.workspace = true
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use std::num::NonZeroU64;
|
||||
use std::num::{NonZeroU64, NonZeroUsize};
|
||||
|
||||
use byteorder::{BigEndian, ReadBytesExt};
|
||||
use serde::{Deserialize, Serialize};
|
||||
@@ -210,6 +210,11 @@ pub struct TimelineInfo {
|
||||
pub state: TimelineState,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
pub struct DownloadRemoteLayersTaskSpawnRequest {
|
||||
pub max_concurrent_downloads: NonZeroUsize,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize, Clone)]
|
||||
pub struct DownloadRemoteLayersTaskInfo {
|
||||
pub task_id: String,
|
||||
|
||||
@@ -1,18 +1,17 @@
|
||||
[package]
|
||||
name = "postgres_connection"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
license = "Apache-2.0"
|
||||
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
anyhow = "1.0"
|
||||
itertools = "0.10.3"
|
||||
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "43e6db254a97fdecbce33d8bc0890accfd74495e" }
|
||||
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
|
||||
url = "2.2.2"
|
||||
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
|
||||
anyhow.workspace = true
|
||||
itertools.workspace = true
|
||||
postgres.workspace = true
|
||||
tokio-postgres.workspace = true
|
||||
url.workspace = true
|
||||
|
||||
workspace_hack.workspace = true
|
||||
|
||||
[dev-dependencies]
|
||||
once_cell = "1.13.0"
|
||||
once_cell.workspace = true
|
||||
|
||||
@@ -1,30 +1,31 @@
|
||||
[package]
|
||||
name = "postgres_ffi"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
license = "Apache-2.0"
|
||||
edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
rand = "0.8.3"
|
||||
regex = "1.4.5"
|
||||
bytes = "1.0.1"
|
||||
byteorder = "1.4.3"
|
||||
anyhow = "1.0"
|
||||
crc32c = "0.6.0"
|
||||
hex = "0.4.3"
|
||||
once_cell = "1.13.0"
|
||||
log = "0.4.14"
|
||||
memoffset = "0.7"
|
||||
thiserror = "1.0"
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
utils = { path = "../utils" }
|
||||
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
|
||||
rand.workspace = true
|
||||
regex.workspace = true
|
||||
bytes.workspace = true
|
||||
byteorder.workspace = true
|
||||
anyhow.workspace = true
|
||||
crc32c.workspace = true
|
||||
hex.workspace = true
|
||||
once_cell.workspace = true
|
||||
log.workspace = true
|
||||
memoffset.workspace = true
|
||||
thiserror.workspace = true
|
||||
serde.workspace = true
|
||||
utils.workspace = true
|
||||
|
||||
workspace_hack.workspace = true
|
||||
|
||||
[dev-dependencies]
|
||||
env_logger = "0.9"
|
||||
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
|
||||
env_logger.workspace = true
|
||||
postgres.workspace = true
|
||||
wal_craft = { path = "wal_craft" }
|
||||
|
||||
[build-dependencies]
|
||||
anyhow = "1.0"
|
||||
bindgen = "0.61"
|
||||
anyhow.workspace = true
|
||||
bindgen.workspace = true
|
||||
|
||||
@@ -1,17 +1,17 @@
|
||||
[package]
|
||||
name = "wal_craft"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
license = "Apache-2.0"
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
anyhow = "1.0"
|
||||
clap = "4.0"
|
||||
env_logger = "0.9"
|
||||
log = "0.4"
|
||||
once_cell = "1.13.0"
|
||||
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
|
||||
postgres_ffi = { path = "../" }
|
||||
tempfile = "3.2"
|
||||
workspace_hack = { version = "0.1", path = "../../../workspace_hack" }
|
||||
anyhow.workspace = true
|
||||
clap.workspace = true
|
||||
env_logger.workspace = true
|
||||
log.workspace = true
|
||||
once_cell.workspace = true
|
||||
postgres.workspace = true
|
||||
postgres_ffi.workspace = true
|
||||
tempfile.workspace = true
|
||||
|
||||
workspace_hack.workspace = true
|
||||
|
||||
@@ -1,18 +1,18 @@
|
||||
[package]
|
||||
name = "pq_proto"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
license = "Apache-2.0"
|
||||
edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
anyhow = "1.0"
|
||||
bytes = "1.0.1"
|
||||
pin-project-lite = "0.2.7"
|
||||
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
|
||||
rand = "0.8.3"
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
tokio = { version = "1.17", features = ["macros"] }
|
||||
tracing = "0.1"
|
||||
thiserror = "1.0"
|
||||
anyhow.workspace = true
|
||||
bytes.workspace = true
|
||||
pin-project-lite.workspace = true
|
||||
postgres-protocol.workspace = true
|
||||
rand.workspace = true
|
||||
serde.workspace = true
|
||||
tokio.workspace = true
|
||||
tracing.workspace = true
|
||||
thiserror.workspace = true
|
||||
|
||||
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
|
||||
workspace_hack.workspace = true
|
||||
|
||||
@@ -1,28 +1,28 @@
|
||||
[package]
|
||||
name = "remote_storage"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
license = "Apache-2.0"
|
||||
edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
anyhow = { version = "1.0", features = ["backtrace"] }
|
||||
async-trait = "0.1"
|
||||
metrics = { version = "0.1", path = "../metrics" }
|
||||
utils = { version = "0.1", path = "../utils" }
|
||||
once_cell = "1.13.0"
|
||||
aws-smithy-http = "0.51.0"
|
||||
aws-types = "0.51.0"
|
||||
aws-config = { version = "0.51.0", default-features = false, features=["rustls"] }
|
||||
aws-sdk-s3 = "0.21.0"
|
||||
hyper = { version = "0.14", features = ["stream"] }
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
serde_json = "1"
|
||||
tokio = { version = "1.17", features = ["sync", "macros", "fs", "io-util"] }
|
||||
tokio-util = { version = "0.7", features = ["io"] }
|
||||
toml_edit = { version = "0.14", features = ["easy"] }
|
||||
tracing = "0.1.27"
|
||||
anyhow.workspace = true
|
||||
async-trait.workspace = true
|
||||
once_cell.workspace = true
|
||||
aws-smithy-http.workspace = true
|
||||
aws-types.workspace = true
|
||||
aws-config.workspace = true
|
||||
aws-sdk-s3.workspace = true
|
||||
hyper = { workspace = true, features = ["stream"] }
|
||||
serde.workspace = true
|
||||
serde_json.workspace = true
|
||||
tokio = { workspace = true, features = ["sync", "fs", "io-util"] }
|
||||
tokio-util.workspace = true
|
||||
toml_edit.workspace = true
|
||||
tracing.workspace = true
|
||||
metrics.workspace = true
|
||||
utils.workspace = true
|
||||
|
||||
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
|
||||
workspace_hack.workspace = true
|
||||
|
||||
[dev-dependencies]
|
||||
tempfile = "3.2"
|
||||
tempfile.workspace = true
|
||||
|
||||
@@ -1,13 +1,13 @@
|
||||
[package]
|
||||
name = "safekeeper_api"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
license = "Apache-2.0"
|
||||
edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
serde_with = "2.0"
|
||||
const_format = "0.2.21"
|
||||
serde.workspace = true
|
||||
serde_with.workspace = true
|
||||
const_format.workspace = true
|
||||
utils.workspace = true
|
||||
|
||||
utils = { path = "../utils" }
|
||||
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
|
||||
workspace_hack.workspace = true
|
||||
|
||||
@@ -1,9 +1,11 @@
|
||||
[package]
|
||||
name = "tenant_size_model"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
edition.workspace = true
|
||||
publish = false
|
||||
license = "Apache-2.0"
|
||||
license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
|
||||
anyhow.workspace = true
|
||||
|
||||
workspace_hack.workspace = true
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
use std::borrow::Cow;
|
||||
use std::collections::HashMap;
|
||||
|
||||
use anyhow::Context;
|
||||
|
||||
/// Pricing model or history size builder.
|
||||
///
|
||||
/// Maintains knowledge of the branches and their modifications. Generic over the branch name key
|
||||
@@ -134,7 +136,7 @@ impl<K: std::hash::Hash + Eq + 'static> Storage<K> {
|
||||
size: Option<u64>,
|
||||
) where
|
||||
K: std::borrow::Borrow<Q>,
|
||||
Q: std::hash::Hash + Eq,
|
||||
Q: std::hash::Hash + Eq + std::fmt::Debug,
|
||||
{
|
||||
let lastseg_id = *self.branches.get(branch).unwrap();
|
||||
let newseg_id = self.segments.len();
|
||||
@@ -214,20 +216,24 @@ impl<K: std::hash::Hash + Eq + 'static> Storage<K> {
|
||||
}
|
||||
|
||||
/// Returns an error if the parent branch cannot be found.
|
||||
pub fn branch<Q: ?Sized>(&mut self, parent: &Q, name: K)
|
||||
pub fn branch<Q: ?Sized>(&mut self, parent: &Q, name: K) -> anyhow::Result<()>
|
||||
where
|
||||
K: std::borrow::Borrow<Q>,
|
||||
Q: std::hash::Hash + Eq,
|
||||
K: std::borrow::Borrow<Q> + std::fmt::Debug,
|
||||
Q: std::hash::Hash + Eq + std::fmt::Debug,
|
||||
{
|
||||
// Find the right segment
|
||||
let branchseg_id = *self
|
||||
.branches
|
||||
.get(parent)
|
||||
.expect("should had found the parent by key");
|
||||
let branchseg_id = *self.branches.get(parent).with_context(|| {
|
||||
format!(
|
||||
"should had found the parent {:?} by key. in branches {:?}",
|
||||
parent, self.branches
|
||||
)
|
||||
})?;
|
||||
|
||||
let _branchseg = &mut self.segments[branchseg_id];
|
||||
|
||||
// Create branch name for it
|
||||
self.branches.insert(name, branchseg_id);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn calculate(&mut self, retention_period: u64) -> SegmentSize {
|
||||
|
||||
@@ -38,7 +38,7 @@ fn scenario_2() -> (Vec<Segment>, SegmentSize) {
|
||||
}
|
||||
|
||||
// Branch
|
||||
storage.branch("main", "child");
|
||||
storage.branch("main", "child").unwrap();
|
||||
storage.update("child", 1_000);
|
||||
|
||||
// More updates on parent
|
||||
@@ -63,7 +63,7 @@ fn scenario_3() -> (Vec<Segment>, SegmentSize) {
|
||||
}
|
||||
|
||||
// Branch
|
||||
storage.branch("main", "child");
|
||||
storage.branch("main", "child").unwrap();
|
||||
storage.update("child", 1_000);
|
||||
|
||||
// More updates on parent
|
||||
@@ -90,7 +90,7 @@ fn scenario_4() -> (Vec<Segment>, SegmentSize) {
|
||||
}
|
||||
|
||||
// Branch
|
||||
storage.branch("main", "child");
|
||||
storage.branch("main", "child").unwrap();
|
||||
storage.update("child", 1_000);
|
||||
|
||||
// More updates on parent
|
||||
@@ -106,10 +106,10 @@ fn scenario_4() -> (Vec<Segment>, SegmentSize) {
|
||||
fn scenario_5() -> (Vec<Segment>, SegmentSize) {
|
||||
let mut storage = Storage::new("a");
|
||||
storage.insert("a", 5000);
|
||||
storage.branch("a", "b");
|
||||
storage.branch("a", "b").unwrap();
|
||||
storage.update("b", 4000);
|
||||
storage.update("a", 2000);
|
||||
storage.branch("a", "c");
|
||||
storage.branch("a", "c").unwrap();
|
||||
storage.insert("c", 4000);
|
||||
storage.insert("a", 2000);
|
||||
|
||||
@@ -133,12 +133,12 @@ fn scenario_6() -> (Vec<Segment>, SegmentSize) {
|
||||
|
||||
let mut storage = Storage::new(None);
|
||||
|
||||
storage.branch(&None, branches[0]); // at 0
|
||||
storage.branch(&None, branches[0]).unwrap(); // at 0
|
||||
storage.modify_branch(&branches[0], NO_OP, 108951064, 43696128); // at 108951064
|
||||
storage.branch(&branches[0], branches[1]); // at 108951064
|
||||
storage.branch(&branches[0], branches[1]).unwrap(); // at 108951064
|
||||
storage.modify_branch(&branches[1], NO_OP, 15560408, -1851392); // at 124511472
|
||||
storage.modify_branch(&branches[0], NO_OP, 174464360, -1531904); // at 283415424
|
||||
storage.branch(&branches[0], branches[2]); // at 283415424
|
||||
storage.branch(&branches[0], branches[2]).unwrap(); // at 283415424
|
||||
storage.modify_branch(&branches[2], NO_OP, 15906192, 8192); // at 299321616
|
||||
storage.modify_branch(&branches[0], NO_OP, 18909976, 32768); // at 302325400
|
||||
|
||||
|
||||
@@ -1,48 +1,49 @@
|
||||
[package]
|
||||
name = "utils"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
license = "Apache-2.0"
|
||||
edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
sentry = { version = "0.29.0", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] }
|
||||
async-trait = "0.1"
|
||||
anyhow = "1.0"
|
||||
bincode = "1.3"
|
||||
bytes = "1.0.1"
|
||||
hyper = { version = "0.14.7", features = ["full"] }
|
||||
routerify = "3"
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
serde_json = "1"
|
||||
thiserror = "1.0"
|
||||
tokio = { version = "1.17", features = ["macros"]}
|
||||
tokio-rustls = "0.23"
|
||||
tracing = "0.1"
|
||||
tracing-subscriber = { version = "0.3", features = ["env-filter", "json"] }
|
||||
nix = "0.25"
|
||||
signal-hook = "0.3.10"
|
||||
rand = "0.8.3"
|
||||
jsonwebtoken = "8"
|
||||
hex = { version = "0.4.3", features = ["serde"] }
|
||||
rustls = "0.20.2"
|
||||
rustls-split = "0.3.0"
|
||||
git-version = "0.3.5"
|
||||
serde_with = "2.0"
|
||||
once_cell = "1.13.0"
|
||||
strum = "0.24"
|
||||
strum_macros = "0.24"
|
||||
sentry.workspace = true
|
||||
async-trait.workspace = true
|
||||
anyhow.workspace = true
|
||||
bincode.workspace = true
|
||||
bytes.workspace = true
|
||||
hyper = { workspace = true, features = ["full"] }
|
||||
routerify.workspace = true
|
||||
serde.workspace = true
|
||||
serde_json.workspace = true
|
||||
thiserror.workspace = true
|
||||
tokio.workspace = true
|
||||
tokio-rustls.workspace = true
|
||||
tracing.workspace = true
|
||||
tracing-subscriber = { workspace = true, features = ["json"] }
|
||||
nix.workspace = true
|
||||
signal-hook.workspace = true
|
||||
rand.workspace = true
|
||||
jsonwebtoken.workspace = true
|
||||
hex = { workspace = true, features = ["serde"] }
|
||||
rustls.workspace = true
|
||||
rustls-split.workspace = true
|
||||
git-version.workspace = true
|
||||
serde_with.workspace = true
|
||||
once_cell.workspace = true
|
||||
strum.workspace = true
|
||||
strum_macros.workspace = true
|
||||
|
||||
metrics = { path = "../metrics" }
|
||||
pq_proto = { path = "../pq_proto" }
|
||||
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
|
||||
metrics.workspace = true
|
||||
pq_proto.workspace = true
|
||||
|
||||
workspace_hack.workspace = true
|
||||
|
||||
[dev-dependencies]
|
||||
byteorder = "1.4.3"
|
||||
bytes = "1.0.1"
|
||||
hex-literal = "0.3"
|
||||
tempfile = "3.2"
|
||||
criterion = "0.4"
|
||||
rustls-pemfile = "1"
|
||||
byteorder.workspace = true
|
||||
bytes.workspace = true
|
||||
hex-literal.workspace = true
|
||||
tempfile.workspace = true
|
||||
criterion.workspace = true
|
||||
rustls-pemfile.workspace = true
|
||||
|
||||
[[bench]]
|
||||
name = "benchmarks"
|
||||
|
||||
@@ -7,12 +7,12 @@ use crate::postgres_backend::AuthType;
|
||||
use anyhow::Context;
|
||||
use bytes::{Buf, Bytes, BytesMut};
|
||||
use pq_proto::{BeMessage, ConnectionError, FeMessage, FeStartupPacket, SQLSTATE_INTERNAL_ERROR};
|
||||
use std::future::Future;
|
||||
use std::io;
|
||||
use std::net::SocketAddr;
|
||||
use std::pin::Pin;
|
||||
use std::sync::Arc;
|
||||
use std::task::Poll;
|
||||
use std::{future::Future, task::ready};
|
||||
use tracing::{debug, error, info, trace};
|
||||
|
||||
use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt, BufReader};
|
||||
@@ -253,12 +253,9 @@ impl PostgresBackend {
|
||||
cx: &mut std::task::Context<'_>,
|
||||
) -> Poll<Result<(), std::io::Error>> {
|
||||
while self.buf_out.has_remaining() {
|
||||
match Pin::new(&mut self.stream).poll_write(cx, self.buf_out.chunk()) {
|
||||
Poll::Ready(Ok(bytes_written)) => {
|
||||
self.buf_out.advance(bytes_written);
|
||||
}
|
||||
Poll::Ready(Err(err)) => return Poll::Ready(Err(err)),
|
||||
Poll::Pending => return Poll::Pending,
|
||||
match ready!(Pin::new(&mut self.stream).poll_write(cx, self.buf_out.chunk())) {
|
||||
Ok(bytes_written) => self.buf_out.advance(bytes_written),
|
||||
Err(err) => return Poll::Ready(Err(err)),
|
||||
}
|
||||
}
|
||||
Poll::Ready(Ok(()))
|
||||
@@ -573,10 +570,9 @@ impl<'a> AsyncWrite for CopyDataWriter<'a> {
|
||||
// It's not strictly required to flush between each message, but makes it easier
|
||||
// to view in wireshark, and usually the messages that the callers write are
|
||||
// decently-sized anyway.
|
||||
match this.pgb.poll_write_buf(cx) {
|
||||
Poll::Ready(Ok(())) => {}
|
||||
Poll::Ready(Err(err)) => return Poll::Ready(Err(err)),
|
||||
Poll::Pending => return Poll::Pending,
|
||||
match ready!(this.pgb.poll_write_buf(cx)) {
|
||||
Ok(()) => {}
|
||||
Err(err) => return Poll::Ready(Err(err)),
|
||||
}
|
||||
|
||||
// CopyData
|
||||
@@ -593,10 +589,9 @@ impl<'a> AsyncWrite for CopyDataWriter<'a> {
|
||||
cx: &mut std::task::Context<'_>,
|
||||
) -> Poll<Result<(), std::io::Error>> {
|
||||
let this = self.get_mut();
|
||||
match this.pgb.poll_write_buf(cx) {
|
||||
Poll::Ready(Ok(())) => {}
|
||||
Poll::Ready(Err(err)) => return Poll::Ready(Err(err)),
|
||||
Poll::Pending => return Poll::Pending,
|
||||
match ready!(this.pgb.poll_write_buf(cx)) {
|
||||
Ok(()) => {}
|
||||
Err(err) => return Poll::Ready(Err(err)),
|
||||
}
|
||||
this.pgb.poll_flush(cx)
|
||||
}
|
||||
@@ -605,10 +600,9 @@ impl<'a> AsyncWrite for CopyDataWriter<'a> {
|
||||
cx: &mut std::task::Context<'_>,
|
||||
) -> Poll<Result<(), std::io::Error>> {
|
||||
let this = self.get_mut();
|
||||
match this.pgb.poll_write_buf(cx) {
|
||||
Poll::Ready(Ok(())) => {}
|
||||
Poll::Ready(Err(err)) => return Poll::Ready(Err(err)),
|
||||
Poll::Pending => return Poll::Pending,
|
||||
match ready!(this.pgb.poll_write_buf(cx)) {
|
||||
Ok(()) => {}
|
||||
Err(err) => return Poll::Ready(Err(err)),
|
||||
}
|
||||
this.pgb.poll_flush(cx)
|
||||
}
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
[package]
|
||||
name = "pageserver"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
license = "Apache-2.0"
|
||||
edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[features]
|
||||
default = []
|
||||
@@ -11,68 +11,68 @@ default = []
|
||||
testing = ["fail/failpoints"]
|
||||
|
||||
[dependencies]
|
||||
amplify_num = { git = "https://github.com/hlinnaka/rust-amplify.git", branch = "unsigned-int-perf" }
|
||||
anyhow = { version = "1.0", features = ["backtrace"] }
|
||||
async-stream = "0.3"
|
||||
async-trait = "0.1"
|
||||
byteorder = "1.4.3"
|
||||
bytes = "1.0.1"
|
||||
chrono = { version = "0.4.23", default-features = false, features = ["clock", "serde"] }
|
||||
clap = { version = "4.0", features = ["string"] }
|
||||
close_fds = "0.3.2"
|
||||
const_format = "0.2.21"
|
||||
crc32c = "0.6.0"
|
||||
crossbeam-utils = "0.8.5"
|
||||
fail = "0.5.0"
|
||||
futures = "0.3.13"
|
||||
git-version = "0.3.5"
|
||||
hex = "0.4.3"
|
||||
humantime = "2.1.0"
|
||||
humantime-serde = "1.1.1"
|
||||
hyper = "0.14"
|
||||
itertools = "0.10.3"
|
||||
nix = "0.25"
|
||||
num-traits = "0.2.15"
|
||||
once_cell = "1.13.0"
|
||||
pin-project-lite = "0.2.7"
|
||||
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
|
||||
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
|
||||
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
|
||||
rand = "0.8.3"
|
||||
regex = "1.4.5"
|
||||
rstar = "0.9.3"
|
||||
scopeguard = "1.1.0"
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
serde_json = { version = "1.0", features = ["raw_value"] }
|
||||
serde_with = "2.0"
|
||||
signal-hook = "0.3.10"
|
||||
svg_fmt = "0.4.1"
|
||||
tokio-tar = { git = "https://github.com/neondatabase/tokio-tar.git", rev="404df61437de0feef49ba2ccdbdd94eb8ad6e142" }
|
||||
thiserror = "1.0"
|
||||
tokio = { version = "1.17", features = ["process", "sync", "macros", "fs", "rt", "io-util", "time"] }
|
||||
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
|
||||
tokio-util = { version = "0.7.3", features = ["io", "io-util"] }
|
||||
toml_edit = { version = "0.14", features = ["easy"] }
|
||||
tracing = "0.1.36"
|
||||
url = "2"
|
||||
walkdir = "2.3.2"
|
||||
|
||||
metrics = { path = "../libs/metrics" }
|
||||
pageserver_api = { path = "../libs/pageserver_api" }
|
||||
postgres_connection = { path = "../libs/postgres_connection" }
|
||||
postgres_ffi = { path = "../libs/postgres_ffi" }
|
||||
pq_proto = { path = "../libs/pq_proto" }
|
||||
remote_storage = { path = "../libs/remote_storage" }
|
||||
storage_broker = { version = "0.1", path = "../storage_broker" }
|
||||
tenant_size_model = { path = "../libs/tenant_size_model" }
|
||||
utils = { path = "../libs/utils" }
|
||||
workspace_hack = { version = "0.1", path = "../workspace_hack" }
|
||||
reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] }
|
||||
amplify_num.workspace = true
|
||||
anyhow.workspace = true
|
||||
async-stream.workspace = true
|
||||
async-trait.workspace = true
|
||||
byteorder.workspace = true
|
||||
bytes.workspace = true
|
||||
chrono = { workspace = true, features = ["serde"] }
|
||||
clap = { workspace = true, features = ["string"] }
|
||||
close_fds.workspace = true
|
||||
const_format.workspace = true
|
||||
consumption_metrics.workspace = true
|
||||
crc32c.workspace = true
|
||||
crossbeam-utils.workspace = true
|
||||
fail.workspace = true
|
||||
futures.workspace = true
|
||||
git-version.workspace = true
|
||||
hex.workspace = true
|
||||
humantime.workspace = true
|
||||
humantime-serde.workspace = true
|
||||
hyper.workspace = true
|
||||
itertools.workspace = true
|
||||
nix.workspace = true
|
||||
num-traits.workspace = true
|
||||
once_cell.workspace = true
|
||||
pin-project-lite.workspace = true
|
||||
postgres.workspace = true
|
||||
postgres-protocol.workspace = true
|
||||
postgres-types.workspace = true
|
||||
rand.workspace = true
|
||||
regex.workspace = true
|
||||
rstar.workspace = true
|
||||
scopeguard.workspace = true
|
||||
serde.workspace = true
|
||||
serde_json = { workspace = true, features = ["raw_value"] }
|
||||
serde_with.workspace = true
|
||||
signal-hook.workspace = true
|
||||
svg_fmt.workspace = true
|
||||
tokio-tar.workspace = true
|
||||
thiserror.workspace = true
|
||||
tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] }
|
||||
tokio-postgres.workspace = true
|
||||
tokio-util.workspace = true
|
||||
toml_edit.workspace = true
|
||||
tracing.workspace = true
|
||||
url.workspace = true
|
||||
walkdir.workspace = true
|
||||
metrics.workspace = true
|
||||
pageserver_api.workspace = true
|
||||
postgres_connection.workspace = true
|
||||
postgres_ffi.workspace = true
|
||||
pq_proto.workspace = true
|
||||
remote_storage.workspace = true
|
||||
storage_broker.workspace = true
|
||||
tenant_size_model.workspace = true
|
||||
utils.workspace = true
|
||||
workspace_hack.workspace = true
|
||||
reqwest.workspace = true
|
||||
|
||||
[dev-dependencies]
|
||||
criterion = "0.4"
|
||||
hex-literal = "0.3"
|
||||
tempfile = "3.2"
|
||||
criterion.workspace = true
|
||||
hex-literal.workspace = true
|
||||
tempfile.workspace = true
|
||||
|
||||
[[bench]]
|
||||
name = "bench_layer_map"
|
||||
|
||||
@@ -30,33 +30,44 @@ fn redo_scenarios(c: &mut Criterion) {
|
||||
let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf());
|
||||
let conf = Box::leak(Box::new(conf));
|
||||
let tenant_id = TenantId::generate();
|
||||
// std::fs::create_dir_all(conf.tenant_path(&tenant_id)).unwrap();
|
||||
let mut manager = PostgresRedoManager::new(conf, tenant_id);
|
||||
manager.launch_process(14).unwrap();
|
||||
|
||||
let manager = PostgresRedoManager::new(conf, tenant_id);
|
||||
|
||||
let manager = Arc::new(manager);
|
||||
|
||||
tracing::info!("executing first");
|
||||
short().execute(&manager).unwrap();
|
||||
tracing::info!("first executed");
|
||||
|
||||
let thread_counts = [1, 2, 4, 8, 16];
|
||||
|
||||
for thread_count in thread_counts {
|
||||
c.bench_with_input(
|
||||
BenchmarkId::new("short-50record", thread_count),
|
||||
&thread_count,
|
||||
|b, thread_count| {
|
||||
add_multithreaded_walredo_requesters(b, *thread_count, &manager, short, 50);
|
||||
},
|
||||
);
|
||||
}
|
||||
let mut group = c.benchmark_group("short");
|
||||
group.sampling_mode(criterion::SamplingMode::Flat);
|
||||
|
||||
for thread_count in thread_counts {
|
||||
c.bench_with_input(
|
||||
BenchmarkId::new("medium-10record", thread_count),
|
||||
group.bench_with_input(
|
||||
BenchmarkId::new("short", thread_count),
|
||||
&thread_count,
|
||||
|b, thread_count| {
|
||||
add_multithreaded_walredo_requesters(b, *thread_count, &manager, medium, 10);
|
||||
add_multithreaded_walredo_requesters(b, *thread_count, &manager, short);
|
||||
},
|
||||
);
|
||||
}
|
||||
drop(group);
|
||||
|
||||
let mut group = c.benchmark_group("medium");
|
||||
group.sampling_mode(criterion::SamplingMode::Flat);
|
||||
|
||||
for thread_count in thread_counts {
|
||||
group.bench_with_input(
|
||||
BenchmarkId::new("medium", thread_count),
|
||||
&thread_count,
|
||||
|b, thread_count| {
|
||||
add_multithreaded_walredo_requesters(b, *thread_count, &manager, medium);
|
||||
},
|
||||
);
|
||||
}
|
||||
drop(group);
|
||||
}
|
||||
|
||||
/// Sets up `threads` number of requesters to `request_redo`, with the given input.
|
||||
@@ -65,46 +76,66 @@ fn add_multithreaded_walredo_requesters(
|
||||
threads: u32,
|
||||
manager: &Arc<PostgresRedoManager>,
|
||||
input_factory: fn() -> Request,
|
||||
request_repeats: usize,
|
||||
) {
|
||||
b.iter_batched_ref(
|
||||
|| {
|
||||
// barrier for all of the threads, and the benchmarked thread
|
||||
let barrier = Arc::new(Barrier::new(threads as usize + 1));
|
||||
assert_ne!(threads, 0);
|
||||
|
||||
let jhs = (0..threads)
|
||||
.map(|_| {
|
||||
std::thread::spawn({
|
||||
let manager = manager.clone();
|
||||
let barrier = barrier.clone();
|
||||
move || {
|
||||
let input = std::iter::repeat(input_factory())
|
||||
.take(request_repeats)
|
||||
.collect::<Vec<_>>();
|
||||
if threads == 1 {
|
||||
b.iter_batched_ref(
|
||||
|| Some(input_factory()),
|
||||
|input| execute_all(input.take(), manager),
|
||||
criterion::BatchSize::PerIteration,
|
||||
);
|
||||
} else {
|
||||
let (work_tx, work_rx) = std::sync::mpsc::sync_channel(threads as usize);
|
||||
|
||||
barrier.wait();
|
||||
let work_rx = std::sync::Arc::new(std::sync::Mutex::new(work_rx));
|
||||
|
||||
execute_all(input, &manager).unwrap();
|
||||
let barrier = Arc::new(Barrier::new(threads as usize + 1));
|
||||
|
||||
barrier.wait();
|
||||
let jhs = (0..threads)
|
||||
.map(|_| {
|
||||
std::thread::spawn({
|
||||
let manager = manager.clone();
|
||||
let barrier = barrier.clone();
|
||||
let work_rx = work_rx.clone();
|
||||
move || loop {
|
||||
// queue up and wait if we want to go another round
|
||||
if work_rx.lock().unwrap().recv().is_err() {
|
||||
break;
|
||||
}
|
||||
})
|
||||
|
||||
let input = Some(input_factory());
|
||||
|
||||
barrier.wait();
|
||||
|
||||
execute_all(input, &manager).unwrap();
|
||||
|
||||
barrier.wait();
|
||||
}
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
(barrier, JoinOnDrop(jhs))
|
||||
},
|
||||
|input| {
|
||||
let barrier = &input.0;
|
||||
let _jhs = JoinOnDrop(jhs);
|
||||
|
||||
// start the work
|
||||
barrier.wait();
|
||||
b.iter_batched(
|
||||
|| {
|
||||
for _ in 0..threads {
|
||||
work_tx.send(()).unwrap()
|
||||
}
|
||||
},
|
||||
|()| {
|
||||
// start the work
|
||||
barrier.wait();
|
||||
|
||||
// wait for work to complete
|
||||
barrier.wait();
|
||||
},
|
||||
criterion::BatchSize::PerIteration,
|
||||
);
|
||||
// wait for work to complete
|
||||
barrier.wait();
|
||||
},
|
||||
criterion::BatchSize::PerIteration,
|
||||
);
|
||||
|
||||
drop(work_tx);
|
||||
}
|
||||
}
|
||||
|
||||
struct JoinOnDrop(Vec<std::thread::JoinHandle<()>>);
|
||||
@@ -121,7 +152,10 @@ impl Drop for JoinOnDrop {
|
||||
}
|
||||
}
|
||||
|
||||
fn execute_all(input: Vec<Request>, manager: &PostgresRedoManager) -> Result<(), WalRedoError> {
|
||||
fn execute_all<I>(input: I, manager: &PostgresRedoManager) -> Result<(), WalRedoError>
|
||||
where
|
||||
I: IntoIterator<Item = Request>,
|
||||
{
|
||||
// just fire all requests as fast as possible
|
||||
input.into_iter().try_for_each(|req| {
|
||||
let page = req.execute(manager)?;
|
||||
@@ -143,6 +177,7 @@ macro_rules! lsn {
|
||||
}};
|
||||
}
|
||||
|
||||
/// Short payload, 1132 bytes.
|
||||
// pg_records are copypasted from log, where they are put with Debug impl of Bytes, which uses \0
|
||||
// for null bytes.
|
||||
#[allow(clippy::octal_escapes)]
|
||||
@@ -172,6 +207,7 @@ fn short() -> Request {
|
||||
}
|
||||
}
|
||||
|
||||
/// Medium sized payload, serializes as 26393 bytes.
|
||||
// see [`short`]
|
||||
#[allow(clippy::octal_escapes)]
|
||||
fn medium() -> Request {
|
||||
|
||||
BIN
pageserver/fixtures/short_v14_redo.page
Normal file
BIN
pageserver/fixtures/short_v14_redo.page
Normal file
Binary file not shown.
@@ -27,7 +27,7 @@ use tracing::*;
|
||||
///
|
||||
use tokio_tar::{Builder, EntryType, Header};
|
||||
|
||||
use crate::tenant::{with_ondemand_download, Timeline};
|
||||
use crate::tenant::Timeline;
|
||||
use pageserver_api::reltag::{RelTag, SlruKind};
|
||||
|
||||
use postgres_ffi::pg_constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID};
|
||||
@@ -171,30 +171,23 @@ where
|
||||
SlruKind::MultiXactOffsets,
|
||||
SlruKind::MultiXactMembers,
|
||||
] {
|
||||
for segno in
|
||||
with_ondemand_download(|| self.timeline.list_slru_segments(kind, self.lsn)).await?
|
||||
{
|
||||
for segno in self.timeline.list_slru_segments(kind, self.lsn).await? {
|
||||
self.add_slru_segment(kind, segno).await?;
|
||||
}
|
||||
}
|
||||
|
||||
// Create tablespace directories
|
||||
for ((spcnode, dbnode), has_relmap_file) in
|
||||
with_ondemand_download(|| self.timeline.list_dbdirs(self.lsn)).await?
|
||||
{
|
||||
for ((spcnode, dbnode), has_relmap_file) in self.timeline.list_dbdirs(self.lsn).await? {
|
||||
self.add_dbdir(spcnode, dbnode, has_relmap_file).await?;
|
||||
|
||||
// Gather and send relational files in each database if full backup is requested.
|
||||
if self.full_backup {
|
||||
for rel in
|
||||
with_ondemand_download(|| self.timeline.list_rels(spcnode, dbnode, self.lsn))
|
||||
.await?
|
||||
{
|
||||
for rel in self.timeline.list_rels(spcnode, dbnode, self.lsn).await? {
|
||||
self.add_rel(rel).await?;
|
||||
}
|
||||
}
|
||||
}
|
||||
for xid in with_ondemand_download(|| self.timeline.list_twophase_files(self.lsn)).await? {
|
||||
for xid in self.timeline.list_twophase_files(self.lsn).await? {
|
||||
self.add_twophase_file(xid).await?;
|
||||
}
|
||||
|
||||
@@ -210,8 +203,7 @@ where
|
||||
}
|
||||
|
||||
async fn add_rel(&mut self, tag: RelTag) -> anyhow::Result<()> {
|
||||
let nblocks =
|
||||
with_ondemand_download(|| self.timeline.get_rel_size(tag, self.lsn, false)).await?;
|
||||
let nblocks = self.timeline.get_rel_size(tag, self.lsn, false).await?;
|
||||
|
||||
// If the relation is empty, create an empty file
|
||||
if nblocks == 0 {
|
||||
@@ -229,11 +221,10 @@ where
|
||||
|
||||
let mut segment_data: Vec<u8> = vec![];
|
||||
for blknum in startblk..endblk {
|
||||
let img = with_ondemand_download(|| {
|
||||
self.timeline
|
||||
.get_rel_page_at_lsn(tag, blknum, self.lsn, false)
|
||||
})
|
||||
.await?;
|
||||
let img = self
|
||||
.timeline
|
||||
.get_rel_page_at_lsn(tag, blknum, self.lsn, false)
|
||||
.await?;
|
||||
segment_data.extend_from_slice(&img[..]);
|
||||
}
|
||||
|
||||
@@ -252,17 +243,17 @@ where
|
||||
// Generate SLRU segment files from repository.
|
||||
//
|
||||
async fn add_slru_segment(&mut self, slru: SlruKind, segno: u32) -> anyhow::Result<()> {
|
||||
let nblocks =
|
||||
with_ondemand_download(|| self.timeline.get_slru_segment_size(slru, segno, self.lsn))
|
||||
.await?;
|
||||
let nblocks = self
|
||||
.timeline
|
||||
.get_slru_segment_size(slru, segno, self.lsn)
|
||||
.await?;
|
||||
|
||||
let mut slru_buf: Vec<u8> = Vec::with_capacity(nblocks as usize * BLCKSZ as usize);
|
||||
for blknum in 0..nblocks {
|
||||
let img = with_ondemand_download(|| {
|
||||
self.timeline
|
||||
.get_slru_page_at_lsn(slru, segno, blknum, self.lsn)
|
||||
})
|
||||
.await?;
|
||||
let img = self
|
||||
.timeline
|
||||
.get_slru_page_at_lsn(slru, segno, blknum, self.lsn)
|
||||
.await?;
|
||||
|
||||
if slru == SlruKind::Clog {
|
||||
ensure!(img.len() == BLCKSZ as usize || img.len() == BLCKSZ as usize + 8);
|
||||
@@ -294,9 +285,10 @@ where
|
||||
has_relmap_file: bool,
|
||||
) -> anyhow::Result<()> {
|
||||
let relmap_img = if has_relmap_file {
|
||||
let img =
|
||||
with_ondemand_download(|| self.timeline.get_relmap_file(spcnode, dbnode, self.lsn))
|
||||
.await?;
|
||||
let img = self
|
||||
.timeline
|
||||
.get_relmap_file(spcnode, dbnode, self.lsn)
|
||||
.await?;
|
||||
ensure!(img.len() == 512);
|
||||
Some(img)
|
||||
} else {
|
||||
@@ -329,7 +321,9 @@ where
|
||||
// XLOG_TBLSPC_DROP records. But we probably should just
|
||||
// throw an error on CREATE TABLESPACE in the first place.
|
||||
if !has_relmap_file
|
||||
&& with_ondemand_download(|| self.timeline.list_rels(spcnode, dbnode, self.lsn))
|
||||
&& self
|
||||
.timeline
|
||||
.list_rels(spcnode, dbnode, self.lsn)
|
||||
.await?
|
||||
.is_empty()
|
||||
{
|
||||
@@ -362,7 +356,7 @@ where
|
||||
// Extract twophase state files
|
||||
//
|
||||
async fn add_twophase_file(&mut self, xid: TransactionId) -> anyhow::Result<()> {
|
||||
let img = with_ondemand_download(|| self.timeline.get_twophase_file(xid, self.lsn)).await?;
|
||||
let img = self.timeline.get_twophase_file(xid, self.lsn).await?;
|
||||
|
||||
let mut buf = BytesMut::new();
|
||||
buf.extend_from_slice(&img[..]);
|
||||
@@ -398,10 +392,14 @@ where
|
||||
)
|
||||
.await?;
|
||||
|
||||
let checkpoint_bytes = with_ondemand_download(|| self.timeline.get_checkpoint(self.lsn))
|
||||
let checkpoint_bytes = self
|
||||
.timeline
|
||||
.get_checkpoint(self.lsn)
|
||||
.await
|
||||
.context("failed to get checkpoint bytes")?;
|
||||
let pg_control_bytes = with_ondemand_download(|| self.timeline.get_control_file(self.lsn))
|
||||
let pg_control_bytes = self
|
||||
.timeline
|
||||
.get_control_file(self.lsn)
|
||||
.await
|
||||
.context("failed get control bytes")?;
|
||||
|
||||
|
||||
@@ -336,6 +336,7 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
|
||||
pageserver::consumption_metrics::collect_metrics(
|
||||
metric_collection_endpoint,
|
||||
conf.metric_collection_interval,
|
||||
conf.synthetic_size_calculation_interval,
|
||||
conf.id,
|
||||
)
|
||||
.instrument(info_span!("metrics_collection"))
|
||||
|
||||
@@ -59,6 +59,8 @@ pub mod defaults {
|
||||
|
||||
pub const DEFAULT_METRIC_COLLECTION_INTERVAL: &str = "10 min";
|
||||
pub const DEFAULT_METRIC_COLLECTION_ENDPOINT: Option<reqwest::Url> = None;
|
||||
pub const DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL: &str = "10 min";
|
||||
|
||||
///
|
||||
/// Default built-in configuration file.
|
||||
///
|
||||
@@ -83,6 +85,7 @@ pub mod defaults {
|
||||
#concurrent_tenant_size_logical_size_queries = '{DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES}'
|
||||
|
||||
#metric_collection_interval = '{DEFAULT_METRIC_COLLECTION_INTERVAL}'
|
||||
#synthetic_size_calculation_interval = '{DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL}'
|
||||
|
||||
# [tenant_config]
|
||||
#checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
|
||||
@@ -152,6 +155,7 @@ pub struct PageServerConf {
|
||||
// How often to collect metrics and send them to the metrics endpoint.
|
||||
pub metric_collection_interval: Duration,
|
||||
pub metric_collection_endpoint: Option<Url>,
|
||||
pub synthetic_size_calculation_interval: Duration,
|
||||
|
||||
pub test_remote_failures: u64,
|
||||
}
|
||||
@@ -215,6 +219,7 @@ struct PageServerConfigBuilder {
|
||||
|
||||
metric_collection_interval: BuilderValue<Duration>,
|
||||
metric_collection_endpoint: BuilderValue<Option<Url>>,
|
||||
synthetic_size_calculation_interval: BuilderValue<Duration>,
|
||||
|
||||
test_remote_failures: BuilderValue<u64>,
|
||||
}
|
||||
@@ -255,6 +260,10 @@ impl Default for PageServerConfigBuilder {
|
||||
DEFAULT_METRIC_COLLECTION_INTERVAL,
|
||||
)
|
||||
.expect("cannot parse default metric collection interval")),
|
||||
synthetic_size_calculation_interval: Set(humantime::parse_duration(
|
||||
DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL,
|
||||
)
|
||||
.expect("cannot parse default synthetic size calculation interval")),
|
||||
metric_collection_endpoint: Set(DEFAULT_METRIC_COLLECTION_ENDPOINT),
|
||||
|
||||
test_remote_failures: Set(0),
|
||||
@@ -342,6 +351,14 @@ impl PageServerConfigBuilder {
|
||||
self.metric_collection_endpoint = BuilderValue::Set(metric_collection_endpoint)
|
||||
}
|
||||
|
||||
pub fn synthetic_size_calculation_interval(
|
||||
&mut self,
|
||||
synthetic_size_calculation_interval: Duration,
|
||||
) {
|
||||
self.synthetic_size_calculation_interval =
|
||||
BuilderValue::Set(synthetic_size_calculation_interval)
|
||||
}
|
||||
|
||||
pub fn test_remote_failures(&mut self, fail_first: u64) {
|
||||
self.test_remote_failures = BuilderValue::Set(fail_first);
|
||||
}
|
||||
@@ -399,6 +416,9 @@ impl PageServerConfigBuilder {
|
||||
metric_collection_endpoint: self
|
||||
.metric_collection_endpoint
|
||||
.ok_or(anyhow!("missing metric_collection_endpoint"))?,
|
||||
synthetic_size_calculation_interval: self
|
||||
.synthetic_size_calculation_interval
|
||||
.ok_or(anyhow!("missing synthetic_size_calculation_interval"))?,
|
||||
test_remote_failures: self
|
||||
.test_remote_failures
|
||||
.ok_or(anyhow!("missing test_remote_failuers"))?,
|
||||
@@ -577,7 +597,8 @@ impl PageServerConf {
|
||||
let endpoint = parse_toml_string(key, item)?.parse().context("failed to parse metric_collection_endpoint")?;
|
||||
builder.metric_collection_endpoint(Some(endpoint));
|
||||
},
|
||||
|
||||
"synthetic_size_calculation_interval" =>
|
||||
builder.synthetic_size_calculation_interval(parse_toml_duration(key, item)?),
|
||||
"test_remote_failures" => builder.test_remote_failures(parse_toml_u64(key, item)?),
|
||||
_ => bail!("unrecognized pageserver option '{key}'"),
|
||||
}
|
||||
@@ -701,6 +722,7 @@ impl PageServerConf {
|
||||
concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
|
||||
metric_collection_interval: Duration::from_secs(60),
|
||||
metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT,
|
||||
synthetic_size_calculation_interval: Duration::from_secs(60),
|
||||
test_remote_failures: 0,
|
||||
}
|
||||
}
|
||||
@@ -834,6 +856,7 @@ id = 10
|
||||
|
||||
metric_collection_interval = '222 s'
|
||||
metric_collection_endpoint = 'http://localhost:80/metrics'
|
||||
synthetic_size_calculation_interval = '333 s'
|
||||
log_format = 'json'
|
||||
|
||||
"#;
|
||||
@@ -880,6 +903,9 @@ log_format = 'json'
|
||||
defaults::DEFAULT_METRIC_COLLECTION_INTERVAL
|
||||
)?,
|
||||
metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT,
|
||||
synthetic_size_calculation_interval: humantime::parse_duration(
|
||||
defaults::DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL
|
||||
)?,
|
||||
test_remote_failures: 0,
|
||||
},
|
||||
"Correct defaults should be used when no config values are provided"
|
||||
@@ -926,6 +952,7 @@ log_format = 'json'
|
||||
concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(),
|
||||
metric_collection_interval: Duration::from_secs(222),
|
||||
metric_collection_endpoint: Some(Url::parse("http://localhost:80/metrics")?),
|
||||
synthetic_size_calculation_interval: Duration::from_secs(333),
|
||||
test_remote_failures: 0,
|
||||
},
|
||||
"Should be able to parse all basic config values correctly"
|
||||
|
||||
@@ -3,154 +3,74 @@
|
||||
//! and push them to a HTTP endpoint.
|
||||
//! Cache metrics to send only the updated ones.
|
||||
//!
|
||||
|
||||
use anyhow;
|
||||
use tracing::*;
|
||||
use utils::id::NodeId;
|
||||
use utils::id::TimelineId;
|
||||
|
||||
use crate::task_mgr;
|
||||
use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
|
||||
use crate::tenant::mgr;
|
||||
use anyhow;
|
||||
use chrono::Utc;
|
||||
use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_SIZE};
|
||||
use pageserver_api::models::TenantState;
|
||||
use utils::id::TenantId;
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
use reqwest::Url;
|
||||
use serde::Serialize;
|
||||
use serde_with::{serde_as, DisplayFromStr};
|
||||
use std::collections::HashMap;
|
||||
use std::fmt;
|
||||
use std::str::FromStr;
|
||||
use std::time::Duration;
|
||||
use tracing::*;
|
||||
use utils::id::{NodeId, TenantId, TimelineId};
|
||||
|
||||
use chrono::{DateTime, Utc};
|
||||
use rand::Rng;
|
||||
use reqwest::Url;
|
||||
const WRITTEN_SIZE: &str = "written_size";
|
||||
const SYNTHETIC_STORAGE_SIZE: &str = "synthetic_storage_size";
|
||||
const RESIDENT_SIZE: &str = "resident_size";
|
||||
const REMOTE_STORAGE_SIZE: &str = "remote_storage_size";
|
||||
const TIMELINE_LOGICAL_SIZE: &str = "timeline_logical_size";
|
||||
|
||||
/// ConsumptionMetric struct that defines the format for one metric entry
|
||||
/// i.e.
|
||||
///
|
||||
/// ```json
|
||||
/// {
|
||||
/// "metric": "remote_storage_size",
|
||||
/// "type": "absolute",
|
||||
/// "tenant_id": "5d07d9ce9237c4cd845ea7918c0afa7d",
|
||||
/// "timeline_id": "a03ebb4f5922a1c56ff7485cc8854143",
|
||||
/// "time": "2022-12-28T11:07:19.317310284Z",
|
||||
/// "idempotency_key": "2022-12-28 11:07:19.317310324 UTC-1-4019",
|
||||
/// "value": 12345454,
|
||||
/// }
|
||||
/// ```
|
||||
#[serde_as]
|
||||
#[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq, Ord, PartialOrd)]
|
||||
pub struct ConsumptionMetric {
|
||||
pub metric: ConsumptionMetricKind,
|
||||
#[serde(rename = "type")]
|
||||
pub metric_type: &'static str,
|
||||
#[derive(Serialize)]
|
||||
struct Ids {
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub tenant_id: TenantId,
|
||||
tenant_id: TenantId,
|
||||
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub timeline_id: Option<TimelineId>,
|
||||
pub time: DateTime<Utc>,
|
||||
pub idempotency_key: String,
|
||||
pub value: u64,
|
||||
}
|
||||
|
||||
impl ConsumptionMetric {
|
||||
pub fn new_absolute<R: Rng + ?Sized>(
|
||||
metric: ConsumptionMetricKind,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: Option<TimelineId>,
|
||||
value: u64,
|
||||
node_id: NodeId,
|
||||
rng: &mut R,
|
||||
) -> Self {
|
||||
Self {
|
||||
metric,
|
||||
metric_type: "absolute",
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
time: Utc::now(),
|
||||
// key that allows metric collector to distinguish unique events
|
||||
idempotency_key: format!("{}-{}-{:04}", Utc::now(), node_id, rng.gen_range(0..=9999)),
|
||||
value,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Ord, PartialOrd, Serialize, Deserialize)]
|
||||
#[serde(rename_all = "snake_case")]
|
||||
pub enum ConsumptionMetricKind {
|
||||
/// Amount of WAL produced by a timeline, i.e. last_record_lsn
|
||||
/// This is an absolute, per-timeline metric.
|
||||
WrittenSize,
|
||||
/// Size of all tenant branches including WAL
|
||||
/// This is an absolute, per-tenant metric.
|
||||
/// This is the same metric that tenant/tenant_id/size endpoint returns.
|
||||
SyntheticStorageSize,
|
||||
/// Size of all the layer files in the tenant's directory on disk on the pageserver.
|
||||
/// This is an absolute, per-tenant metric.
|
||||
/// See also prometheus metric RESIDENT_PHYSICAL_SIZE.
|
||||
ResidentSize,
|
||||
/// Size of the remote storage (S3) directory.
|
||||
/// This is an absolute, per-tenant metric.
|
||||
RemoteStorageSize,
|
||||
/// Logical size of the data in the timeline
|
||||
/// This is an absolute, per-timeline metric
|
||||
TimelineLogicalSize,
|
||||
}
|
||||
|
||||
impl FromStr for ConsumptionMetricKind {
|
||||
type Err = anyhow::Error;
|
||||
|
||||
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
||||
match s {
|
||||
"written_size" => Ok(Self::WrittenSize),
|
||||
"synthetic_storage_size" => Ok(Self::SyntheticStorageSize),
|
||||
"resident_size" => Ok(Self::ResidentSize),
|
||||
"remote_storage_size" => Ok(Self::RemoteStorageSize),
|
||||
"timeline_logical_size" => Ok(Self::TimelineLogicalSize),
|
||||
_ => anyhow::bail!("invalid value \"{s}\" for metric type"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for ConsumptionMetricKind {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
f.write_str(match self {
|
||||
ConsumptionMetricKind::WrittenSize => "written_size",
|
||||
ConsumptionMetricKind::SyntheticStorageSize => "synthetic_storage_size",
|
||||
ConsumptionMetricKind::ResidentSize => "resident_size",
|
||||
ConsumptionMetricKind::RemoteStorageSize => "remote_storage_size",
|
||||
ConsumptionMetricKind::TimelineLogicalSize => "timeline_logical_size",
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
|
||||
pub struct ConsumptionMetricsKey {
|
||||
tenant_id: TenantId,
|
||||
timeline_id: Option<TimelineId>,
|
||||
metric: ConsumptionMetricKind,
|
||||
}
|
||||
|
||||
#[derive(serde::Serialize)]
|
||||
struct EventChunk<'a> {
|
||||
events: &'a [ConsumptionMetric],
|
||||
/// Key that uniquely identifies the object this metric describes.
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
|
||||
pub struct PageserverConsumptionMetricsKey {
|
||||
pub tenant_id: TenantId,
|
||||
pub timeline_id: Option<TimelineId>,
|
||||
pub metric: &'static str,
|
||||
}
|
||||
|
||||
/// Main thread that serves metrics collection
|
||||
pub async fn collect_metrics(
|
||||
metric_collection_endpoint: &Url,
|
||||
metric_collection_interval: Duration,
|
||||
synthetic_size_calculation_interval: Duration,
|
||||
node_id: NodeId,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut ticker = tokio::time::interval(metric_collection_interval);
|
||||
|
||||
info!("starting collect_metrics");
|
||||
|
||||
// spin up background worker that calculates tenant sizes
|
||||
task_mgr::spawn(
|
||||
BACKGROUND_RUNTIME.handle(),
|
||||
TaskKind::CalculateSyntheticSize,
|
||||
None,
|
||||
None,
|
||||
"synthetic size calculation",
|
||||
true,
|
||||
async move {
|
||||
calculate_synthetic_size_worker(synthetic_size_calculation_interval)
|
||||
.instrument(info_span!("synthetic_size_worker"))
|
||||
.await?;
|
||||
Ok(())
|
||||
},
|
||||
);
|
||||
|
||||
// define client here to reuse it for all requests
|
||||
let client = reqwest::Client::new();
|
||||
let mut cached_metrics: HashMap<ConsumptionMetricsKey, u64> = HashMap::new();
|
||||
let mut cached_metrics: HashMap<PageserverConsumptionMetricsKey, u64> = HashMap::new();
|
||||
|
||||
loop {
|
||||
tokio::select! {
|
||||
@@ -159,7 +79,10 @@ pub async fn collect_metrics(
|
||||
return Ok(());
|
||||
},
|
||||
_ = ticker.tick() => {
|
||||
collect_metrics_task(&client, &mut cached_metrics, metric_collection_endpoint, node_id).await?;
|
||||
if let Err(err) = collect_metrics_iteration(&client, &mut cached_metrics, metric_collection_endpoint, node_id).await
|
||||
{
|
||||
error!("metrics collection failed: {err:?}");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -169,15 +92,20 @@ pub async fn collect_metrics(
|
||||
///
|
||||
/// Gather per-tenant and per-timeline metrics and send them to the `metric_collection_endpoint`.
|
||||
/// Cache metrics to avoid sending the same metrics multiple times.
|
||||
pub async fn collect_metrics_task(
|
||||
///
|
||||
/// TODO
|
||||
/// - refactor this function (chunking+sending part) to reuse it in proxy module;
|
||||
/// - improve error handling. Now if one tenant fails to collect metrics,
|
||||
/// the whole iteration fails and metrics for other tenants are not collected.
|
||||
pub async fn collect_metrics_iteration(
|
||||
client: &reqwest::Client,
|
||||
cached_metrics: &mut HashMap<ConsumptionMetricsKey, u64>,
|
||||
cached_metrics: &mut HashMap<PageserverConsumptionMetricsKey, u64>,
|
||||
metric_collection_endpoint: &reqwest::Url,
|
||||
node_id: NodeId,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut current_metrics: Vec<(ConsumptionMetricsKey, u64)> = Vec::new();
|
||||
let mut current_metrics: Vec<(PageserverConsumptionMetricsKey, u64)> = Vec::new();
|
||||
trace!(
|
||||
"starting collect_metrics_task. metric_collection_endpoint: {}",
|
||||
"starting collect_metrics_iteration. metric_collection_endpoint: {}",
|
||||
metric_collection_endpoint
|
||||
);
|
||||
|
||||
@@ -201,10 +129,10 @@ pub async fn collect_metrics_task(
|
||||
let timeline_written_size = u64::from(timeline.get_last_record_lsn());
|
||||
|
||||
current_metrics.push((
|
||||
ConsumptionMetricsKey {
|
||||
PageserverConsumptionMetricsKey {
|
||||
tenant_id,
|
||||
timeline_id: Some(timeline.timeline_id),
|
||||
metric: ConsumptionMetricKind::WrittenSize,
|
||||
metric: WRITTEN_SIZE,
|
||||
},
|
||||
timeline_written_size,
|
||||
));
|
||||
@@ -213,10 +141,10 @@ pub async fn collect_metrics_task(
|
||||
// Only send timeline logical size when it is fully calculated.
|
||||
if is_exact {
|
||||
current_metrics.push((
|
||||
ConsumptionMetricsKey {
|
||||
PageserverConsumptionMetricsKey {
|
||||
tenant_id,
|
||||
timeline_id: Some(timeline.timeline_id),
|
||||
metric: ConsumptionMetricKind::TimelineLogicalSize,
|
||||
metric: TIMELINE_LOGICAL_SIZE,
|
||||
},
|
||||
timeline_logical_size,
|
||||
));
|
||||
@@ -234,24 +162,34 @@ pub async fn collect_metrics_task(
|
||||
);
|
||||
|
||||
current_metrics.push((
|
||||
ConsumptionMetricsKey {
|
||||
PageserverConsumptionMetricsKey {
|
||||
tenant_id,
|
||||
timeline_id: None,
|
||||
metric: ConsumptionMetricKind::ResidentSize,
|
||||
metric: RESIDENT_SIZE,
|
||||
},
|
||||
tenant_resident_size,
|
||||
));
|
||||
|
||||
current_metrics.push((
|
||||
ConsumptionMetricsKey {
|
||||
PageserverConsumptionMetricsKey {
|
||||
tenant_id,
|
||||
timeline_id: None,
|
||||
metric: ConsumptionMetricKind::RemoteStorageSize,
|
||||
metric: REMOTE_STORAGE_SIZE,
|
||||
},
|
||||
tenant_remote_size,
|
||||
));
|
||||
|
||||
// TODO add SyntheticStorageSize metric
|
||||
// Note that this metric is calculated in a separate bgworker
|
||||
// Here we only use cached value, which may lag behind the real latest one
|
||||
let tenant_synthetic_size = tenant.get_cached_synthetic_size();
|
||||
current_metrics.push((
|
||||
PageserverConsumptionMetricsKey {
|
||||
tenant_id,
|
||||
timeline_id: None,
|
||||
metric: SYNTHETIC_STORAGE_SIZE,
|
||||
},
|
||||
tenant_synthetic_size,
|
||||
));
|
||||
}
|
||||
|
||||
// Filter metrics
|
||||
@@ -267,35 +205,29 @@ pub async fn collect_metrics_task(
|
||||
|
||||
// Send metrics.
|
||||
// Split into chunks of 1000 metrics to avoid exceeding the max request size
|
||||
const CHUNK_SIZE: usize = 1000;
|
||||
let chunks = current_metrics.chunks(CHUNK_SIZE);
|
||||
|
||||
let mut chunk_to_send: Vec<ConsumptionMetric> = Vec::with_capacity(1000);
|
||||
let mut chunk_to_send: Vec<Event<Ids>> = Vec::with_capacity(CHUNK_SIZE);
|
||||
|
||||
for chunk in chunks {
|
||||
chunk_to_send.clear();
|
||||
|
||||
// this code block is needed to convince compiler
|
||||
// that rng is not reused around await point
|
||||
{
|
||||
// enrich metrics with timestamp and metric_kind before sending
|
||||
let mut rng = rand::thread_rng();
|
||||
chunk_to_send.extend(chunk.iter().map(|(curr_key, curr_val)| {
|
||||
ConsumptionMetric::new_absolute(
|
||||
curr_key.metric,
|
||||
curr_key.tenant_id,
|
||||
curr_key.timeline_id,
|
||||
*curr_val,
|
||||
node_id,
|
||||
&mut rng,
|
||||
)
|
||||
}));
|
||||
}
|
||||
// enrich metrics with type, timestamp and idempotency key before sending
|
||||
chunk_to_send.extend(chunk.iter().map(|(curr_key, curr_val)| Event {
|
||||
kind: EventType::Absolute { time: Utc::now() },
|
||||
metric: curr_key.metric,
|
||||
idempotency_key: idempotency_key(node_id.to_string()),
|
||||
value: *curr_val,
|
||||
extra: Ids {
|
||||
tenant_id: curr_key.tenant_id,
|
||||
timeline_id: curr_key.timeline_id,
|
||||
},
|
||||
}));
|
||||
|
||||
let chunk_json = serde_json::value::to_raw_value(&EventChunk {
|
||||
events: &chunk_to_send,
|
||||
})
|
||||
.expect("ConsumptionMetric should not fail serialization");
|
||||
.expect("PageserverConsumptionMetric should not fail serialization");
|
||||
|
||||
let res = client
|
||||
.post(metric_collection_endpoint.clone())
|
||||
@@ -322,3 +254,39 @@ pub async fn collect_metrics_task(
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Caclculate synthetic size for each active tenant
|
||||
pub async fn calculate_synthetic_size_worker(
|
||||
synthetic_size_calculation_interval: Duration,
|
||||
) -> anyhow::Result<()> {
|
||||
info!("starting calculate_synthetic_size_worker");
|
||||
|
||||
let mut ticker = tokio::time::interval(synthetic_size_calculation_interval);
|
||||
|
||||
loop {
|
||||
tokio::select! {
|
||||
_ = task_mgr::shutdown_watcher() => {
|
||||
return Ok(());
|
||||
},
|
||||
_ = ticker.tick() => {
|
||||
|
||||
let tenants = mgr::list_tenants().await;
|
||||
// iterate through list of Active tenants and collect metrics
|
||||
for (tenant_id, tenant_state) in tenants {
|
||||
|
||||
if tenant_state != TenantState::Active {
|
||||
continue;
|
||||
}
|
||||
|
||||
if let Ok(tenant) = mgr::get_tenant(tenant_id, true).await
|
||||
{
|
||||
if let Err(e) = tenant.calculate_synthetic_size().await {
|
||||
error!("failed to calculate synthetic size for tenant {}: {}", tenant_id, e);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3,6 +3,7 @@ use std::sync::Arc;
|
||||
use anyhow::{anyhow, Context, Result};
|
||||
use hyper::StatusCode;
|
||||
use hyper::{Body, Request, Response, Uri};
|
||||
use pageserver_api::models::DownloadRemoteLayersTaskSpawnRequest;
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
@@ -13,7 +14,7 @@ use super::models::{
|
||||
};
|
||||
use crate::pgdatadir_mapping::LsnForTimestamp;
|
||||
use crate::tenant::config::TenantConfOpt;
|
||||
use crate::tenant::{with_ondemand_download, Timeline};
|
||||
use crate::tenant::{PageReconstructError, Timeline};
|
||||
use crate::{config::PageServerConf, tenant::mgr};
|
||||
use utils::{
|
||||
auth::JwtAuth,
|
||||
@@ -77,6 +78,15 @@ fn check_permission(request: &Request<Body>, tenant_id: Option<TenantId>) -> Res
|
||||
})
|
||||
}
|
||||
|
||||
fn apierror_from_prerror(err: PageReconstructError) -> ApiError {
|
||||
match err {
|
||||
PageReconstructError::Other(err) => ApiError::InternalServerError(err),
|
||||
PageReconstructError::WalRedo(err) => {
|
||||
ApiError::InternalServerError(anyhow::Error::new(err))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Helper function to construct a TimelineInfo struct for a timeline
|
||||
async fn build_timeline_info(
|
||||
timeline: &Arc<Timeline>,
|
||||
@@ -298,9 +308,10 @@ async fn get_lsn_by_timestamp_handler(request: Request<Body>) -> Result<Response
|
||||
.await
|
||||
.and_then(|tenant| tenant.get_timeline(timeline_id, true))
|
||||
.map_err(ApiError::NotFound)?;
|
||||
let result = with_ondemand_download(|| timeline.find_lsn_for_timestamp(timestamp_pg))
|
||||
let result = timeline
|
||||
.find_lsn_for_timestamp(timestamp_pg)
|
||||
.await
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
.map_err(apierror_from_prerror)?;
|
||||
|
||||
let result = match result {
|
||||
LsnForTimestamp::Present(lsn) => format!("{lsn}"),
|
||||
@@ -585,7 +596,7 @@ async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Bo
|
||||
// is Active when this function returns.
|
||||
if let res @ Err(_) = tenant.wait_to_become_active().await {
|
||||
// This shouldn't happen because we just created the tenant directory
|
||||
// in tenant_mgr::create_tenant, and there aren't any remote timelines
|
||||
// in tenant::mgr::create_tenant, and there aren't any remote timelines
|
||||
// to load, so, nothing can really fail during load.
|
||||
// Don't do cleanup because we don't know how we got here.
|
||||
// The tenant will likely be in `Broken` state and subsequent
|
||||
@@ -778,10 +789,11 @@ async fn timeline_checkpoint_handler(request: Request<Body>) -> Result<Response<
|
||||
}
|
||||
|
||||
async fn timeline_download_remote_layers_handler_post(
|
||||
request: Request<Body>,
|
||||
mut request: Request<Body>,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
let body: DownloadRemoteLayersTaskSpawnRequest = json_request(&mut request).await?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let tenant = mgr::get_tenant(tenant_id, true)
|
||||
@@ -790,7 +802,7 @@ async fn timeline_download_remote_layers_handler_post(
|
||||
let timeline = tenant
|
||||
.get_timeline(timeline_id, true)
|
||||
.map_err(ApiError::NotFound)?;
|
||||
match timeline.spawn_download_all_remote_layers().await {
|
||||
match timeline.spawn_download_all_remote_layers(body).await {
|
||||
Ok(st) => json_response(StatusCode::ACCEPTED, st),
|
||||
Err(st) => json_response(StatusCode::CONFLICT, st),
|
||||
}
|
||||
|
||||
@@ -143,7 +143,11 @@ async fn import_rel(
|
||||
// Call put_rel_creation for every segment of the relation,
|
||||
// because there is no guarantee about the order in which we are processing segments.
|
||||
// ignore "relation already exists" error
|
||||
if let Err(e) = modification.put_rel_creation(rel, nblocks as u32) {
|
||||
//
|
||||
// FIXME: use proper error type for this, instead of parsing the error message.
|
||||
// Or better yet, keep track of which relations we've already created
|
||||
// https://github.com/neondatabase/neon/issues/3309
|
||||
if let Err(e) = modification.put_rel_creation(rel, nblocks as u32).await {
|
||||
if e.to_string().contains("already exists") {
|
||||
debug!("relation {} already exists. we must be extending it", rel);
|
||||
} else {
|
||||
@@ -178,7 +182,7 @@ async fn import_rel(
|
||||
//
|
||||
// If we process rel segments out of order,
|
||||
// put_rel_extend will skip the update.
|
||||
modification.put_rel_extend(rel, blknum)?;
|
||||
modification.put_rel_extend(rel, blknum).await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -206,7 +210,9 @@ async fn import_slru(
|
||||
|
||||
ensure!(nblocks <= pg_constants::SLRU_PAGES_PER_SEGMENT as usize);
|
||||
|
||||
modification.put_slru_segment_creation(slru, segno, nblocks as u32)?;
|
||||
modification
|
||||
.put_slru_segment_creation(slru, segno, nblocks as u32)
|
||||
.await?;
|
||||
|
||||
let mut rpageno = 0;
|
||||
loop {
|
||||
@@ -492,7 +498,7 @@ async fn import_file(
|
||||
}
|
||||
"pg_filenode.map" => {
|
||||
let bytes = read_all_bytes(reader).await?;
|
||||
modification.put_relmap_file(spcnode, dbnode, bytes)?;
|
||||
modification.put_relmap_file(spcnode, dbnode, bytes).await?;
|
||||
debug!("imported relmap file")
|
||||
}
|
||||
"PG_VERSION" => {
|
||||
@@ -515,7 +521,7 @@ async fn import_file(
|
||||
match file_name.as_ref() {
|
||||
"pg_filenode.map" => {
|
||||
let bytes = read_all_bytes(reader).await?;
|
||||
modification.put_relmap_file(spcnode, dbnode, bytes)?;
|
||||
modification.put_relmap_file(spcnode, dbnode, bytes).await?;
|
||||
debug!("imported relmap file")
|
||||
}
|
||||
"PG_VERSION" => {
|
||||
@@ -545,7 +551,9 @@ async fn import_file(
|
||||
let xid = u32::from_str_radix(file_name.as_ref(), 16)?;
|
||||
|
||||
let bytes = read_all_bytes(reader).await?;
|
||||
modification.put_twophase_file(xid, Bytes::copy_from_slice(&bytes[..]))?;
|
||||
modification
|
||||
.put_twophase_file(xid, Bytes::copy_from_slice(&bytes[..]))
|
||||
.await?;
|
||||
debug!("imported twophase file");
|
||||
} else if file_path.starts_with("pg_wal") {
|
||||
debug!("found wal file in base section. ignore it");
|
||||
|
||||
@@ -251,7 +251,6 @@ impl PageRequestMetrics {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct PageServerHandler {
|
||||
_conf: &'static PageServerConf,
|
||||
auth: Option<Arc<JwtAuth>>,
|
||||
@@ -546,10 +545,7 @@ impl PageServerHandler {
|
||||
let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)
|
||||
.await?;
|
||||
|
||||
let exists = crate::tenant::with_ondemand_download(|| {
|
||||
timeline.get_rel_exists(req.rel, lsn, req.latest)
|
||||
})
|
||||
.await?;
|
||||
let exists = timeline.get_rel_exists(req.rel, lsn, req.latest).await?;
|
||||
|
||||
Ok(PagestreamBeMessage::Exists(PagestreamExistsResponse {
|
||||
exists,
|
||||
@@ -566,10 +562,7 @@ impl PageServerHandler {
|
||||
let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)
|
||||
.await?;
|
||||
|
||||
let n_blocks = crate::tenant::with_ondemand_download(|| {
|
||||
timeline.get_rel_size(req.rel, lsn, req.latest)
|
||||
})
|
||||
.await?;
|
||||
let n_blocks = timeline.get_rel_size(req.rel, lsn, req.latest).await?;
|
||||
|
||||
Ok(PagestreamBeMessage::Nblocks(PagestreamNblocksResponse {
|
||||
n_blocks,
|
||||
@@ -586,10 +579,9 @@ impl PageServerHandler {
|
||||
let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)
|
||||
.await?;
|
||||
|
||||
let total_blocks = crate::tenant::with_ondemand_download(|| {
|
||||
timeline.get_db_size(DEFAULTTABLESPACE_OID, req.dbnode, lsn, req.latest)
|
||||
})
|
||||
.await?;
|
||||
let total_blocks = timeline
|
||||
.get_db_size(DEFAULTTABLESPACE_OID, req.dbnode, lsn, req.latest)
|
||||
.await?;
|
||||
let db_size = total_blocks as i64 * BLCKSZ as i64;
|
||||
|
||||
Ok(PagestreamBeMessage::DbSize(PagestreamDbSizeResponse {
|
||||
@@ -615,10 +607,9 @@ impl PageServerHandler {
|
||||
}
|
||||
*/
|
||||
|
||||
let page = crate::tenant::with_ondemand_download(|| {
|
||||
timeline.get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest)
|
||||
})
|
||||
.await?;
|
||||
let page = timeline
|
||||
.get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest)
|
||||
.await?;
|
||||
|
||||
Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse {
|
||||
page,
|
||||
@@ -651,7 +642,7 @@ impl PageServerHandler {
|
||||
pgb.write_message(&BeMessage::CopyOutResponse)?;
|
||||
pgb.flush().await?;
|
||||
|
||||
/* Send a tarball of the latest layer on the timeline */
|
||||
// Send a tarball of the latest layer on the timeline
|
||||
{
|
||||
let mut writer = pgb.copyout_writer();
|
||||
basebackup::send_basebackup_tarball(&mut writer, &timeline, lsn, prev_lsn, full_backup)
|
||||
|
||||
@@ -6,11 +6,10 @@
|
||||
//! walingest.rs handles a few things like implicit relation creation and extension.
|
||||
//! Clarify that)
|
||||
//!
|
||||
use super::tenant::PageReconstructResult;
|
||||
use super::tenant::{PageReconstructError, Timeline};
|
||||
use crate::keyspace::{KeySpace, KeySpaceAccum};
|
||||
use crate::tenant::{with_ondemand_download, Timeline};
|
||||
use crate::repository::*;
|
||||
use crate::walrecord::NeonWalRecord;
|
||||
use crate::{repository::*, try_no_ondemand_download};
|
||||
use anyhow::Context;
|
||||
use bytes::{Buf, Bytes};
|
||||
use pageserver_api::reltag::{RelTag, SlruKind};
|
||||
@@ -92,76 +91,80 @@ impl Timeline {
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
/// Look up given page version.
|
||||
pub fn get_rel_page_at_lsn(
|
||||
pub async fn get_rel_page_at_lsn(
|
||||
&self,
|
||||
tag: RelTag,
|
||||
blknum: BlockNumber,
|
||||
lsn: Lsn,
|
||||
latest: bool,
|
||||
) -> PageReconstructResult<Bytes> {
|
||||
) -> Result<Bytes, PageReconstructError> {
|
||||
if tag.relnode == 0 {
|
||||
return PageReconstructResult::from(anyhow::anyhow!("invalid relnode"));
|
||||
return Err(PageReconstructError::Other(anyhow::anyhow!(
|
||||
"invalid relnode"
|
||||
)));
|
||||
}
|
||||
|
||||
let nblocks = try_no_ondemand_download!(self.get_rel_size(tag, lsn, latest));
|
||||
let nblocks = self.get_rel_size(tag, lsn, latest).await?;
|
||||
if blknum >= nblocks {
|
||||
debug!(
|
||||
"read beyond EOF at {} blk {} at {}, size is {}: returning all-zeros page",
|
||||
tag, blknum, lsn, nblocks
|
||||
);
|
||||
return PageReconstructResult::Success(ZERO_PAGE.clone());
|
||||
return Ok(ZERO_PAGE.clone());
|
||||
}
|
||||
|
||||
let key = rel_block_to_key(tag, blknum);
|
||||
self.get(key, lsn)
|
||||
self.get(key, lsn).await
|
||||
}
|
||||
|
||||
// Get size of a database in blocks
|
||||
pub fn get_db_size(
|
||||
pub async fn get_db_size(
|
||||
&self,
|
||||
spcnode: Oid,
|
||||
dbnode: Oid,
|
||||
lsn: Lsn,
|
||||
latest: bool,
|
||||
) -> PageReconstructResult<usize> {
|
||||
) -> Result<usize, PageReconstructError> {
|
||||
let mut total_blocks = 0;
|
||||
|
||||
let rels = try_no_ondemand_download!(self.list_rels(spcnode, dbnode, lsn));
|
||||
let rels = self.list_rels(spcnode, dbnode, lsn).await?;
|
||||
|
||||
for rel in rels {
|
||||
let n_blocks = try_no_ondemand_download!(self.get_rel_size(rel, lsn, latest));
|
||||
let n_blocks = self.get_rel_size(rel, lsn, latest).await?;
|
||||
total_blocks += n_blocks as usize;
|
||||
}
|
||||
PageReconstructResult::Success(total_blocks)
|
||||
Ok(total_blocks)
|
||||
}
|
||||
|
||||
/// Get size of a relation file
|
||||
pub fn get_rel_size(
|
||||
pub async fn get_rel_size(
|
||||
&self,
|
||||
tag: RelTag,
|
||||
lsn: Lsn,
|
||||
latest: bool,
|
||||
) -> PageReconstructResult<BlockNumber> {
|
||||
) -> Result<BlockNumber, PageReconstructError> {
|
||||
if tag.relnode == 0 {
|
||||
return PageReconstructResult::from(anyhow::anyhow!("invalid relnode"));
|
||||
return Err(PageReconstructError::Other(anyhow::anyhow!(
|
||||
"invalid relnode"
|
||||
)));
|
||||
}
|
||||
|
||||
if let Some(nblocks) = self.get_cached_rel_size(&tag, lsn) {
|
||||
return PageReconstructResult::Success(nblocks);
|
||||
return Ok(nblocks);
|
||||
}
|
||||
|
||||
if (tag.forknum == FSM_FORKNUM || tag.forknum == VISIBILITYMAP_FORKNUM)
|
||||
&& !try_no_ondemand_download!(self.get_rel_exists(tag, lsn, latest))
|
||||
&& !self.get_rel_exists(tag, lsn, latest).await?
|
||||
{
|
||||
// FIXME: Postgres sometimes calls smgrcreate() to create
|
||||
// FSM, and smgrnblocks() on it immediately afterwards,
|
||||
// without extending it. Tolerate that by claiming that
|
||||
// any non-existent FSM fork has size 0.
|
||||
return PageReconstructResult::Success(0);
|
||||
return Ok(0);
|
||||
}
|
||||
|
||||
let key = rel_size_to_key(tag);
|
||||
let mut buf = try_no_ondemand_download!(self.get(key, lsn));
|
||||
let mut buf = self.get(key, lsn).await?;
|
||||
let nblocks = buf.get_u32_le();
|
||||
|
||||
if latest {
|
||||
@@ -174,47 +177,49 @@ impl Timeline {
|
||||
// associated with most recent value of LSN.
|
||||
self.update_cached_rel_size(tag, lsn, nblocks);
|
||||
}
|
||||
PageReconstructResult::Success(nblocks)
|
||||
Ok(nblocks)
|
||||
}
|
||||
|
||||
/// Does relation exist?
|
||||
pub fn get_rel_exists(
|
||||
pub async fn get_rel_exists(
|
||||
&self,
|
||||
tag: RelTag,
|
||||
lsn: Lsn,
|
||||
_latest: bool,
|
||||
) -> PageReconstructResult<bool> {
|
||||
) -> Result<bool, PageReconstructError> {
|
||||
if tag.relnode == 0 {
|
||||
return PageReconstructResult::from(anyhow::anyhow!("invalid relnode"));
|
||||
return Err(PageReconstructError::Other(anyhow::anyhow!(
|
||||
"invalid relnode"
|
||||
)));
|
||||
}
|
||||
|
||||
// first try to lookup relation in cache
|
||||
if let Some(_nblocks) = self.get_cached_rel_size(&tag, lsn) {
|
||||
return PageReconstructResult::Success(true);
|
||||
return Ok(true);
|
||||
}
|
||||
// fetch directory listing
|
||||
let key = rel_dir_to_key(tag.spcnode, tag.dbnode);
|
||||
let buf = try_no_ondemand_download!(self.get(key, lsn));
|
||||
let buf = self.get(key, lsn).await?;
|
||||
|
||||
match RelDirectory::des(&buf).context("deserialization failure") {
|
||||
Ok(dir) => {
|
||||
let exists = dir.rels.get(&(tag.relnode, tag.forknum)).is_some();
|
||||
PageReconstructResult::Success(exists)
|
||||
Ok(exists)
|
||||
}
|
||||
Err(e) => PageReconstructResult::from(e),
|
||||
Err(e) => Err(PageReconstructError::from(e)),
|
||||
}
|
||||
}
|
||||
|
||||
/// Get a list of all existing relations in given tablespace and database.
|
||||
pub fn list_rels(
|
||||
pub async fn list_rels(
|
||||
&self,
|
||||
spcnode: Oid,
|
||||
dbnode: Oid,
|
||||
lsn: Lsn,
|
||||
) -> PageReconstructResult<HashSet<RelTag>> {
|
||||
) -> Result<HashSet<RelTag>, PageReconstructError> {
|
||||
// fetch directory listing
|
||||
let key = rel_dir_to_key(spcnode, dbnode);
|
||||
let buf = try_no_ondemand_download!(self.get(key, lsn));
|
||||
let buf = self.get(key, lsn).await?;
|
||||
|
||||
match RelDirectory::des(&buf).context("deserialization failure") {
|
||||
Ok(dir) => {
|
||||
@@ -226,53 +231,53 @@ impl Timeline {
|
||||
forknum: *forknum,
|
||||
}));
|
||||
|
||||
PageReconstructResult::Success(rels)
|
||||
Ok(rels)
|
||||
}
|
||||
Err(e) => PageReconstructResult::from(e),
|
||||
Err(e) => Err(PageReconstructError::from(e)),
|
||||
}
|
||||
}
|
||||
|
||||
/// Look up given SLRU page version.
|
||||
pub fn get_slru_page_at_lsn(
|
||||
pub async fn get_slru_page_at_lsn(
|
||||
&self,
|
||||
kind: SlruKind,
|
||||
segno: u32,
|
||||
blknum: BlockNumber,
|
||||
lsn: Lsn,
|
||||
) -> PageReconstructResult<Bytes> {
|
||||
) -> Result<Bytes, PageReconstructError> {
|
||||
let key = slru_block_to_key(kind, segno, blknum);
|
||||
self.get(key, lsn)
|
||||
self.get(key, lsn).await
|
||||
}
|
||||
|
||||
/// Get size of an SLRU segment
|
||||
pub fn get_slru_segment_size(
|
||||
pub async fn get_slru_segment_size(
|
||||
&self,
|
||||
kind: SlruKind,
|
||||
segno: u32,
|
||||
lsn: Lsn,
|
||||
) -> PageReconstructResult<BlockNumber> {
|
||||
) -> Result<BlockNumber, PageReconstructError> {
|
||||
let key = slru_segment_size_to_key(kind, segno);
|
||||
let mut buf = try_no_ondemand_download!(self.get(key, lsn));
|
||||
PageReconstructResult::Success(buf.get_u32_le())
|
||||
let mut buf = self.get(key, lsn).await?;
|
||||
Ok(buf.get_u32_le())
|
||||
}
|
||||
|
||||
/// Does the SLRU segment exist?
|
||||
pub fn get_slru_segment_exists(
|
||||
pub async fn get_slru_segment_exists(
|
||||
&self,
|
||||
kind: SlruKind,
|
||||
segno: u32,
|
||||
lsn: Lsn,
|
||||
) -> PageReconstructResult<bool> {
|
||||
) -> Result<bool, PageReconstructError> {
|
||||
// fetch directory listing
|
||||
let key = slru_dir_to_key(kind);
|
||||
let buf = try_no_ondemand_download!(self.get(key, lsn));
|
||||
let buf = self.get(key, lsn).await?;
|
||||
|
||||
match SlruSegmentDirectory::des(&buf).context("deserialization failure") {
|
||||
Ok(dir) => {
|
||||
let exists = dir.segments.get(&segno).is_some();
|
||||
PageReconstructResult::Success(exists)
|
||||
Ok(exists)
|
||||
}
|
||||
Err(e) => PageReconstructResult::from(e),
|
||||
Err(e) => Err(PageReconstructError::from(e)),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -283,10 +288,10 @@ impl Timeline {
|
||||
/// so it's not well defined which LSN you get if there were multiple commits
|
||||
/// "in flight" at that point in time.
|
||||
///
|
||||
pub fn find_lsn_for_timestamp(
|
||||
pub async fn find_lsn_for_timestamp(
|
||||
&self,
|
||||
search_timestamp: TimestampTz,
|
||||
) -> PageReconstructResult<LsnForTimestamp> {
|
||||
) -> Result<LsnForTimestamp, PageReconstructError> {
|
||||
let gc_cutoff_lsn_guard = self.get_latest_gc_cutoff_lsn();
|
||||
let min_lsn = *gc_cutoff_lsn_guard;
|
||||
let max_lsn = self.get_last_record_lsn();
|
||||
@@ -302,12 +307,14 @@ impl Timeline {
|
||||
// cannot overflow, high and low are both smaller than u64::MAX / 2
|
||||
let mid = (high + low) / 2;
|
||||
|
||||
let cmp = try_no_ondemand_download!(self.is_latest_commit_timestamp_ge_than(
|
||||
search_timestamp,
|
||||
Lsn(mid * 8),
|
||||
&mut found_smaller,
|
||||
&mut found_larger,
|
||||
));
|
||||
let cmp = self
|
||||
.is_latest_commit_timestamp_ge_than(
|
||||
search_timestamp,
|
||||
Lsn(mid * 8),
|
||||
&mut found_smaller,
|
||||
&mut found_larger,
|
||||
)
|
||||
.await?;
|
||||
|
||||
if cmp {
|
||||
high = mid;
|
||||
@@ -319,15 +326,15 @@ impl Timeline {
|
||||
(false, false) => {
|
||||
// This can happen if no commit records have been processed yet, e.g.
|
||||
// just after importing a cluster.
|
||||
PageReconstructResult::Success(LsnForTimestamp::NoData(max_lsn))
|
||||
Ok(LsnForTimestamp::NoData(max_lsn))
|
||||
}
|
||||
(true, false) => {
|
||||
// Didn't find any commit timestamps larger than the request
|
||||
PageReconstructResult::Success(LsnForTimestamp::Future(max_lsn))
|
||||
Ok(LsnForTimestamp::Future(max_lsn))
|
||||
}
|
||||
(false, true) => {
|
||||
// Didn't find any commit timestamps smaller than the request
|
||||
PageReconstructResult::Success(LsnForTimestamp::Past(max_lsn))
|
||||
Ok(LsnForTimestamp::Past(max_lsn))
|
||||
}
|
||||
(true, true) => {
|
||||
// low is the LSN of the first commit record *after* the search_timestamp,
|
||||
@@ -337,7 +344,7 @@ impl Timeline {
|
||||
// Otherwise, if you restore to the returned LSN, the database will
|
||||
// include physical changes from later commits that will be marked
|
||||
// as aborted, and will need to be vacuumed away.
|
||||
PageReconstructResult::Success(LsnForTimestamp::Present(Lsn((low - 1) * 8)))
|
||||
Ok(LsnForTimestamp::Present(Lsn((low - 1) * 8)))
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -349,26 +356,21 @@ impl Timeline {
|
||||
/// Additionally, sets 'found_smaller'/'found_larger' if it encounters any commits
|
||||
/// with a smaller/larger timestamp.
|
||||
///
|
||||
pub fn is_latest_commit_timestamp_ge_than(
|
||||
pub async fn is_latest_commit_timestamp_ge_than(
|
||||
&self,
|
||||
search_timestamp: TimestampTz,
|
||||
probe_lsn: Lsn,
|
||||
found_smaller: &mut bool,
|
||||
found_larger: &mut bool,
|
||||
) -> PageReconstructResult<bool> {
|
||||
for segno in try_no_ondemand_download!(self.list_slru_segments(SlruKind::Clog, probe_lsn)) {
|
||||
let nblocks = try_no_ondemand_download!(self.get_slru_segment_size(
|
||||
SlruKind::Clog,
|
||||
segno,
|
||||
probe_lsn
|
||||
));
|
||||
) -> Result<bool, PageReconstructError> {
|
||||
for segno in self.list_slru_segments(SlruKind::Clog, probe_lsn).await? {
|
||||
let nblocks = self
|
||||
.get_slru_segment_size(SlruKind::Clog, segno, probe_lsn)
|
||||
.await?;
|
||||
for blknum in (0..nblocks).rev() {
|
||||
let clog_page = try_no_ondemand_download!(self.get_slru_page_at_lsn(
|
||||
SlruKind::Clog,
|
||||
segno,
|
||||
blknum,
|
||||
probe_lsn
|
||||
));
|
||||
let clog_page = self
|
||||
.get_slru_page_at_lsn(SlruKind::Clog, segno, blknum, probe_lsn)
|
||||
.await?;
|
||||
|
||||
if clog_page.len() == BLCKSZ as usize + 8 {
|
||||
let mut timestamp_bytes = [0u8; 8];
|
||||
@@ -377,76 +379,85 @@ impl Timeline {
|
||||
|
||||
if timestamp >= search_timestamp {
|
||||
*found_larger = true;
|
||||
return PageReconstructResult::Success(true);
|
||||
return Ok(true);
|
||||
} else {
|
||||
*found_smaller = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
PageReconstructResult::Success(false)
|
||||
Ok(false)
|
||||
}
|
||||
|
||||
/// Get a list of SLRU segments
|
||||
pub fn list_slru_segments(
|
||||
pub async fn list_slru_segments(
|
||||
&self,
|
||||
kind: SlruKind,
|
||||
lsn: Lsn,
|
||||
) -> PageReconstructResult<HashSet<u32>> {
|
||||
) -> Result<HashSet<u32>, PageReconstructError> {
|
||||
// fetch directory entry
|
||||
let key = slru_dir_to_key(kind);
|
||||
|
||||
let buf = try_no_ondemand_download!(self.get(key, lsn));
|
||||
let buf = self.get(key, lsn).await?;
|
||||
match SlruSegmentDirectory::des(&buf).context("deserialization failure") {
|
||||
Ok(dir) => PageReconstructResult::Success(dir.segments),
|
||||
Err(e) => PageReconstructResult::from(e),
|
||||
Ok(dir) => Ok(dir.segments),
|
||||
Err(e) => Err(PageReconstructError::from(e)),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get_relmap_file(
|
||||
pub async fn get_relmap_file(
|
||||
&self,
|
||||
spcnode: Oid,
|
||||
dbnode: Oid,
|
||||
lsn: Lsn,
|
||||
) -> PageReconstructResult<Bytes> {
|
||||
) -> Result<Bytes, PageReconstructError> {
|
||||
let key = relmap_file_key(spcnode, dbnode);
|
||||
|
||||
let buf = try_no_ondemand_download!(self.get(key, lsn));
|
||||
PageReconstructResult::Success(buf)
|
||||
self.get(key, lsn).await
|
||||
}
|
||||
|
||||
pub fn list_dbdirs(&self, lsn: Lsn) -> PageReconstructResult<HashMap<(Oid, Oid), bool>> {
|
||||
pub async fn list_dbdirs(
|
||||
&self,
|
||||
lsn: Lsn,
|
||||
) -> Result<HashMap<(Oid, Oid), bool>, PageReconstructError> {
|
||||
// fetch directory entry
|
||||
let buf = try_no_ondemand_download!(self.get(DBDIR_KEY, lsn));
|
||||
let buf = self.get(DBDIR_KEY, lsn).await?;
|
||||
|
||||
match DbDirectory::des(&buf).context("deserialization failure") {
|
||||
Ok(dir) => PageReconstructResult::Success(dir.dbdirs),
|
||||
Err(e) => PageReconstructResult::from(e),
|
||||
Ok(dir) => Ok(dir.dbdirs),
|
||||
Err(e) => Err(PageReconstructError::from(e)),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get_twophase_file(&self, xid: TransactionId, lsn: Lsn) -> PageReconstructResult<Bytes> {
|
||||
pub async fn get_twophase_file(
|
||||
&self,
|
||||
xid: TransactionId,
|
||||
lsn: Lsn,
|
||||
) -> Result<Bytes, PageReconstructError> {
|
||||
let key = twophase_file_key(xid);
|
||||
let buf = try_no_ondemand_download!(self.get(key, lsn));
|
||||
PageReconstructResult::Success(buf)
|
||||
let buf = self.get(key, lsn).await?;
|
||||
Ok(buf)
|
||||
}
|
||||
|
||||
pub fn list_twophase_files(&self, lsn: Lsn) -> PageReconstructResult<HashSet<TransactionId>> {
|
||||
pub async fn list_twophase_files(
|
||||
&self,
|
||||
lsn: Lsn,
|
||||
) -> Result<HashSet<TransactionId>, PageReconstructError> {
|
||||
// fetch directory entry
|
||||
let buf = try_no_ondemand_download!(self.get(TWOPHASEDIR_KEY, lsn));
|
||||
let buf = self.get(TWOPHASEDIR_KEY, lsn).await?;
|
||||
|
||||
match TwoPhaseDirectory::des(&buf).context("deserialization failure") {
|
||||
Ok(dir) => PageReconstructResult::Success(dir.xids),
|
||||
Err(e) => PageReconstructResult::from(e),
|
||||
Ok(dir) => Ok(dir.xids),
|
||||
Err(e) => Err(PageReconstructError::from(e)),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get_control_file(&self, lsn: Lsn) -> PageReconstructResult<Bytes> {
|
||||
self.get(CONTROLFILE_KEY, lsn)
|
||||
pub async fn get_control_file(&self, lsn: Lsn) -> Result<Bytes, PageReconstructError> {
|
||||
self.get(CONTROLFILE_KEY, lsn).await
|
||||
}
|
||||
|
||||
pub fn get_checkpoint(&self, lsn: Lsn) -> PageReconstructResult<Bytes> {
|
||||
self.get(CHECKPOINT_KEY, lsn)
|
||||
pub async fn get_checkpoint(&self, lsn: Lsn) -> Result<Bytes, PageReconstructError> {
|
||||
self.get(CHECKPOINT_KEY, lsn).await
|
||||
}
|
||||
|
||||
/// Does the same as get_current_logical_size but counted on demand.
|
||||
@@ -460,20 +471,24 @@ impl Timeline {
|
||||
cancel: CancellationToken,
|
||||
) -> Result<u64, CalculateLogicalSizeError> {
|
||||
// Fetch list of database dirs and iterate them
|
||||
let buf = self.get_download(DBDIR_KEY, lsn).await?;
|
||||
let buf = self.get(DBDIR_KEY, lsn).await.context("read dbdir")?;
|
||||
let dbdir = DbDirectory::des(&buf).context("deserialize db directory")?;
|
||||
|
||||
let mut total_size: u64 = 0;
|
||||
for (spcnode, dbnode) in dbdir.dbdirs.keys() {
|
||||
for rel in
|
||||
crate::tenant::with_ondemand_download(|| self.list_rels(*spcnode, *dbnode, lsn))
|
||||
.await?
|
||||
for rel in self
|
||||
.list_rels(*spcnode, *dbnode, lsn)
|
||||
.await
|
||||
.context("list rels")?
|
||||
{
|
||||
if cancel.is_cancelled() {
|
||||
return Err(CalculateLogicalSizeError::Cancelled);
|
||||
}
|
||||
let relsize_key = rel_size_to_key(rel);
|
||||
let mut buf = self.get_download(relsize_key, lsn).await?;
|
||||
let mut buf = self
|
||||
.get(relsize_key, lsn)
|
||||
.await
|
||||
.context("read relation size of {rel:?}")?;
|
||||
let relsize = buf.get_u32_le();
|
||||
|
||||
total_size += relsize as u64;
|
||||
@@ -494,7 +509,7 @@ impl Timeline {
|
||||
result.add_key(DBDIR_KEY);
|
||||
|
||||
// Fetch list of database dirs and iterate them
|
||||
let buf = self.get_download(DBDIR_KEY, lsn).await?;
|
||||
let buf = self.get(DBDIR_KEY, lsn).await?;
|
||||
let dbdir = DbDirectory::des(&buf).context("deserialization failure")?;
|
||||
|
||||
let mut dbs: Vec<(Oid, Oid)> = dbdir.dbdirs.keys().cloned().collect();
|
||||
@@ -503,15 +518,15 @@ impl Timeline {
|
||||
result.add_key(relmap_file_key(spcnode, dbnode));
|
||||
result.add_key(rel_dir_to_key(spcnode, dbnode));
|
||||
|
||||
let mut rels: Vec<RelTag> =
|
||||
with_ondemand_download(|| self.list_rels(spcnode, dbnode, lsn))
|
||||
.await?
|
||||
.into_iter()
|
||||
.collect();
|
||||
let mut rels: Vec<RelTag> = self
|
||||
.list_rels(spcnode, dbnode, lsn)
|
||||
.await?
|
||||
.into_iter()
|
||||
.collect();
|
||||
rels.sort_unstable();
|
||||
for rel in rels {
|
||||
let relsize_key = rel_size_to_key(rel);
|
||||
let mut buf = self.get_download(relsize_key, lsn).await?;
|
||||
let mut buf = self.get(relsize_key, lsn).await?;
|
||||
let relsize = buf.get_u32_le();
|
||||
|
||||
result.add_range(rel_block_to_key(rel, 0)..rel_block_to_key(rel, relsize));
|
||||
@@ -527,13 +542,13 @@ impl Timeline {
|
||||
] {
|
||||
let slrudir_key = slru_dir_to_key(kind);
|
||||
result.add_key(slrudir_key);
|
||||
let buf = self.get_download(slrudir_key, lsn).await?;
|
||||
let buf = self.get(slrudir_key, lsn).await?;
|
||||
let dir = SlruSegmentDirectory::des(&buf).context("deserialization failure")?;
|
||||
let mut segments: Vec<u32> = dir.segments.iter().cloned().collect();
|
||||
segments.sort_unstable();
|
||||
for segno in segments {
|
||||
let segsize_key = slru_segment_size_to_key(kind, segno);
|
||||
let mut buf = self.get_download(segsize_key, lsn).await?;
|
||||
let mut buf = self.get(segsize_key, lsn).await?;
|
||||
let segsize = buf.get_u32_le();
|
||||
|
||||
result.add_range(
|
||||
@@ -545,7 +560,7 @@ impl Timeline {
|
||||
|
||||
// Then pg_twophase
|
||||
result.add_key(TWOPHASEDIR_KEY);
|
||||
let buf = self.get_download(TWOPHASEDIR_KEY, lsn).await?;
|
||||
let buf = self.get(TWOPHASEDIR_KEY, lsn).await?;
|
||||
let twophase_dir = TwoPhaseDirectory::des(&buf).context("deserialization failure")?;
|
||||
let mut xids: Vec<TransactionId> = twophase_dir.xids.iter().cloned().collect();
|
||||
xids.sort_unstable();
|
||||
@@ -703,9 +718,14 @@ impl<'a> DatadirModification<'a> {
|
||||
}
|
||||
|
||||
/// Store a relmapper file (pg_filenode.map) in the repository
|
||||
pub fn put_relmap_file(&mut self, spcnode: Oid, dbnode: Oid, img: Bytes) -> anyhow::Result<()> {
|
||||
pub async fn put_relmap_file(
|
||||
&mut self,
|
||||
spcnode: Oid,
|
||||
dbnode: Oid,
|
||||
img: Bytes,
|
||||
) -> anyhow::Result<()> {
|
||||
// Add it to the directory (if it doesn't exist already)
|
||||
let buf = self.get(DBDIR_KEY).no_ondemand_download()?;
|
||||
let buf = self.get(DBDIR_KEY).await?;
|
||||
let mut dbdir = DbDirectory::des(&buf)?;
|
||||
|
||||
let r = dbdir.dbdirs.insert((spcnode, dbnode), true);
|
||||
@@ -731,9 +751,13 @@ impl<'a> DatadirModification<'a> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn put_twophase_file(&mut self, xid: TransactionId, img: Bytes) -> anyhow::Result<()> {
|
||||
pub async fn put_twophase_file(
|
||||
&mut self,
|
||||
xid: TransactionId,
|
||||
img: Bytes,
|
||||
) -> anyhow::Result<()> {
|
||||
// Add it to the directory entry
|
||||
let buf = self.get(TWOPHASEDIR_KEY).no_ondemand_download()?;
|
||||
let buf = self.get(TWOPHASEDIR_KEY).await?;
|
||||
let mut dir = TwoPhaseDirectory::des(&buf)?;
|
||||
if !dir.xids.insert(xid) {
|
||||
anyhow::bail!("twophase file for xid {} already exists", xid);
|
||||
@@ -757,16 +781,16 @@ impl<'a> DatadirModification<'a> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn drop_dbdir(&mut self, spcnode: Oid, dbnode: Oid) -> anyhow::Result<()> {
|
||||
pub async fn drop_dbdir(&mut self, spcnode: Oid, dbnode: Oid) -> anyhow::Result<()> {
|
||||
let req_lsn = self.tline.get_last_record_lsn();
|
||||
|
||||
let total_blocks = self
|
||||
.tline
|
||||
.get_db_size(spcnode, dbnode, req_lsn, true)
|
||||
.no_ondemand_download()?;
|
||||
.await?;
|
||||
|
||||
// Remove entry from dbdir
|
||||
let buf = self.get(DBDIR_KEY).no_ondemand_download()?;
|
||||
let buf = self.get(DBDIR_KEY).await?;
|
||||
let mut dir = DbDirectory::des(&buf)?;
|
||||
if dir.dbdirs.remove(&(spcnode, dbnode)).is_some() {
|
||||
let buf = DbDirectory::ser(&dir)?;
|
||||
@@ -789,11 +813,15 @@ impl<'a> DatadirModification<'a> {
|
||||
/// Create a relation fork.
|
||||
///
|
||||
/// 'nblocks' is the initial size.
|
||||
pub fn put_rel_creation(&mut self, rel: RelTag, nblocks: BlockNumber) -> anyhow::Result<()> {
|
||||
pub async fn put_rel_creation(
|
||||
&mut self,
|
||||
rel: RelTag,
|
||||
nblocks: BlockNumber,
|
||||
) -> anyhow::Result<()> {
|
||||
anyhow::ensure!(rel.relnode != 0, "invalid relnode");
|
||||
// It's possible that this is the first rel for this db in this
|
||||
// tablespace. Create the reldir entry for it if so.
|
||||
let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY).no_ondemand_download()?)?;
|
||||
let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY).await?)?;
|
||||
let rel_dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode);
|
||||
let mut rel_dir = if dbdir.dbdirs.get(&(rel.spcnode, rel.dbnode)).is_none() {
|
||||
// Didn't exist. Update dbdir
|
||||
@@ -805,7 +833,7 @@ impl<'a> DatadirModification<'a> {
|
||||
RelDirectory::default()
|
||||
} else {
|
||||
// reldir already exists, fetch it
|
||||
RelDirectory::des(&self.get(rel_dir_key).no_ondemand_download()?)?
|
||||
RelDirectory::des(&self.get(rel_dir_key).await?)?
|
||||
};
|
||||
|
||||
// Add the new relation to the rel directory entry, and write it back
|
||||
@@ -833,17 +861,17 @@ impl<'a> DatadirModification<'a> {
|
||||
}
|
||||
|
||||
/// Truncate relation
|
||||
pub fn put_rel_truncation(&mut self, rel: RelTag, nblocks: BlockNumber) -> anyhow::Result<()> {
|
||||
pub async fn put_rel_truncation(
|
||||
&mut self,
|
||||
rel: RelTag,
|
||||
nblocks: BlockNumber,
|
||||
) -> anyhow::Result<()> {
|
||||
anyhow::ensure!(rel.relnode != 0, "invalid relnode");
|
||||
let last_lsn = self.tline.get_last_record_lsn();
|
||||
if self
|
||||
.tline
|
||||
.get_rel_exists(rel, last_lsn, true)
|
||||
.no_ondemand_download()?
|
||||
{
|
||||
if self.tline.get_rel_exists(rel, last_lsn, true).await? {
|
||||
let size_key = rel_size_to_key(rel);
|
||||
// Fetch the old size first
|
||||
let old_size = self.get(size_key).no_ondemand_download()?.get_u32_le();
|
||||
let old_size = self.get(size_key).await?.get_u32_le();
|
||||
|
||||
// Update the entry with the new size.
|
||||
let buf = nblocks.to_le_bytes();
|
||||
@@ -863,12 +891,16 @@ impl<'a> DatadirModification<'a> {
|
||||
|
||||
/// Extend relation
|
||||
/// If new size is smaller, do nothing.
|
||||
pub fn put_rel_extend(&mut self, rel: RelTag, nblocks: BlockNumber) -> anyhow::Result<()> {
|
||||
pub async fn put_rel_extend(
|
||||
&mut self,
|
||||
rel: RelTag,
|
||||
nblocks: BlockNumber,
|
||||
) -> anyhow::Result<()> {
|
||||
anyhow::ensure!(rel.relnode != 0, "invalid relnode");
|
||||
|
||||
// Put size
|
||||
let size_key = rel_size_to_key(rel);
|
||||
let old_size = self.get(size_key).no_ondemand_download()?.get_u32_le();
|
||||
let old_size = self.get(size_key).await?.get_u32_le();
|
||||
|
||||
// only extend relation here. never decrease the size
|
||||
if nblocks > old_size {
|
||||
@@ -884,12 +916,12 @@ impl<'a> DatadirModification<'a> {
|
||||
}
|
||||
|
||||
/// Drop a relation.
|
||||
pub fn put_rel_drop(&mut self, rel: RelTag) -> anyhow::Result<()> {
|
||||
pub async fn put_rel_drop(&mut self, rel: RelTag) -> anyhow::Result<()> {
|
||||
anyhow::ensure!(rel.relnode != 0, "invalid relnode");
|
||||
|
||||
// Remove it from the directory entry
|
||||
let dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode);
|
||||
let buf = self.get(dir_key).no_ondemand_download()?;
|
||||
let buf = self.get(dir_key).await?;
|
||||
let mut dir = RelDirectory::des(&buf)?;
|
||||
|
||||
if dir.rels.remove(&(rel.relnode, rel.forknum)) {
|
||||
@@ -900,7 +932,7 @@ impl<'a> DatadirModification<'a> {
|
||||
|
||||
// update logical size
|
||||
let size_key = rel_size_to_key(rel);
|
||||
let old_size = self.get(size_key).no_ondemand_download()?.get_u32_le();
|
||||
let old_size = self.get(size_key).await?.get_u32_le();
|
||||
self.pending_nblocks -= old_size as i64;
|
||||
|
||||
// Remove entry from relation size cache
|
||||
@@ -912,7 +944,7 @@ impl<'a> DatadirModification<'a> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn put_slru_segment_creation(
|
||||
pub async fn put_slru_segment_creation(
|
||||
&mut self,
|
||||
kind: SlruKind,
|
||||
segno: u32,
|
||||
@@ -920,7 +952,7 @@ impl<'a> DatadirModification<'a> {
|
||||
) -> anyhow::Result<()> {
|
||||
// Add it to the directory entry
|
||||
let dir_key = slru_dir_to_key(kind);
|
||||
let buf = self.get(dir_key).no_ondemand_download()?;
|
||||
let buf = self.get(dir_key).await?;
|
||||
let mut dir = SlruSegmentDirectory::des(&buf)?;
|
||||
|
||||
if !dir.segments.insert(segno) {
|
||||
@@ -956,10 +988,10 @@ impl<'a> DatadirModification<'a> {
|
||||
}
|
||||
|
||||
/// This method is used for marking truncated SLRU files
|
||||
pub fn drop_slru_segment(&mut self, kind: SlruKind, segno: u32) -> anyhow::Result<()> {
|
||||
pub async fn drop_slru_segment(&mut self, kind: SlruKind, segno: u32) -> anyhow::Result<()> {
|
||||
// Remove it from the directory entry
|
||||
let dir_key = slru_dir_to_key(kind);
|
||||
let buf = self.get(dir_key).no_ondemand_download()?;
|
||||
let buf = self.get(dir_key).await?;
|
||||
let mut dir = SlruSegmentDirectory::des(&buf)?;
|
||||
|
||||
if !dir.segments.remove(&segno) {
|
||||
@@ -983,9 +1015,9 @@ impl<'a> DatadirModification<'a> {
|
||||
}
|
||||
|
||||
/// This method is used for marking truncated SLRU files
|
||||
pub fn drop_twophase_file(&mut self, xid: TransactionId) -> anyhow::Result<()> {
|
||||
pub async fn drop_twophase_file(&mut self, xid: TransactionId) -> anyhow::Result<()> {
|
||||
// Remove it from the directory entry
|
||||
let buf = self.get(TWOPHASEDIR_KEY).no_ondemand_download()?;
|
||||
let buf = self.get(TWOPHASEDIR_KEY).await?;
|
||||
let mut dir = TwoPhaseDirectory::des(&buf)?;
|
||||
|
||||
if !dir.xids.remove(&xid) {
|
||||
@@ -1079,7 +1111,7 @@ impl<'a> DatadirModification<'a> {
|
||||
|
||||
// Internal helper functions to batch the modifications
|
||||
|
||||
fn get(&self, key: Key) -> PageReconstructResult<Bytes> {
|
||||
async fn get(&self, key: Key) -> Result<Bytes, PageReconstructError> {
|
||||
// Have we already updated the same key? Read the pending updated
|
||||
// version in that case.
|
||||
//
|
||||
@@ -1087,18 +1119,20 @@ impl<'a> DatadirModification<'a> {
|
||||
// value that has been removed, deletion only avoids leaking storage.
|
||||
if let Some(value) = self.pending_updates.get(&key) {
|
||||
if let Value::Image(img) = value {
|
||||
PageReconstructResult::Success(img.clone())
|
||||
Ok(img.clone())
|
||||
} else {
|
||||
// Currently, we never need to read back a WAL record that we
|
||||
// inserted in the same "transaction". All the metadata updates
|
||||
// work directly with Images, and we never need to read actual
|
||||
// data pages. We could handle this if we had to, by calling
|
||||
// the walredo manager, but let's keep it simple for now.
|
||||
PageReconstructResult::from(anyhow::anyhow!("unexpected pending WAL record"))
|
||||
Err(PageReconstructError::from(anyhow::anyhow!(
|
||||
"unexpected pending WAL record"
|
||||
)))
|
||||
}
|
||||
} else {
|
||||
let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn);
|
||||
self.tline.get(key, lsn)
|
||||
self.tline.get(key, lsn).await
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -220,6 +220,8 @@ pub enum TaskKind {
|
||||
|
||||
// task that drives downloading layers
|
||||
DownloadAllRemoteLayers,
|
||||
// Task that calculates synthetic size for all active tenants
|
||||
CalculateSyntheticSize,
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
|
||||
@@ -38,6 +38,8 @@ use std::path::Path;
|
||||
use std::path::PathBuf;
|
||||
use std::process::Command;
|
||||
use std::process::Stdio;
|
||||
use std::sync::atomic::AtomicU64;
|
||||
use std::sync::atomic::Ordering;
|
||||
use std::sync::Arc;
|
||||
use std::sync::MutexGuard;
|
||||
use std::sync::{Mutex, RwLock};
|
||||
@@ -92,7 +94,7 @@ mod timeline;
|
||||
|
||||
pub mod size;
|
||||
|
||||
pub use timeline::{with_ondemand_download, PageReconstructError, PageReconstructResult, Timeline};
|
||||
pub use timeline::{PageReconstructError, Timeline};
|
||||
|
||||
// re-export this function so that page_cache.rs can use it.
|
||||
pub use crate::tenant::ephemeral_file::writeback as writeback_ephemeral_file;
|
||||
@@ -139,6 +141,7 @@ pub struct Tenant {
|
||||
|
||||
/// Cached logical sizes updated on each [`Tenant::gather_size_inputs`].
|
||||
cached_logical_sizes: tokio::sync::Mutex<HashMap<(TimelineId, Lsn), u64>>,
|
||||
cached_synthetic_tenant_size: Arc<AtomicU64>,
|
||||
}
|
||||
|
||||
/// A timeline with some of its files on disk, being initialized.
|
||||
@@ -438,8 +441,16 @@ struct RemoteStartupData {
|
||||
|
||||
impl Tenant {
|
||||
/// Yet another helper for timeline initialization.
|
||||
/// Contains common part for `load_local_timeline` and `load_remote_timeline`
|
||||
async fn setup_timeline(
|
||||
/// Contains the common part of `load_local_timeline` and `load_remote_timeline`.
|
||||
///
|
||||
/// - Initializes the Timeline struct and inserts it into the tenant's hash map
|
||||
/// - Scans the local timeline directory for layer files and builds the layer map
|
||||
/// - Downloads remote index file and adds remote files to the layer map
|
||||
/// - Schedules remote upload tasks for any files that are present locally but missing from remote storage.
|
||||
///
|
||||
/// If the operation fails, the timeline is left in the tenant's hash map in Broken state. On success,
|
||||
/// it is marked as Active.
|
||||
async fn timeline_init_and_sync(
|
||||
&self,
|
||||
timeline_id: TimelineId,
|
||||
remote_client: Option<RemoteTimelineClient>,
|
||||
@@ -482,10 +493,7 @@ impl Tenant {
|
||||
// But we shouldn't start walreceiver before we have all the data locally, because a working walreceiver
|
||||
// will ingest data which may require looking at the layers which are not yet available locally
|
||||
match timeline.initialize_with_lock(&mut timelines_accessor, true, false) {
|
||||
Ok(initialized_timeline) => {
|
||||
timelines_accessor.insert(timeline_id, initialized_timeline.clone());
|
||||
Ok(initialized_timeline)
|
||||
}
|
||||
Ok(new_timeline) => new_timeline,
|
||||
Err(e) => {
|
||||
error!("Failed to initialize timeline {tenant_id}/{timeline_id}: {e:?}");
|
||||
// FIXME using None is a hack, it won't hurt, just ugly.
|
||||
@@ -501,16 +509,14 @@ impl Tenant {
|
||||
None,
|
||||
)
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to crate broken timeline data for {tenant_id}/{timeline_id}"
|
||||
)
|
||||
format!("creating broken timeline data for {tenant_id}/{timeline_id}")
|
||||
})?;
|
||||
broken_timeline.set_state(TimelineState::Broken);
|
||||
timelines_accessor.insert(timeline_id, broken_timeline);
|
||||
Err(e)
|
||||
return Err(e);
|
||||
}
|
||||
}
|
||||
}?;
|
||||
};
|
||||
|
||||
if self.remote_storage.is_some() {
|
||||
// Reconcile local state with remote storage, downloading anything that's
|
||||
@@ -612,7 +618,7 @@ impl Tenant {
|
||||
#[instrument(skip(self), fields(tenant_id=%self.tenant_id))]
|
||||
async fn attach(self: &Arc<Tenant>) -> anyhow::Result<()> {
|
||||
// Create directory with marker file to indicate attaching state.
|
||||
// The load_local_tenants() function in tenant_mgr relies on the marker file
|
||||
// The load_local_tenants() function in tenant::mgr relies on the marker file
|
||||
// to determine whether a tenant has finished attaching.
|
||||
let tenant_dir = self.conf.tenant_path(&self.tenant_id);
|
||||
let marker_file = self.conf.tenant_attaching_mark_file_path(&self.tenant_id);
|
||||
@@ -783,7 +789,7 @@ impl Tenant {
|
||||
// cannot be older than the local one
|
||||
let local_metadata = None;
|
||||
|
||||
self.setup_timeline(
|
||||
self.timeline_init_and_sync(
|
||||
timeline_id,
|
||||
Some(remote_client),
|
||||
Some(RemoteStartupData {
|
||||
@@ -1048,7 +1054,7 @@ impl Tenant {
|
||||
None => None,
|
||||
};
|
||||
|
||||
self.setup_timeline(
|
||||
self.timeline_init_and_sync(
|
||||
timeline_id,
|
||||
remote_client,
|
||||
remote_startup_data,
|
||||
@@ -1722,6 +1728,7 @@ impl Tenant {
|
||||
remote_storage,
|
||||
state,
|
||||
cached_logical_sizes: tokio::sync::Mutex::new(HashMap::new()),
|
||||
cached_synthetic_tenant_size: Arc::new(AtomicU64::new(0)),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2359,6 +2366,24 @@ impl Tenant {
|
||||
|
||||
size::gather_inputs(self, logical_sizes_at_once, &mut shared_cache).await
|
||||
}
|
||||
|
||||
/// Calculate synthetic tenant size
|
||||
/// This is periodically called by background worker.
|
||||
/// result is cached in tenant struct
|
||||
#[instrument(skip_all, fields(tenant_id=%self.tenant_id))]
|
||||
pub async fn calculate_synthetic_size(&self) -> anyhow::Result<u64> {
|
||||
let inputs = self.gather_size_inputs().await?;
|
||||
|
||||
let size = inputs.calculate()?;
|
||||
|
||||
self.cached_synthetic_tenant_size
|
||||
.store(size, Ordering::Relaxed);
|
||||
|
||||
Ok(size)
|
||||
}
|
||||
pub fn get_cached_synthetic_size(&self) -> u64 {
|
||||
self.cached_synthetic_tenant_size.load(Ordering::Relaxed)
|
||||
}
|
||||
}
|
||||
|
||||
fn remove_timeline_and_uninit_mark(timeline_dir: &Path, uninit_mark: &Path) -> anyhow::Result<()> {
|
||||
@@ -2816,15 +2841,15 @@ mod tests {
|
||||
drop(writer);
|
||||
|
||||
assert_eq!(
|
||||
tline.get(*TEST_KEY, Lsn(0x10)).no_ondemand_download()?,
|
||||
tline.get(*TEST_KEY, Lsn(0x10)).await?,
|
||||
TEST_IMG("foo at 0x10")
|
||||
);
|
||||
assert_eq!(
|
||||
tline.get(*TEST_KEY, Lsn(0x1f)).no_ondemand_download()?,
|
||||
tline.get(*TEST_KEY, Lsn(0x1f)).await?,
|
||||
TEST_IMG("foo at 0x10")
|
||||
);
|
||||
assert_eq!(
|
||||
tline.get(*TEST_KEY, Lsn(0x20)).no_ondemand_download()?,
|
||||
tline.get(*TEST_KEY, Lsn(0x20)).await?,
|
||||
TEST_IMG("foo at 0x20")
|
||||
);
|
||||
|
||||
@@ -2903,15 +2928,15 @@ mod tests {
|
||||
|
||||
// Check page contents on both branches
|
||||
assert_eq!(
|
||||
from_utf8(&tline.get(TEST_KEY_A, Lsn(0x40)).no_ondemand_download()?)?,
|
||||
from_utf8(&tline.get(TEST_KEY_A, Lsn(0x40)).await?)?,
|
||||
"foo at 0x40"
|
||||
);
|
||||
assert_eq!(
|
||||
from_utf8(&newtline.get(TEST_KEY_A, Lsn(0x40)).no_ondemand_download()?)?,
|
||||
from_utf8(&newtline.get(TEST_KEY_A, Lsn(0x40)).await?)?,
|
||||
"bar at 0x40"
|
||||
);
|
||||
assert_eq!(
|
||||
from_utf8(&newtline.get(TEST_KEY_B, Lsn(0x40)).no_ondemand_download()?)?,
|
||||
from_utf8(&newtline.get(TEST_KEY_B, Lsn(0x40)).await?)?,
|
||||
"foobar at 0x20"
|
||||
);
|
||||
|
||||
@@ -3070,10 +3095,7 @@ mod tests {
|
||||
tenant
|
||||
.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO)
|
||||
.await?;
|
||||
assert!(newtline
|
||||
.get(*TEST_KEY, Lsn(0x25))
|
||||
.no_ondemand_download()
|
||||
.is_ok());
|
||||
assert!(newtline.get(*TEST_KEY, Lsn(0x25)).await.is_ok());
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -3103,7 +3125,7 @@ mod tests {
|
||||
|
||||
// Check that the data is still accessible on the branch.
|
||||
assert_eq!(
|
||||
newtline.get(*TEST_KEY, Lsn(0x50)).no_ondemand_download()?,
|
||||
newtline.get(*TEST_KEY, Lsn(0x50)).await?,
|
||||
TEST_IMG(&format!("foo at {}", Lsn(0x40)))
|
||||
);
|
||||
|
||||
@@ -3251,23 +3273,23 @@ mod tests {
|
||||
tline.compact().await?;
|
||||
|
||||
assert_eq!(
|
||||
tline.get(*TEST_KEY, Lsn(0x10)).no_ondemand_download()?,
|
||||
tline.get(*TEST_KEY, Lsn(0x10)).await?,
|
||||
TEST_IMG("foo at 0x10")
|
||||
);
|
||||
assert_eq!(
|
||||
tline.get(*TEST_KEY, Lsn(0x1f)).no_ondemand_download()?,
|
||||
tline.get(*TEST_KEY, Lsn(0x1f)).await?,
|
||||
TEST_IMG("foo at 0x10")
|
||||
);
|
||||
assert_eq!(
|
||||
tline.get(*TEST_KEY, Lsn(0x20)).no_ondemand_download()?,
|
||||
tline.get(*TEST_KEY, Lsn(0x20)).await?,
|
||||
TEST_IMG("foo at 0x20")
|
||||
);
|
||||
assert_eq!(
|
||||
tline.get(*TEST_KEY, Lsn(0x30)).no_ondemand_download()?,
|
||||
tline.get(*TEST_KEY, Lsn(0x30)).await?,
|
||||
TEST_IMG("foo at 0x30")
|
||||
);
|
||||
assert_eq!(
|
||||
tline.get(*TEST_KEY, Lsn(0x40)).no_ondemand_download()?,
|
||||
tline.get(*TEST_KEY, Lsn(0x40)).await?,
|
||||
TEST_IMG("foo at 0x40")
|
||||
);
|
||||
|
||||
@@ -3377,7 +3399,7 @@ mod tests {
|
||||
for (blknum, last_lsn) in updated.iter().enumerate() {
|
||||
test_key.field6 = blknum as u32;
|
||||
assert_eq!(
|
||||
tline.get(test_key, lsn).no_ondemand_download()?,
|
||||
tline.get(test_key, lsn).await?,
|
||||
TEST_IMG(&format!("{} at {}", blknum, last_lsn))
|
||||
);
|
||||
}
|
||||
@@ -3463,7 +3485,7 @@ mod tests {
|
||||
for (blknum, last_lsn) in updated.iter().enumerate() {
|
||||
test_key.field6 = blknum as u32;
|
||||
assert_eq!(
|
||||
tline.get(test_key, lsn).no_ondemand_download()?,
|
||||
tline.get(test_key, lsn).await?,
|
||||
TEST_IMG(&format!("{} at {}", blknum, last_lsn))
|
||||
);
|
||||
}
|
||||
@@ -3538,7 +3560,7 @@ mod tests {
|
||||
println!("checking [{idx}][{blknum}] at {lsn}");
|
||||
test_key.field6 = blknum as u32;
|
||||
assert_eq!(
|
||||
tline.get(test_key, *lsn).no_ondemand_download()?,
|
||||
tline.get(test_key, *lsn).await?,
|
||||
TEST_IMG(&format!("{idx} {blknum} at {lsn}"))
|
||||
);
|
||||
}
|
||||
|
||||
@@ -16,7 +16,7 @@
|
||||
//! unless the pageserver is configured without remote storage.
|
||||
//!
|
||||
//! We allocate the client instance in [Timeline][`crate::tenant::Timeline`], i.e.,
|
||||
//! either in [`crate::tenant_mgr`] during startup or when creating a new
|
||||
//! either in [`crate::tenant::mgr`] during startup or when creating a new
|
||||
//! timeline.
|
||||
//! However, the client does not become ready for use until we've initialized its upload queue:
|
||||
//!
|
||||
@@ -135,7 +135,7 @@
|
||||
//! - Initiate upload queue with that [`IndexPart`].
|
||||
//! - Reschedule all lost operations by comparing the local filesystem state
|
||||
//! and remote state as per [`IndexPart`]. This is done in
|
||||
//! [`Timeline::setup_timeline`] and [`Timeline::reconcile_with_remote`].
|
||||
//! [`Timeline::timeline_init_and_sync`] and [`Timeline::reconcile_with_remote`].
|
||||
//!
|
||||
//! Note that if we crash during file deletion between the index update
|
||||
//! that removes the file from the list of files, and deleting the remote file,
|
||||
@@ -756,7 +756,7 @@ impl RemoteTimelineClient {
|
||||
// Note: We only check for the shutdown requests between retries, so
|
||||
// if a shutdown request arrives while we're busy uploading, in the
|
||||
// upload::upload:*() call below, we will wait and not exit until it has
|
||||
// finisheed. We probably could cancel the upload by simply dropping
|
||||
// finished. We probably could cancel the upload by simply dropping
|
||||
// the Future, but we're not 100% sure if the remote storage library
|
||||
// is cancellation safe, so we don't dare to do that. Hopefully, the
|
||||
// upload finishes or times out soon enough.
|
||||
|
||||
@@ -44,6 +44,116 @@ struct TimelineInputs {
|
||||
next_gc_cutoff: Lsn,
|
||||
}
|
||||
|
||||
// Adjust BranchFrom sorting so that we always process ancestor
|
||||
// before descendants. This is needed to correctly calculate size of
|
||||
// descendant timelines.
|
||||
//
|
||||
// Note that we may have multiple BranchFroms at the same LSN, so we
|
||||
// need to sort them in the tree order.
|
||||
//
|
||||
// see updates_sort_with_branches_at_same_lsn test below
|
||||
fn sort_updates_in_tree_order(updates: Vec<Update>) -> anyhow::Result<Vec<Update>> {
|
||||
let mut sorted_updates = Vec::with_capacity(updates.len());
|
||||
let mut known_timelineids = HashSet::new();
|
||||
let mut i = 0;
|
||||
while i < updates.len() {
|
||||
let curr_upd = &updates[i];
|
||||
|
||||
if let Command::BranchFrom(parent_id) = curr_upd.command {
|
||||
let parent_id = match parent_id {
|
||||
Some(parent_id) if known_timelineids.contains(&parent_id) => {
|
||||
// we have already processed ancestor
|
||||
// process this BranchFrom Update normally
|
||||
known_timelineids.insert(curr_upd.timeline_id);
|
||||
sorted_updates.push(*curr_upd);
|
||||
i += 1;
|
||||
continue;
|
||||
}
|
||||
None => {
|
||||
known_timelineids.insert(curr_upd.timeline_id);
|
||||
sorted_updates.push(*curr_upd);
|
||||
i += 1;
|
||||
continue;
|
||||
}
|
||||
Some(parent_id) => parent_id,
|
||||
};
|
||||
|
||||
let mut j = i;
|
||||
|
||||
// we have not processed ancestor yet.
|
||||
// there is a chance that it is at the same Lsn
|
||||
if !known_timelineids.contains(&parent_id) {
|
||||
let mut curr_lsn_branchfroms: HashMap<TimelineId, Vec<(TimelineId, usize)>> =
|
||||
HashMap::new();
|
||||
|
||||
// inspect all branchpoints at the same lsn
|
||||
while j < updates.len() && updates[j].lsn == curr_upd.lsn {
|
||||
let lookahead_upd = &updates[j];
|
||||
j += 1;
|
||||
|
||||
if let Command::BranchFrom(lookahead_parent_id) = lookahead_upd.command {
|
||||
match lookahead_parent_id {
|
||||
Some(lookahead_parent_id)
|
||||
if !known_timelineids.contains(&lookahead_parent_id) =>
|
||||
{
|
||||
// we have not processed ancestor yet
|
||||
// store it for later
|
||||
let es =
|
||||
curr_lsn_branchfroms.entry(lookahead_parent_id).or_default();
|
||||
es.push((lookahead_upd.timeline_id, j));
|
||||
}
|
||||
_ => {
|
||||
// we have already processed ancestor
|
||||
// process this BranchFrom Update normally
|
||||
known_timelineids.insert(lookahead_upd.timeline_id);
|
||||
sorted_updates.push(*lookahead_upd);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// process BranchFroms in the tree order
|
||||
// check that we don't have a cycle if some entry is an orphan
|
||||
// (this should not happen, but better to be safe)
|
||||
let mut processed_some_entry = true;
|
||||
while processed_some_entry {
|
||||
processed_some_entry = false;
|
||||
|
||||
curr_lsn_branchfroms.retain(|parent_id, branchfroms| {
|
||||
if known_timelineids.contains(parent_id) {
|
||||
for (timeline_id, j) in branchfroms {
|
||||
known_timelineids.insert(*timeline_id);
|
||||
sorted_updates.push(updates[*j - 1]);
|
||||
}
|
||||
processed_some_entry = true;
|
||||
false
|
||||
} else {
|
||||
true
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
if !curr_lsn_branchfroms.is_empty() {
|
||||
// orphans are expected to be rare and transient between tenant reloads
|
||||
// for example, a broken ancestor without the child branch being broken.
|
||||
anyhow::bail!(
|
||||
"orphan branch(es) detected in BranchFroms: {curr_lsn_branchfroms:?}"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
assert!(j > i);
|
||||
i = j;
|
||||
} else {
|
||||
// not a BranchFrom, keep the same order
|
||||
sorted_updates.push(*curr_upd);
|
||||
i += 1;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(sorted_updates)
|
||||
}
|
||||
|
||||
/// Gathers the inputs for the tenant sizing model.
|
||||
///
|
||||
/// Tenant size does not consider the latest state, but only the state until next_gc_cutoff, which
|
||||
@@ -267,7 +377,11 @@ pub(super) async fn gather_inputs(
|
||||
// for branch points, which come as multiple updates at the same LSN, the Command::Update
|
||||
// is needed before a branch is made out of that branch Command::BranchFrom. this is
|
||||
// handled by the variant order in `Command`.
|
||||
//
|
||||
updates.sort_unstable();
|
||||
// And another sort to handle Command::BranchFrom ordering
|
||||
// in case when there are multiple branches at the same LSN.
|
||||
let sorted_updates = sort_updates_in_tree_order(updates)?;
|
||||
|
||||
let retention_period = match max_cutoff_distance {
|
||||
Some(max) => max.0,
|
||||
@@ -277,7 +391,7 @@ pub(super) async fn gather_inputs(
|
||||
};
|
||||
|
||||
Ok(ModelInputs {
|
||||
updates,
|
||||
updates: sorted_updates,
|
||||
retention_period,
|
||||
timeline_inputs,
|
||||
})
|
||||
@@ -295,6 +409,7 @@ impl ModelInputs {
|
||||
command: op,
|
||||
timeline_id,
|
||||
} = update;
|
||||
|
||||
let Lsn(now) = *lsn;
|
||||
match op {
|
||||
Command::Update(sz) => {
|
||||
@@ -304,7 +419,8 @@ impl ModelInputs {
|
||||
storage.insert_point(&Some(*timeline_id), "".into(), now, None);
|
||||
}
|
||||
Command::BranchFrom(parent) => {
|
||||
storage.branch(parent, Some(*timeline_id));
|
||||
// This branch command may fail if it cannot find a parent to branch from.
|
||||
storage.branch(parent, Some(*timeline_id))?;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -372,6 +488,7 @@ async fn calculate_logical_size(
|
||||
|
||||
let size_res = timeline
|
||||
.spawn_ondemand_logical_size_calculation(lsn)
|
||||
.instrument(info_span!("spawn_ondemand_logical_size_calculation"))
|
||||
.await?;
|
||||
Ok(TimelineAtLsnSizeResult(timeline, lsn, size_res))
|
||||
}
|
||||
@@ -463,3 +580,137 @@ fn verify_size_for_multiple_branches() {
|
||||
|
||||
assert_eq!(inputs.calculate().unwrap(), 36_409_872);
|
||||
}
|
||||
|
||||
/// Regression test for `sort_updates_in_tree_order`.
///
/// Several `BranchFrom` updates share the same LSN and are listed with a child
/// ahead of the timeline it branches from. The sorted result must list every
/// parent before its children while leaving all other updates in their
/// original positions.
#[test]
|
||||
fn updates_sort_with_branches_at_same_lsn() {
|
||||
use std::str::FromStr;
|
||||
use Command::{BranchFrom, EndOfBranch};
|
||||
|
||||
// Shorthand: parse an LSN literal, panicking if it is malformed.
macro_rules! lsn {
|
||||
($e:expr) => {
|
||||
Lsn::from_str($e).unwrap()
|
||||
};
|
||||
}
|
||||
|
||||
// Five distinct timeline ids, referred to below by index.
let ids = [
|
||||
TimelineId::from_str("00000000000000000000000000000000").unwrap(),
|
||||
TimelineId::from_str("11111111111111111111111111111111").unwrap(),
|
||||
TimelineId::from_str("22222222222222222222222222222222").unwrap(),
|
||||
TimelineId::from_str("33333333333333333333333333333333").unwrap(),
|
||||
TimelineId::from_str("44444444444444444444444444444444").unwrap(),
|
||||
];
|
||||
|
||||
// issue https://github.com/neondatabase/neon/issues/3179
|
||||
let commands = vec![
|
||||
Update {
|
||||
lsn: lsn!("0/0"),
|
||||
command: BranchFrom(None),
|
||||
timeline_id: ids[0],
|
||||
},
|
||||
Update {
|
||||
lsn: lsn!("0/169AD58"),
|
||||
command: Command::Update(25387008),
|
||||
timeline_id: ids[0],
|
||||
},
|
||||
// next three are wrongly sorted, because
|
||||
// ids[1] is branched from before ids[1] exists
|
||||
// and ids[2] is branched from before ids[2] exists
|
||||
Update {
|
||||
lsn: lsn!("0/169AD58"),
|
||||
command: BranchFrom(Some(ids[1])),
|
||||
timeline_id: ids[3],
|
||||
},
|
||||
Update {
|
||||
lsn: lsn!("0/169AD58"),
|
||||
command: BranchFrom(Some(ids[0])),
|
||||
timeline_id: ids[2],
|
||||
},
|
||||
Update {
|
||||
lsn: lsn!("0/169AD58"),
|
||||
command: BranchFrom(Some(ids[2])),
|
||||
timeline_id: ids[1],
|
||||
},
|
||||
Update {
|
||||
lsn: lsn!("0/1CA85B8"),
|
||||
command: Command::Update(28925952),
|
||||
timeline_id: ids[1],
|
||||
},
|
||||
Update {
|
||||
lsn: lsn!("0/1CD85B8"),
|
||||
command: Command::Update(29024256),
|
||||
timeline_id: ids[1],
|
||||
},
|
||||
Update {
|
||||
lsn: lsn!("0/1CD85B8"),
|
||||
command: BranchFrom(Some(ids[1])),
|
||||
timeline_id: ids[4],
|
||||
},
|
||||
Update {
|
||||
lsn: lsn!("0/22DCE70"),
|
||||
command: Command::Update(32546816),
|
||||
timeline_id: ids[3],
|
||||
},
|
||||
Update {
|
||||
lsn: lsn!("0/230CE70"),
|
||||
command: EndOfBranch,
|
||||
timeline_id: ids[3],
|
||||
},
|
||||
];
|
||||
|
||||
// Same updates as `commands`; only the three BranchFroms at 0/169AD58 are
// reordered so that the chain reads ids[0] -> ids[2] -> ids[1] -> ids[3].
let expected = vec![
|
||||
Update {
|
||||
lsn: lsn!("0/0"),
|
||||
command: BranchFrom(None),
|
||||
timeline_id: ids[0],
|
||||
},
|
||||
Update {
|
||||
lsn: lsn!("0/169AD58"),
|
||||
command: Command::Update(25387008),
|
||||
timeline_id: ids[0],
|
||||
},
|
||||
Update {
|
||||
lsn: lsn!("0/169AD58"),
|
||||
command: BranchFrom(Some(ids[0])),
|
||||
timeline_id: ids[2],
|
||||
},
|
||||
Update {
|
||||
lsn: lsn!("0/169AD58"),
|
||||
command: BranchFrom(Some(ids[2])),
|
||||
timeline_id: ids[1],
|
||||
},
|
||||
Update {
|
||||
lsn: lsn!("0/169AD58"),
|
||||
command: BranchFrom(Some(ids[1])),
|
||||
timeline_id: ids[3],
|
||||
},
|
||||
Update {
|
||||
lsn: lsn!("0/1CA85B8"),
|
||||
command: Command::Update(28925952),
|
||||
timeline_id: ids[1],
|
||||
},
|
||||
Update {
|
||||
lsn: lsn!("0/1CD85B8"),
|
||||
command: Command::Update(29024256),
|
||||
timeline_id: ids[1],
|
||||
},
|
||||
Update {
|
||||
lsn: lsn!("0/1CD85B8"),
|
||||
command: BranchFrom(Some(ids[1])),
|
||||
timeline_id: ids[4],
|
||||
},
|
||||
Update {
|
||||
lsn: lsn!("0/22DCE70"),
|
||||
command: Command::Update(32546816),
|
||||
timeline_id: ids[3],
|
||||
},
|
||||
Update {
|
||||
lsn: lsn!("0/230CE70"),
|
||||
command: EndOfBranch,
|
||||
timeline_id: ids[3],
|
||||
},
|
||||
];
|
||||
|
||||
let sorted_commands = sort_updates_in_tree_order(commands).unwrap();
|
||||
|
||||
assert_eq!(sorted_commands, expected);
|
||||
}
|
||||
|
||||
@@ -109,7 +109,7 @@ pub trait Layer: Send + Sync {
|
||||
/// See PageReconstructResult for possible return values. The collected data
|
||||
/// is appended to reconstruct_data; the caller should pass an empty struct
|
||||
/// on first call, or a struct with a cached older image of the page if one
|
||||
/// is available. If this returns PageReconstructResult::Continue, look up
|
||||
/// is available. If this returns ValueReconstructResult::Continue, look up
|
||||
/// the predecessor layer and call again with the same 'reconstruct_data' to
|
||||
/// collect more data.
|
||||
fn get_value_reconstruct_data(
|
||||
|
||||
@@ -83,7 +83,7 @@ async fn compaction_loop(tenant_id: TenantId) {
|
||||
tokio::select! {
|
||||
_ = task_mgr::shutdown_watcher() => {
|
||||
info!("received cancellation request during idling");
|
||||
break ;
|
||||
break;
|
||||
},
|
||||
_ = tokio::time::sleep(sleep_duration) => {},
|
||||
}
|
||||
|
||||
@@ -3,12 +3,12 @@
|
||||
use anyhow::{anyhow, bail, ensure, Context};
|
||||
use bytes::Bytes;
|
||||
use fail::fail_point;
|
||||
use futures::stream::FuturesUnordered;
|
||||
use futures::StreamExt;
|
||||
use itertools::Itertools;
|
||||
use once_cell::sync::OnceCell;
|
||||
use pageserver_api::models::{
|
||||
DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskState, TimelineState,
|
||||
DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest,
|
||||
DownloadRemoteLayersTaskState, TimelineState,
|
||||
};
|
||||
use tokio::sync::{oneshot, watch, Semaphore, TryAcquireError};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
@@ -193,22 +193,29 @@ pub struct Timeline {
|
||||
}
|
||||
|
||||
/// Internal structure to hold all data needed for logical size calculation.
|
||||
/// Calculation consists of two parts:
|
||||
/// 1. Initial size calculation. That might take a long time, because it requires
|
||||
/// reading all layers containing relation sizes up to the `initial_part_end`.
|
||||
///
|
||||
/// Calculation consists of two stages:
|
||||
///
|
||||
/// 1. Initial size calculation. That might take a long time, because it requires
|
||||
/// reading all layers containing relation sizes at `initial_part_end`.
|
||||
///
|
||||
/// 2. Collecting an incremental part and adding that to the initial size.
|
||||
/// Increments are appended on walreceiver writing new timeline data,
|
||||
/// which result in increase or decrease of the logical size.
|
||||
struct LogicalSize {
|
||||
/// Size, potentially slow to compute, derived from all layers located locally on this node's FS.
|
||||
/// Might require reading multiple layers, and even ancestor's layers, to collect the size.
|
||||
/// Size, potentially slow to compute. Calculating this might require reading multiple
|
||||
/// layers, and even ancestor's layers.
|
||||
///
|
||||
/// NOTE: initial size is not a constant and will change between restarts.
|
||||
/// NOTE: size at a given LSN is constant, but after a restart we will calculate
|
||||
/// the initial size at a different LSN.
|
||||
initial_logical_size: OnceCell<u64>,
|
||||
|
||||
/// Semaphore to track ongoing calculation of `initial_logical_size`.
|
||||
initial_size_computation: Arc<tokio::sync::Semaphore>,
|
||||
|
||||
/// Latest Lsn that has its size uncalculated, could be absent for freshly created timelines.
|
||||
initial_part_end: Option<Lsn>,
|
||||
|
||||
/// All other size changes after startup, combined together.
|
||||
///
|
||||
/// Size shouldn't ever be negative, but this is signed for two reasons:
|
||||
@@ -335,43 +342,6 @@ pub struct WalReceiverInfo {
|
||||
pub last_received_msg_ts: u128,
|
||||
}
|
||||
|
||||
/// Like `?`, but for [`PageReconstructResult`].
|
||||
/// Use it to bubble up the `NeedsDownload` and `Error` to the caller.
|
||||
///
|
||||
/// On `Success(value)` the macro evaluates to `value`. In the other two cases
/// it `return`s from the enclosing function, so that function must itself
/// return a `PageReconstructResult`.
///
/// Once `std::ops::Try` is stabilized, we should use it instead of this macro.
|
||||
#[macro_export]
|
||||
macro_rules! try_no_ondemand_download {
|
||||
($result:expr) => {{
|
||||
// Bind first so the argument expression is evaluated exactly once.
let result = $result;
|
||||
match result {
|
||||
PageReconstructResult::Success(value) => value,
|
||||
// Hand the download request to the caller unchanged.
PageReconstructResult::NeedsDownload(timeline, layer) => {
|
||||
return PageReconstructResult::NeedsDownload(timeline, layer);
|
||||
}
|
||||
// Hand the error to the caller unchanged.
PageReconstructResult::Error(e) => return PageReconstructResult::Error(e),
|
||||
}
|
||||
}};
|
||||
}
|
||||
|
||||
/// Replacement for `?` in functions that return [`PageReconstructResult`].
|
||||
///
|
||||
/// Given an `expr: Result<T, E>`, use `try_page_reconstruct_result!(expr)`
|
||||
/// instead of `(expr)?`.
|
||||
/// If `expr` is `Ok(v)`, the macro evaluates to `v`.
|
||||
/// If `expr` is `Err(e)`, the macro returns `PageReconstructResult::Error(e.into())`.
|
||||
///
|
||||
/// Once `std::ops::Try` is stabilized, we should use it instead of this macro.
|
||||
#[macro_export]
|
||||
macro_rules! try_page_reconstruct_result {
|
||||
($result:expr) => {{
|
||||
// Bind first so the argument expression is evaluated exactly once.
let result = $result;
|
||||
match result {
|
||||
Ok(v) => v,
|
||||
// Early-return from the enclosing function with the converted error.
Err(e) => return PageReconstructResult::from(e),
|
||||
}
|
||||
}};
|
||||
}
|
||||
|
||||
///
|
||||
/// Information about how much history needs to be retained, needed by
|
||||
/// Garbage Collection.
|
||||
@@ -401,21 +371,13 @@ pub struct GcInfo {
|
||||
pub pitr_cutoff: Lsn,
|
||||
}
|
||||
|
||||
/// Three-way outcome of a page reconstruction operation: a value, a request
/// to download a remote layer first, or an error.
pub enum PageReconstructResult<T> {
|
||||
/// The operation completed and produced a value.
Success(T),
|
||||
/// The given RemoteLayer needs to be downloaded and replaced in the timeline's layer map
|
||||
/// for the operation to succeed. Use [`Timeline::download_remote_layer`] to do it, then
|
||||
/// retry the operation that returned this error.
|
||||
NeedsDownload(Weak<Timeline>, Weak<RemoteLayer>),
|
||||
/// The operation failed with the contained error.
Error(PageReconstructError),
|
||||
}
|
||||
|
||||
/// An error happened in a get() operation.
|
||||
#[derive(thiserror::Error)]
|
||||
pub enum PageReconstructError {
|
||||
/// Any other failure, carried as an `anyhow::Error`.
#[error(transparent)]
|
||||
Other(#[from] anyhow::Error), // source and Display delegate to anyhow::Error
|
||||
|
||||
/// An error happened replaying WAL records
|
||||
#[error(transparent)]
|
||||
WalRedo(#[from] crate::walredo::WalRedoError),
|
||||
}
|
||||
@@ -429,49 +391,6 @@ impl std::fmt::Debug for PageReconstructError {
|
||||
}
|
||||
}
|
||||
|
||||
/// This impl makes it so you can substitute return type
|
||||
/// `Result<T, E>` with `PageReconstructResult<T>` in functions
|
||||
/// and existing `?` will generally continue to work.
|
||||
/// This works because any error type that is `Into<PageReconstructError>`
|
||||
/// (for example `anyhow::Error`) is converted into the `Error` variant.
|
||||
impl<E, T> From<E> for PageReconstructResult<T>
|
||||
where
|
||||
E: Into<PageReconstructError>,
|
||||
{
|
||||
fn from(e: E) -> Self {
|
||||
Self::Error(e.into())
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> PageReconstructResult<T> {
|
||||
/// Treat the need for on-demand download as an error.
|
||||
///
|
||||
/// **Avoid this function in new code** if you can help it,
|
||||
/// as on-demand download will become the norm in the future,
|
||||
/// especially once we implement layer file eviction.
|
||||
///
|
||||
/// If you are in an async function, use [`with_ondemand_download`]
|
||||
/// to do the download right here.
|
||||
///
|
||||
/// If you are in a sync function, change its return type from
|
||||
/// `Result<T, E>` to `PageReconstructResult<T>` and bubble up
|
||||
/// the non-success cases of `PageReconstructResult<T>` to the caller.
|
||||
/// This gives them a chance to do the download and retry.
|
||||
/// Consider using [`try_no_ondemand_download`] for convenience.
|
||||
///
|
||||
/// For more background, read the comment on [`with_ondemand_download`].
|
||||
///
/// Returns `Ok(value)` for `Success(value)`. Both `NeedsDownload` and
/// `Error` are turned into an `anyhow` error.
pub fn no_ondemand_download(self) -> anyhow::Result<T> {
|
||||
match self {
|
||||
PageReconstructResult::Success(value) => Ok(value),
|
||||
// TODO print more info about the timeline
|
||||
// The timeline and layer references are discarded; only a fixed message is reported.
PageReconstructResult::NeedsDownload(_, _) => anyhow::bail!("Layer needs downloading"),
|
||||
PageReconstructResult::Error(e) => {
|
||||
// Keep the original error as the source and add a context message on top.
Err(anyhow::Error::new(e).context("Failed to reconstruct the page"))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Public interface functions
|
||||
impl Timeline {
|
||||
/// Get the LSN where this branch was created
|
||||
@@ -493,15 +412,19 @@ impl Timeline {
|
||||
|
||||
/// Look up given page version.
|
||||
///
|
||||
/// NOTE: It is considered an error to 'get' a key that doesn't exist. The abstraction
|
||||
/// above this needs to store suitable metadata to track what data exists with
|
||||
/// what keys, in separate metadata entries. If a non-existent key is requested,
|
||||
/// the Repository implementation may incorrectly return a value from an ancestor
|
||||
/// branch, for example, or waste a lot of cycles chasing the non-existing key.
|
||||
/// If a remote layer file is needed, it is downloaded as part of this
|
||||
/// call.
|
||||
///
|
||||
pub fn get(&self, key: Key, lsn: Lsn) -> PageReconstructResult<Bytes> {
|
||||
/// NOTE: It is considered an error to 'get' a key that doesn't exist. The
|
||||
/// abstraction above this needs to store suitable metadata to track what
|
||||
/// data exists with what keys, in separate metadata entries. If a
|
||||
/// non-existent key is requested, we may incorrectly return a value from
|
||||
/// an ancestor branch, for example, or waste a lot of cycles chasing the
|
||||
/// non-existing key.
|
||||
///
|
||||
pub async fn get(&self, key: Key, lsn: Lsn) -> Result<Bytes, PageReconstructError> {
|
||||
if !lsn.is_valid() {
|
||||
return PageReconstructResult::from(anyhow!("Invalid LSN"));
|
||||
return Err(PageReconstructError::Other(anyhow::anyhow!("Invalid LSN")));
|
||||
}
|
||||
|
||||
// Check the page cache. We will get back the most recent page with lsn <= `lsn`.
|
||||
@@ -512,7 +435,7 @@ impl Timeline {
|
||||
Some((cached_lsn, cached_img)) => {
|
||||
match cached_lsn.cmp(&lsn) {
|
||||
Ordering::Less => {} // there might be WAL between cached_lsn and lsn, we need to check
|
||||
Ordering::Equal => return PageReconstructResult::Success(cached_img), // exact LSN match, return the image
|
||||
Ordering::Equal => return Ok(cached_img), // exact LSN match, return the image
|
||||
Ordering::Greater => {
|
||||
unreachable!("the returned lsn should never be after the requested lsn")
|
||||
}
|
||||
@@ -527,18 +450,14 @@ impl Timeline {
|
||||
img: cached_page_img,
|
||||
};
|
||||
|
||||
try_no_ondemand_download!(self.get_reconstruct_data(key, lsn, &mut reconstruct_state));
|
||||
self.get_reconstruct_data(key, lsn, &mut reconstruct_state)
|
||||
.await?;
|
||||
|
||||
self.metrics
|
||||
.reconstruct_time_histo
|
||||
.observe_closure_duration(|| self.reconstruct_value(key, lsn, reconstruct_state))
|
||||
}
|
||||
|
||||
// Like get(), but if a remote layer file is needed, it is downloaded as part of this call.
|
||||
// Implemented by handing a closure that calls `get` to `with_ondemand_download`
// and awaiting the result.
pub async fn get_download(&self, key: Key, lsn: Lsn) -> anyhow::Result<Bytes> {
|
||||
with_ondemand_download(|| self.get(key, lsn)).await
|
||||
}
|
||||
|
||||
/// Get last or prev record separately. Same as get_last_record_rlsn().last/prev.
|
||||
pub fn get_last_record_lsn(&self) -> Lsn {
|
||||
self.last_record_lsn.load().last
|
||||
@@ -1511,7 +1430,8 @@ impl Timeline {
|
||||
|
||||
/// Calculate the logical size of the database at the latest LSN.
|
||||
///
|
||||
/// NOTE: counted incrementally, includes ancestors, this can be a slow operation.
|
||||
/// NOTE: counted incrementally, includes ancestors. This can be a slow operation,
|
||||
/// especially if we need to download remote layers.
|
||||
async fn calculate_logical_size(
|
||||
&self,
|
||||
up_to_lsn: Lsn,
|
||||
@@ -1630,12 +1550,12 @@ impl Timeline {
|
||||
///
|
||||
/// This function takes the current timeline's locked LayerMap as an argument,
|
||||
/// so callers can avoid potential race conditions.
|
||||
fn get_reconstruct_data(
|
||||
async fn get_reconstruct_data(
|
||||
&self,
|
||||
key: Key,
|
||||
request_lsn: Lsn,
|
||||
reconstruct_state: &mut ValueReconstructState,
|
||||
) -> PageReconstructResult<()> {
|
||||
) -> Result<(), PageReconstructError> {
|
||||
// Start from the current timeline.
|
||||
let mut timeline_owned;
|
||||
let mut timeline = self;
|
||||
@@ -1662,34 +1582,34 @@ impl Timeline {
|
||||
// The function should have updated 'state'
|
||||
//info!("CALLED for {} at {}: {:?} with {} records, cached {}", key, cont_lsn, result, reconstruct_state.records.len(), cached_lsn);
|
||||
match result {
|
||||
ValueReconstructResult::Complete => return PageReconstructResult::Success(()),
|
||||
ValueReconstructResult::Complete => return Ok(()),
|
||||
ValueReconstructResult::Continue => {
|
||||
// If we reached an earlier cached page image, we're done.
|
||||
if cont_lsn == cached_lsn + 1 {
|
||||
self.metrics.materialized_page_cache_hit_counter.inc_by(1);
|
||||
return PageReconstructResult::Success(());
|
||||
return Ok(());
|
||||
}
|
||||
if prev_lsn <= cont_lsn {
|
||||
// Didn't make any progress in last iteration. Error out to avoid
|
||||
// getting stuck in the loop.
|
||||
return layer_traversal_error(format!(
|
||||
return Err(layer_traversal_error(format!(
|
||||
"could not find layer with more data for key {} at LSN {}, request LSN {}, ancestor {}",
|
||||
key,
|
||||
Lsn(cont_lsn.0 - 1),
|
||||
request_lsn,
|
||||
timeline.ancestor_lsn
|
||||
), traversal_path);
|
||||
), traversal_path));
|
||||
}
|
||||
prev_lsn = cont_lsn;
|
||||
}
|
||||
ValueReconstructResult::Missing => {
|
||||
return layer_traversal_error(
|
||||
return Err(layer_traversal_error(
|
||||
format!(
|
||||
"could not find data for key {} at LSN {}, for request at LSN {}",
|
||||
key, cont_lsn, request_lsn
|
||||
),
|
||||
traversal_path,
|
||||
);
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1702,7 +1622,7 @@ impl Timeline {
|
||||
);
|
||||
let ancestor = match timeline.get_ancestor_timeline() {
|
||||
Ok(timeline) => timeline,
|
||||
Err(e) => return PageReconstructResult::from(e),
|
||||
Err(e) => return Err(PageReconstructError::from(e)),
|
||||
};
|
||||
timeline_owned = ancestor;
|
||||
timeline = &*timeline_owned;
|
||||
@@ -1711,7 +1631,7 @@ impl Timeline {
|
||||
}
|
||||
|
||||
#[allow(clippy::never_loop)] // see comment at bottom of this loop
|
||||
'_layer_map_search: loop {
|
||||
'layer_map_search: loop {
|
||||
let remote_layer = {
|
||||
let layers = timeline.layers.read().unwrap();
|
||||
|
||||
@@ -1730,7 +1650,7 @@ impl Timeline {
|
||||
reconstruct_state,
|
||||
) {
|
||||
Ok(result) => result,
|
||||
Err(e) => return PageReconstructResult::from(e),
|
||||
Err(e) => return Err(PageReconstructError::from(e)),
|
||||
};
|
||||
cont_lsn = lsn_floor;
|
||||
traversal_path.push((
|
||||
@@ -1755,7 +1675,7 @@ impl Timeline {
|
||||
reconstruct_state,
|
||||
) {
|
||||
Ok(result) => result,
|
||||
Err(e) => return PageReconstructResult::from(e),
|
||||
Err(e) => return Err(PageReconstructError::from(e)),
|
||||
};
|
||||
cont_lsn = lsn_floor;
|
||||
traversal_path.push((
|
||||
@@ -1788,7 +1708,7 @@ impl Timeline {
|
||||
reconstruct_state,
|
||||
) {
|
||||
Ok(result) => result,
|
||||
Err(e) => return PageReconstructResult::from(e),
|
||||
Err(e) => return Err(PageReconstructError::from(e)),
|
||||
};
|
||||
cont_lsn = lsn_floor;
|
||||
traversal_path.push((
|
||||
@@ -1812,27 +1732,24 @@ impl Timeline {
|
||||
continue 'outer;
|
||||
}
|
||||
};
|
||||
// Indicate to the caller that we need remote_layer replaced with a downloaded
|
||||
// layer in the layer map. The control flow could be a lot simpler, but the point
|
||||
// of this commit is to prepare this function to
|
||||
// 1. become async
|
||||
// 2. do the download right here, using
|
||||
// ```
|
||||
// download_remote_layer().await?;
|
||||
// continue 'layer_map_search;
|
||||
// ```
|
||||
// For (2), current rustc requires that the layers lock guard is not in scope.
|
||||
// Hence, the complicated control flow.
|
||||
// Download the remote_layer and replace it in the layer map.
|
||||
// For that, we need to release the mutex. Otherwise, we'd deadlock.
|
||||
//
|
||||
// The control flow is so weird here because `drop(layers)` inside
|
||||
// the if stmt above is not enough for current rustc: it requires
|
||||
// that the layers lock guard is not in scope across the download
|
||||
// await point.
|
||||
let remote_layer_as_persistent: Arc<dyn PersistentLayer> =
|
||||
Arc::clone(&remote_layer) as Arc<dyn PersistentLayer>;
|
||||
info!(
|
||||
"need remote layer {}",
|
||||
remote_layer_as_persistent.traversal_id()
|
||||
);
|
||||
return PageReconstructResult::NeedsDownload(
|
||||
Weak::clone(&timeline.myself),
|
||||
Arc::downgrade(&remote_layer),
|
||||
);
|
||||
let id = remote_layer_as_persistent.traversal_id();
|
||||
info!("need remote layer {id}");
|
||||
|
||||
// The next layer doesn't exist locally. Need to download it.
|
||||
// (The control flow is a bit complicated here because we must drop the 'layers'
|
||||
// lock before awaiting on the Future.)
|
||||
info!("on-demand downloading remote layer {id}");
|
||||
timeline.download_remote_layer(remote_layer).await?;
|
||||
continue 'layer_map_search;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -2270,7 +2187,7 @@ impl Timeline {
|
||||
partitioning: &KeyPartitioning,
|
||||
lsn: Lsn,
|
||||
force: bool,
|
||||
) -> anyhow::Result<HashMap<LayerFileName, LayerFileMetadata>> {
|
||||
) -> Result<HashMap<LayerFileName, LayerFileMetadata>, PageReconstructError> {
|
||||
let timer = self.metrics.create_images_time_histo.start_timer();
|
||||
let mut image_layers: Vec<ImageLayer> = Vec::new();
|
||||
for partition in partitioning.parts.iter() {
|
||||
@@ -2286,13 +2203,15 @@ impl Timeline {
|
||||
)?;
|
||||
|
||||
fail_point!("image-layer-writer-fail-before-finish", |_| {
|
||||
anyhow::bail!("failpoint image-layer-writer-fail-before-finish");
|
||||
Err(PageReconstructError::Other(anyhow::anyhow!(
|
||||
"failpoint image-layer-writer-fail-before-finish"
|
||||
)))
|
||||
});
|
||||
|
||||
for range in &partition.ranges {
|
||||
let mut key = range.start;
|
||||
while key < range.end {
|
||||
let img = match self.get_download(key, lsn).await {
|
||||
let img = match self.get(key, lsn).await {
|
||||
Ok(img) => img,
|
||||
Err(err) => {
|
||||
// If we fail to reconstruct a VM or FSM page, we can zero the
|
||||
@@ -2343,7 +2262,7 @@ impl Timeline {
|
||||
self.conf.timeline_path(&self.timeline_id, &self.tenant_id),
|
||||
))
|
||||
.collect::<Vec<_>>();
|
||||
par_fsync::par_fsync(&all_paths)?;
|
||||
par_fsync::par_fsync(&all_paths).context("fsync of newly created layer files")?;
|
||||
|
||||
let mut layer_paths_to_upload = HashMap::with_capacity(image_layers.len());
|
||||
|
||||
@@ -2351,7 +2270,10 @@ impl Timeline {
|
||||
let timeline_path = self.conf.timeline_path(&self.timeline_id, &self.tenant_id);
|
||||
for l in image_layers {
|
||||
let path = l.filename();
|
||||
let metadata = timeline_path.join(path.file_name()).metadata()?;
|
||||
let metadata = timeline_path
|
||||
.join(path.file_name())
|
||||
.metadata()
|
||||
.context("reading metadata of layer file {path}")?;
|
||||
|
||||
layer_paths_to_upload.insert(path, LayerFileMetadata::new(metadata.len()));
|
||||
|
||||
@@ -2752,8 +2674,7 @@ impl Timeline {
|
||||
if let Some(pitr_cutoff_timestamp) = now.checked_sub(pitr) {
|
||||
let pitr_timestamp = to_pg_timestamp(pitr_cutoff_timestamp);
|
||||
|
||||
match with_ondemand_download(|| self.find_lsn_for_timestamp(pitr_timestamp)).await?
|
||||
{
|
||||
match self.find_lsn_for_timestamp(pitr_timestamp).await? {
|
||||
LsnForTimestamp::Present(lsn) => lsn,
|
||||
LsnForTimestamp::Future(lsn) => {
|
||||
// The timestamp is in the future. That sounds impossible,
|
||||
@@ -3022,7 +2943,7 @@ impl Timeline {
|
||||
key: Key,
|
||||
request_lsn: Lsn,
|
||||
mut data: ValueReconstructState,
|
||||
) -> PageReconstructResult<Bytes> {
|
||||
) -> Result<Bytes, PageReconstructError> {
|
||||
// Perform WAL redo if needed
|
||||
data.records.reverse();
|
||||
|
||||
@@ -3030,15 +2951,16 @@ impl Timeline {
|
||||
if data.records.is_empty() {
|
||||
if let Some((img_lsn, img)) = &data.img {
|
||||
trace!(
|
||||
"found page image for key {} at {}, no WAL redo required",
|
||||
"found page image for key {} at {}, no WAL redo required, req LSN {}",
|
||||
key,
|
||||
img_lsn
|
||||
img_lsn,
|
||||
request_lsn,
|
||||
);
|
||||
PageReconstructResult::Success(img.clone())
|
||||
Ok(img.clone())
|
||||
} else {
|
||||
PageReconstructResult::from(anyhow!(
|
||||
Err(PageReconstructError::from(anyhow!(
|
||||
"base image for {key} at {request_lsn} not found"
|
||||
))
|
||||
)))
|
||||
}
|
||||
} else {
|
||||
// We need to do WAL redo.
|
||||
@@ -3046,12 +2968,12 @@ impl Timeline {
|
||||
// If we don't have a base image, then the oldest WAL record better initialize
|
||||
// the page
|
||||
if data.img.is_none() && !data.records.first().unwrap().1.will_init() {
|
||||
PageReconstructResult::from(anyhow!(
|
||||
Err(PageReconstructError::from(anyhow!(
|
||||
"Base image for {} at {} not found, but got {} WAL records",
|
||||
key,
|
||||
request_lsn,
|
||||
data.records.len()
|
||||
))
|
||||
)))
|
||||
} else {
|
||||
if data.img.is_some() {
|
||||
trace!(
|
||||
@@ -3072,7 +2994,7 @@ impl Timeline {
|
||||
.context("Failed to reconstruct a page image:")
|
||||
{
|
||||
Ok(img) => img,
|
||||
Err(e) => return PageReconstructResult::from(e),
|
||||
Err(e) => return Err(PageReconstructError::from(e)),
|
||||
};
|
||||
|
||||
if img.len() == page_cache::PAGE_SZ {
|
||||
@@ -3087,11 +3009,11 @@ impl Timeline {
|
||||
)
|
||||
.context("Materialized page memoization failed")
|
||||
{
|
||||
return PageReconstructResult::from(e);
|
||||
return Err(PageReconstructError::from(e));
|
||||
}
|
||||
}
|
||||
|
||||
PageReconstructResult::Success(img)
|
||||
Ok(img)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -3117,7 +3039,7 @@ impl Timeline {
|
||||
/// So, the current download attempt will run to completion even if we stop polling.
|
||||
#[instrument(skip_all, fields(tenant_id=%self.tenant_id, timeline_id=%self.timeline_id, layer=%remote_layer.short_id()))]
|
||||
pub async fn download_remote_layer(
|
||||
self: Arc<Self>,
|
||||
&self,
|
||||
remote_layer: Arc<RemoteLayer>,
|
||||
) -> anyhow::Result<()> {
|
||||
let permit = match Arc::clone(&remote_layer.ongoing_download)
|
||||
@@ -3133,6 +3055,7 @@ impl Timeline {
|
||||
|
||||
let (sender, receiver) = tokio::sync::oneshot::channel();
|
||||
// Spawn a task so that download does not outlive timeline when we detach tenant / delete timeline.
|
||||
let self_clone = self.myself.upgrade().expect("timeline is gone");
|
||||
task_mgr::spawn(
|
||||
&tokio::runtime::Handle::current(),
|
||||
TaskKind::RemoteDownloadTask,
|
||||
@@ -3141,7 +3064,7 @@ impl Timeline {
|
||||
&format!("download layer {}", remote_layer.short_id()),
|
||||
false,
|
||||
async move {
|
||||
let remote_client = self.remote_client.as_ref().unwrap();
|
||||
let remote_client = self_clone.remote_client.as_ref().unwrap();
|
||||
|
||||
// Does retries + exponential back-off internally.
|
||||
// When this fails, don't layer further retry attempts here.
|
||||
@@ -3152,12 +3075,12 @@ impl Timeline {
|
||||
if let Ok(size) = &result {
|
||||
// XXX the temp file is still around in Err() case
|
||||
// and consumes space until we clean up upon pageserver restart.
|
||||
self.metrics.resident_physical_size_gauge.add(*size);
|
||||
self_clone.metrics.resident_physical_size_gauge.add(*size);
|
||||
|
||||
// Download complete. Replace the RemoteLayer with the corresponding
|
||||
// Delta- or ImageLayer in the layer map.
|
||||
let new_layer = remote_layer.create_downloaded_layer(self.conf, *size);
|
||||
let mut layers = self.layers.write().unwrap();
|
||||
let new_layer = remote_layer.create_downloaded_layer(self_clone.conf, *size);
|
||||
let mut layers = self_clone.layers.write().unwrap();
|
||||
{
|
||||
let l: Arc<dyn PersistentLayer> = remote_layer.clone();
|
||||
layers.remove_historic(l);
|
||||
@@ -3193,6 +3116,7 @@ impl Timeline {
|
||||
|
||||
pub async fn spawn_download_all_remote_layers(
|
||||
self: Arc<Self>,
|
||||
request: DownloadRemoteLayersTaskSpawnRequest,
|
||||
) -> Result<DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskInfo> {
|
||||
let mut status_guard = self.download_all_remote_layers_task_info.write().unwrap();
|
||||
if let Some(st) = &*status_guard {
|
||||
@@ -3216,7 +3140,7 @@ impl Timeline {
|
||||
"download all remote layers task",
|
||||
false,
|
||||
async move {
|
||||
self_clone.download_all_remote_layers().await;
|
||||
self_clone.download_all_remote_layers(request).await;
|
||||
let mut status_guard = self_clone.download_all_remote_layers_task_info.write().unwrap();
|
||||
match &mut *status_guard {
|
||||
None => {
|
||||
@@ -3248,20 +3172,23 @@ impl Timeline {
|
||||
Ok(initial_info)
|
||||
}
|
||||
|
||||
async fn download_all_remote_layers(self: &Arc<Self>) {
|
||||
let mut downloads: FuturesUnordered<_> = {
|
||||
async fn download_all_remote_layers(
|
||||
self: &Arc<Self>,
|
||||
request: DownloadRemoteLayersTaskSpawnRequest,
|
||||
) {
|
||||
let mut downloads = Vec::new();
|
||||
{
|
||||
let layers = self.layers.read().unwrap();
|
||||
layers
|
||||
.iter_historic_layers()
|
||||
.filter_map(|l| l.downcast_remote_layer())
|
||||
.map({
|
||||
|l| {
|
||||
let self_clone = Arc::clone(self);
|
||||
self_clone.download_remote_layer(l)
|
||||
}
|
||||
})
|
||||
.collect()
|
||||
};
|
||||
.map(|l| self.download_remote_layer(l))
|
||||
.for_each(|dl| downloads.push(dl))
|
||||
}
|
||||
let total_layer_count = downloads.len();
|
||||
// limit download concurrency as specified in request
|
||||
let downloads = futures::stream::iter(downloads);
|
||||
let mut downloads = downloads.buffer_unordered(request.max_concurrent_downloads.get());
|
||||
|
||||
macro_rules! lock_status {
|
||||
($st:ident) => {
|
||||
@@ -3282,7 +3209,7 @@ impl Timeline {
|
||||
|
||||
{
|
||||
lock_status!(st);
|
||||
st.total_layer_count = downloads.len().try_into().unwrap();
|
||||
st.total_layer_count = total_layer_count as u64;
|
||||
}
|
||||
loop {
|
||||
tokio::select! {
|
||||
@@ -3321,101 +3248,15 @@ impl Timeline {
|
||||
}
|
||||
}
|
||||
|
||||
/// Helper function to deal with [`PageReconstructResult`].
|
||||
///
|
||||
/// Takes a sync closure that returns a [`PageReconstructResult`].
|
||||
/// If it is [`PageReconstructResult::NeedsDownload`],
|
||||
/// do the download and retry the closure.
|
||||
///
|
||||
/// ### Background
|
||||
///
|
||||
/// This is a crutch to make on-demand downloads efficient in
|
||||
/// our async-sync-async sandwich codebase. Some context:
|
||||
///
|
||||
/// - The code that does the downloads uses async Rust.
|
||||
/// - The code that initiates download is many levels of sync Rust.
|
||||
/// - The sync code must wait for the download to finish to
|
||||
/// make further progress.
|
||||
/// - The sync code is invoked directly from async functions upstack.
|
||||
///
|
||||
/// Example (there are also much worse ones where the sandwich is taller)
|
||||
///
|
||||
/// async handle_get_page_at_lsn_request page_service.rs
|
||||
/// sync get_rel_page_at_lsn timeline.rs
|
||||
/// sync timeline.get timeline.rs
|
||||
/// sync get_reconstruct_data timeline.rs
|
||||
/// async download_remote_layer timeline.rs
|
||||
///
|
||||
/// It is not possible to Timeline::download_remote_layer().await within
|
||||
/// get_reconstruct_data, so instead, we return [`PageReconstructResult::NeedsDownload`]
|
||||
/// which contains references to the [`Timeline`] and [`RemoteLayer`].
|
||||
/// We bubble that error upstack to the async code, which can then call
|
||||
/// `Timeline::download_remote_layer().await`.
|
||||
/// That is _efficient_ because tokio can use the same OS thread to do
|
||||
/// other work while we're waiting for the download.
|
||||
///
|
||||
/// It is a deliberate decision to use a new result type to communicate
|
||||
/// the need for download instead of adding another variant to [`PageReconstructError`].
|
||||
/// The reason is that with the latter approach, any place that does
|
||||
/// `?` on a `Result<T, PageReconstructError>` will implicitly ignore the
|
||||
/// need for download. We want that to be explicit, so that
|
||||
/// - the code base becomes greppable for places that don't do a download
|
||||
/// - future code changes will need to explicitly address on-demand download
|
||||
///
|
||||
/// Alternatives to consider in the future:
|
||||
///
|
||||
/// - Inside `get_reconstruct_data`, we can std::thread::spawn a thread
|
||||
/// and use it to block_on the download_remote_layer future.
|
||||
/// That is obviously inefficient as it creates one thread per download.
|
||||
/// - Convert everything to async. The problem here is that the sync
|
||||
/// functions are used by many other sync functions. So, the scope
|
||||
/// creep of such a conversion is tremendous.
|
||||
/// - Compromise between the two: implement async functions for each sync
|
||||
/// function. Switch over the hot code paths (GetPage()) to use the
|
||||
/// async path, so that the hot path doesn't spawn threads. Other code
|
||||
/// paths would remain sync initially, and get converted to async over time.
|
||||
///
|
||||
pub async fn with_ondemand_download<F, T>(mut f: F) -> Result<T, anyhow::Error>
|
||||
where
|
||||
F: Send + FnMut() -> PageReconstructResult<T>,
|
||||
T: Send,
|
||||
{
|
||||
loop {
|
||||
let closure_result = f();
|
||||
match closure_result {
|
||||
PageReconstructResult::NeedsDownload(weak_timeline, weak_remote_layer) => {
|
||||
// if the timeline is gone, it has likely been deleted / tenant detached
|
||||
let tl = weak_timeline.upgrade().context("timeline is gone")?;
|
||||
// if the remote layer got removed, retry the function, it might succeed now
|
||||
let remote_layer = match weak_remote_layer.upgrade() {
|
||||
None => {
|
||||
info!("remote layer is gone, retrying closure");
|
||||
continue;
|
||||
}
|
||||
Some(l) => l,
|
||||
};
|
||||
// Does retries internally
|
||||
tl.download_remote_layer(remote_layer).await?;
|
||||
// Download successful, retry the closure
|
||||
continue;
|
||||
}
|
||||
PageReconstructResult::Success(closure_value) => return Ok(closure_value),
|
||||
PageReconstructResult::Error(e) => {
|
||||
return Err(anyhow::Error::new(e).context("Failed to reconstruct the page"))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
type TraversalPathItem = (
|
||||
ValueReconstructResult,
|
||||
Lsn,
|
||||
Box<dyn FnOnce() -> TraversalId>,
|
||||
Box<dyn Send + FnOnce() -> TraversalId>,
|
||||
);
|
||||
|
||||
/// Helper function for get_reconstruct_data() to add the path of layers traversed
|
||||
/// to an error, as anyhow context information.
|
||||
fn layer_traversal_error(msg: String, path: Vec<TraversalPathItem>) -> PageReconstructResult<()> {
|
||||
fn layer_traversal_error(msg: String, path: Vec<TraversalPathItem>) -> PageReconstructError {
|
||||
// We want the original 'msg' to be the outermost context. The outermost context
|
||||
// is the most high-level information, which also gets propagated to the client.
|
||||
let mut msg_iter = path
|
||||
@@ -3434,7 +3275,7 @@ fn layer_traversal_error(msg: String, path: Vec<TraversalPathItem>) -> PageRecon
|
||||
|
||||
// Append all subsequent traversals, and the error message 'msg', as contexts.
|
||||
let msg = msg_iter.fold(err, |err, msg| err.context(msg));
|
||||
PageReconstructResult::from(msg)
|
||||
PageReconstructError::from(msg)
|
||||
}
|
||||
|
||||
/// Various functions to mutate the timeline.
|
||||
|
||||
@@ -30,8 +30,8 @@ use bytes::{Buf, Bytes, BytesMut};
|
||||
use tracing::*;
|
||||
|
||||
use crate::pgdatadir_mapping::*;
|
||||
use crate::tenant::PageReconstructError;
|
||||
use crate::tenant::Timeline;
|
||||
use crate::tenant::{with_ondemand_download, PageReconstructError};
|
||||
use crate::walrecord::*;
|
||||
use crate::ZERO_PAGE;
|
||||
use pageserver_api::reltag::{RelTag, SlruKind};
|
||||
@@ -55,8 +55,7 @@ impl<'a> WalIngest<'a> {
|
||||
pub async fn new(timeline: &Timeline, startpoint: Lsn) -> anyhow::Result<WalIngest> {
|
||||
// Fetch the latest checkpoint into memory, so that we can compare with it
|
||||
// quickly in `ingest_record` and update it when it changes.
|
||||
let checkpoint_bytes =
|
||||
with_ondemand_download(|| timeline.get_checkpoint(startpoint)).await?;
|
||||
let checkpoint_bytes = timeline.get_checkpoint(startpoint).await?;
|
||||
let checkpoint = CheckPoint::decode(&checkpoint_bytes)?;
|
||||
trace!("CheckPoint.nextXid = {}", checkpoint.nextXid.value);
|
||||
|
||||
@@ -107,7 +106,7 @@ impl<'a> WalIngest<'a> {
|
||||
== pg_constants::XLOG_SMGR_CREATE
|
||||
{
|
||||
let create = XlSmgrCreate::decode(&mut buf);
|
||||
self.ingest_xlog_smgr_create(modification, &create)?;
|
||||
self.ingest_xlog_smgr_create(modification, &create).await?;
|
||||
} else if decoded.xl_rmid == pg_constants::RM_SMGR_ID
|
||||
&& (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK)
|
||||
== pg_constants::XLOG_SMGR_TRUNCATE
|
||||
@@ -135,7 +134,7 @@ impl<'a> WalIngest<'a> {
|
||||
let dropdb = XlDropDatabase::decode(&mut buf);
|
||||
for tablespace_id in dropdb.tablespace_ids {
|
||||
trace!("Drop db {}, {}", tablespace_id, dropdb.db_id);
|
||||
modification.drop_dbdir(tablespace_id, dropdb.db_id)?;
|
||||
modification.drop_dbdir(tablespace_id, dropdb.db_id).await?;
|
||||
}
|
||||
}
|
||||
} else if self.timeline.pg_version == 15 {
|
||||
@@ -159,7 +158,7 @@ impl<'a> WalIngest<'a> {
|
||||
let dropdb = XlDropDatabase::decode(&mut buf);
|
||||
for tablespace_id in dropdb.tablespace_ids {
|
||||
trace!("Drop db {}, {}", tablespace_id, dropdb.db_id);
|
||||
modification.drop_dbdir(tablespace_id, dropdb.db_id)?;
|
||||
modification.drop_dbdir(tablespace_id, dropdb.db_id).await?;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -214,9 +213,11 @@ impl<'a> WalIngest<'a> {
|
||||
parsed_xact.xid,
|
||||
lsn,
|
||||
);
|
||||
modification.drop_twophase_file(parsed_xact.xid)?;
|
||||
modification.drop_twophase_file(parsed_xact.xid).await?;
|
||||
} else if info == pg_constants::XLOG_XACT_PREPARE {
|
||||
modification.put_twophase_file(decoded.xl_xid, Bytes::copy_from_slice(&buf[..]))?;
|
||||
modification
|
||||
.put_twophase_file(decoded.xl_xid, Bytes::copy_from_slice(&buf[..]))
|
||||
.await?;
|
||||
}
|
||||
} else if decoded.xl_rmid == pg_constants::RM_MULTIXACT_ID {
|
||||
let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
|
||||
@@ -250,11 +251,13 @@ impl<'a> WalIngest<'a> {
|
||||
self.ingest_multixact_create_record(modification, &xlrec)?;
|
||||
} else if info == pg_constants::XLOG_MULTIXACT_TRUNCATE_ID {
|
||||
let xlrec = XlMultiXactTruncate::decode(&mut buf);
|
||||
self.ingest_multixact_truncate_record(modification, &xlrec)?;
|
||||
self.ingest_multixact_truncate_record(modification, &xlrec)
|
||||
.await?;
|
||||
}
|
||||
} else if decoded.xl_rmid == pg_constants::RM_RELMAP_ID {
|
||||
let xlrec = XlRelmapUpdate::decode(&mut buf);
|
||||
self.ingest_relmap_page(modification, &xlrec, decoded)?;
|
||||
self.ingest_relmap_page(modification, &xlrec, decoded)
|
||||
.await?;
|
||||
} else if decoded.xl_rmid == pg_constants::RM_XLOG_ID {
|
||||
let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
|
||||
if info == pg_constants::XLOG_NEXTOID {
|
||||
@@ -534,23 +537,21 @@ impl<'a> WalIngest<'a> {
|
||||
// get calls instead.
|
||||
let req_lsn = modification.tline.get_last_record_lsn();
|
||||
|
||||
let rels = with_ondemand_download(|| {
|
||||
modification
|
||||
.tline
|
||||
.list_rels(src_tablespace_id, src_db_id, req_lsn)
|
||||
})
|
||||
.await?;
|
||||
let rels = modification
|
||||
.tline
|
||||
.list_rels(src_tablespace_id, src_db_id, req_lsn)
|
||||
.await?;
|
||||
|
||||
debug!("ingest_xlog_dbase_create: {} rels", rels.len());
|
||||
|
||||
// Copy relfilemap
|
||||
let filemap = with_ondemand_download(|| {
|
||||
modification
|
||||
.tline
|
||||
.get_relmap_file(src_tablespace_id, src_db_id, req_lsn)
|
||||
})
|
||||
.await?;
|
||||
modification.put_relmap_file(tablespace_id, db_id, filemap)?;
|
||||
let filemap = modification
|
||||
.tline
|
||||
.get_relmap_file(src_tablespace_id, src_db_id, req_lsn)
|
||||
.await?;
|
||||
modification
|
||||
.put_relmap_file(tablespace_id, db_id, filemap)
|
||||
.await?;
|
||||
|
||||
let mut num_rels_copied = 0;
|
||||
let mut num_blocks_copied = 0;
|
||||
@@ -558,9 +559,10 @@ impl<'a> WalIngest<'a> {
|
||||
assert_eq!(src_rel.spcnode, src_tablespace_id);
|
||||
assert_eq!(src_rel.dbnode, src_db_id);
|
||||
|
||||
let nblocks =
|
||||
with_ondemand_download(|| modification.tline.get_rel_size(src_rel, req_lsn, true))
|
||||
.await?;
|
||||
let nblocks = modification
|
||||
.tline
|
||||
.get_rel_size(src_rel, req_lsn, true)
|
||||
.await?;
|
||||
let dst_rel = RelTag {
|
||||
spcnode: tablespace_id,
|
||||
dbnode: db_id,
|
||||
@@ -568,19 +570,17 @@ impl<'a> WalIngest<'a> {
|
||||
forknum: src_rel.forknum,
|
||||
};
|
||||
|
||||
modification.put_rel_creation(dst_rel, nblocks)?;
|
||||
modification.put_rel_creation(dst_rel, nblocks).await?;
|
||||
|
||||
// Copy content
|
||||
debug!("copying rel {} to {}, {} blocks", src_rel, dst_rel, nblocks);
|
||||
for blknum in 0..nblocks {
|
||||
debug!("copying block {} from {} to {}", blknum, src_rel, dst_rel);
|
||||
|
||||
let content = with_ondemand_download(|| {
|
||||
modification
|
||||
.tline
|
||||
.get_rel_page_at_lsn(src_rel, blknum, req_lsn, true)
|
||||
})
|
||||
.await?;
|
||||
let content = modification
|
||||
.tline
|
||||
.get_rel_page_at_lsn(src_rel, blknum, req_lsn, true)
|
||||
.await?;
|
||||
modification.put_rel_page_image(dst_rel, blknum, content)?;
|
||||
num_blocks_copied += 1;
|
||||
}
|
||||
@@ -595,9 +595,9 @@ impl<'a> WalIngest<'a> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn ingest_xlog_smgr_create(
|
||||
async fn ingest_xlog_smgr_create(
|
||||
&mut self,
|
||||
modification: &mut DatadirModification,
|
||||
modification: &mut DatadirModification<'_>,
|
||||
rec: &XlSmgrCreate,
|
||||
) -> anyhow::Result<()> {
|
||||
let rel = RelTag {
|
||||
@@ -606,7 +606,7 @@ impl<'a> WalIngest<'a> {
|
||||
relnode: rec.rnode.relnode,
|
||||
forknum: rec.forknum,
|
||||
};
|
||||
self.put_rel_creation(modification, rel)?;
|
||||
self.put_rel_creation(modification, rel).await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -629,7 +629,8 @@ impl<'a> WalIngest<'a> {
|
||||
relnode,
|
||||
forknum: MAIN_FORKNUM,
|
||||
};
|
||||
self.put_rel_truncation(modification, rel, rec.blkno)?;
|
||||
self.put_rel_truncation(modification, rel, rec.blkno)
|
||||
.await?;
|
||||
}
|
||||
if (rec.flags & pg_constants::SMGR_TRUNCATE_FSM) != 0 {
|
||||
let rel = RelTag {
|
||||
@@ -650,7 +651,8 @@ impl<'a> WalIngest<'a> {
|
||||
let nblocks = self.get_relsize(rel, modification.lsn).await?;
|
||||
if nblocks > fsm_physical_page_no {
|
||||
// check if something to do: FSM is larger than truncate position
|
||||
self.put_rel_truncation(modification, rel, fsm_physical_page_no)?;
|
||||
self.put_rel_truncation(modification, rel, fsm_physical_page_no)
|
||||
.await?;
|
||||
}
|
||||
}
|
||||
if (rec.flags & pg_constants::SMGR_TRUNCATE_VM) != 0 {
|
||||
@@ -671,7 +673,8 @@ impl<'a> WalIngest<'a> {
|
||||
let nblocks = self.get_relsize(rel, modification.lsn).await?;
|
||||
if nblocks > vm_page_no {
|
||||
// check if something to do: VM is larger than truncate position
|
||||
self.put_rel_truncation(modification, rel, vm_page_no)?;
|
||||
self.put_rel_truncation(modification, rel, vm_page_no)
|
||||
.await?;
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
@@ -740,10 +743,12 @@ impl<'a> WalIngest<'a> {
|
||||
relnode: xnode.relnode,
|
||||
};
|
||||
let last_lsn = self.timeline.get_last_record_lsn();
|
||||
if with_ondemand_download(|| modification.tline.get_rel_exists(rel, last_lsn, true))
|
||||
if modification
|
||||
.tline
|
||||
.get_rel_exists(rel, last_lsn, true)
|
||||
.await?
|
||||
{
|
||||
self.put_rel_drop(modification, rel)?;
|
||||
self.put_rel_drop(modification, rel).await?;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -795,16 +800,16 @@ impl<'a> WalIngest<'a> {
|
||||
// instead.
|
||||
let req_lsn = modification.tline.get_last_record_lsn();
|
||||
|
||||
let slru_segments = with_ondemand_download(|| {
|
||||
modification
|
||||
.tline
|
||||
.list_slru_segments(SlruKind::Clog, req_lsn)
|
||||
})
|
||||
.await?;
|
||||
let slru_segments = modification
|
||||
.tline
|
||||
.list_slru_segments(SlruKind::Clog, req_lsn)
|
||||
.await?;
|
||||
for segno in slru_segments {
|
||||
let segpage = segno * pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
if slru_may_delete_clogsegment(segpage, xlrec.pageno) {
|
||||
modification.drop_slru_segment(SlruKind::Clog, segno)?;
|
||||
modification
|
||||
.drop_slru_segment(SlruKind::Clog, segno)
|
||||
.await?;
|
||||
trace!("Drop CLOG segment {:>04X}", segno);
|
||||
}
|
||||
}
|
||||
@@ -891,9 +896,9 @@ impl<'a> WalIngest<'a> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn ingest_multixact_truncate_record(
|
||||
async fn ingest_multixact_truncate_record(
|
||||
&mut self,
|
||||
modification: &mut DatadirModification,
|
||||
modification: &mut DatadirModification<'_>,
|
||||
xlrec: &XlMultiXactTruncate,
|
||||
) -> Result<()> {
|
||||
self.checkpoint.oldestMulti = xlrec.end_trunc_off;
|
||||
@@ -909,7 +914,9 @@ impl<'a> WalIngest<'a> {
|
||||
// Delete all the segments except the last one. The last segment can still
|
||||
// contain, possibly partially, valid data.
|
||||
while segment != endsegment {
|
||||
modification.drop_slru_segment(SlruKind::MultiXactMembers, segment as u32)?;
|
||||
modification
|
||||
.drop_slru_segment(SlruKind::MultiXactMembers, segment as u32)
|
||||
.await?;
|
||||
|
||||
/* move to next segment, handling wraparound correctly */
|
||||
if segment == maxsegment {
|
||||
@@ -925,9 +932,9 @@ impl<'a> WalIngest<'a> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn ingest_relmap_page(
|
||||
async fn ingest_relmap_page(
|
||||
&mut self,
|
||||
modification: &mut DatadirModification,
|
||||
modification: &mut DatadirModification<'_>,
|
||||
xlrec: &XlRelmapUpdate,
|
||||
decoded: &DecodedWALRecord,
|
||||
) -> Result<()> {
|
||||
@@ -936,17 +943,19 @@ impl<'a> WalIngest<'a> {
|
||||
// skip xl_relmap_update
|
||||
buf.advance(12);
|
||||
|
||||
modification.put_relmap_file(xlrec.tsid, xlrec.dbid, Bytes::copy_from_slice(&buf[..]))?;
|
||||
modification
|
||||
.put_relmap_file(xlrec.tsid, xlrec.dbid, Bytes::copy_from_slice(&buf[..]))
|
||||
.await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn put_rel_creation(
|
||||
async fn put_rel_creation(
|
||||
&mut self,
|
||||
modification: &mut DatadirModification,
|
||||
modification: &mut DatadirModification<'_>,
|
||||
rel: RelTag,
|
||||
) -> Result<()> {
|
||||
modification.put_rel_creation(rel, 0)?;
|
||||
modification.put_rel_creation(rel, 0).await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -974,28 +983,31 @@ impl<'a> WalIngest<'a> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn put_rel_truncation(
|
||||
async fn put_rel_truncation(
|
||||
&mut self,
|
||||
modification: &mut DatadirModification,
|
||||
modification: &mut DatadirModification<'_>,
|
||||
rel: RelTag,
|
||||
nblocks: BlockNumber,
|
||||
) -> anyhow::Result<()> {
|
||||
modification.put_rel_truncation(rel, nblocks)?;
|
||||
modification.put_rel_truncation(rel, nblocks).await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn put_rel_drop(&mut self, modification: &mut DatadirModification, rel: RelTag) -> Result<()> {
|
||||
modification.put_rel_drop(rel)?;
|
||||
async fn put_rel_drop(
|
||||
&mut self,
|
||||
modification: &mut DatadirModification<'_>,
|
||||
rel: RelTag,
|
||||
) -> Result<()> {
|
||||
modification.put_rel_drop(rel).await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn get_relsize(&mut self, rel: RelTag, lsn: Lsn) -> anyhow::Result<BlockNumber> {
|
||||
let exists =
|
||||
with_ondemand_download(|| self.timeline.get_rel_exists(rel, lsn, true)).await?;
|
||||
let exists = self.timeline.get_rel_exists(rel, lsn, true).await?;
|
||||
let nblocks = if !exists {
|
||||
0
|
||||
} else {
|
||||
with_ondemand_download(|| self.timeline.get_rel_size(rel, lsn, true)).await?
|
||||
self.timeline.get_rel_size(rel, lsn, true).await?
|
||||
};
|
||||
Ok(nblocks)
|
||||
}
|
||||
@@ -1011,19 +1023,17 @@ impl<'a> WalIngest<'a> {
|
||||
// record.
|
||||
// TODO: it would be nice to be more explicit about it
|
||||
let last_lsn = modification.lsn;
|
||||
let old_nblocks =
|
||||
if !with_ondemand_download(|| self.timeline.get_rel_exists(rel, last_lsn, true)).await?
|
||||
{
|
||||
// create it with 0 size initially, the logic below will extend it
|
||||
modification.put_rel_creation(rel, 0)?;
|
||||
0
|
||||
} else {
|
||||
with_ondemand_download(|| self.timeline.get_rel_size(rel, last_lsn, true)).await?
|
||||
};
|
||||
let old_nblocks = if !self.timeline.get_rel_exists(rel, last_lsn, true).await? {
|
||||
// create it with 0 size initially, the logic below will extend it
|
||||
modification.put_rel_creation(rel, 0).await?;
|
||||
0
|
||||
} else {
|
||||
self.timeline.get_rel_size(rel, last_lsn, true).await?
|
||||
};
|
||||
|
||||
if new_nblocks > old_nblocks {
|
||||
//info!("extending {} {} to {}", rel, old_nblocks, new_nblocks);
|
||||
modification.put_rel_extend(rel, new_nblocks)?;
|
||||
modification.put_rel_extend(rel, new_nblocks).await?;
|
||||
|
||||
// fill the gap with zeros
|
||||
for gap_blknum in old_nblocks..blknum {
|
||||
@@ -1063,16 +1073,19 @@ impl<'a> WalIngest<'a> {
|
||||
// record.
|
||||
// TODO: it would be nice to be more explicit about it
|
||||
let last_lsn = self.timeline.get_last_record_lsn();
|
||||
let old_nblocks = if !with_ondemand_download(|| {
|
||||
self.timeline.get_slru_segment_exists(kind, segno, last_lsn)
|
||||
})
|
||||
.await?
|
||||
let old_nblocks = if !self
|
||||
.timeline
|
||||
.get_slru_segment_exists(kind, segno, last_lsn)
|
||||
.await?
|
||||
{
|
||||
// create it with 0 size initially, the logic below will extend it
|
||||
modification.put_slru_segment_creation(kind, segno, 0)?;
|
||||
modification
|
||||
.put_slru_segment_creation(kind, segno, 0)
|
||||
.await?;
|
||||
0
|
||||
} else {
|
||||
with_ondemand_download(|| self.timeline.get_slru_segment_size(kind, segno, last_lsn))
|
||||
self.timeline
|
||||
.get_slru_segment_size(kind, segno, last_lsn)
|
||||
.await?
|
||||
};
|
||||
|
||||
@@ -1124,7 +1137,7 @@ mod tests {
|
||||
async fn init_walingest_test(tline: &Timeline) -> Result<WalIngest> {
|
||||
let mut m = tline.begin_modification(Lsn(0x10));
|
||||
m.put_checkpoint(ZERO_CHECKPOINT.clone())?;
|
||||
m.put_relmap_file(0, 111, Bytes::from(""))?; // dummy relmapper file
|
||||
m.put_relmap_file(0, 111, Bytes::from("")).await?; // dummy relmapper file
|
||||
m.commit()?;
|
||||
let walingest = WalIngest::new(tline, Lsn(0x10)).await?;
|
||||
|
||||
@@ -1138,7 +1151,7 @@ mod tests {
|
||||
let mut walingest = init_walingest_test(&tline).await?;
|
||||
|
||||
let mut m = tline.begin_modification(Lsn(0x20));
|
||||
walingest.put_rel_creation(&mut m, TESTREL_A)?;
|
||||
walingest.put_rel_creation(&mut m, TESTREL_A).await?;
|
||||
walingest
|
||||
.put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"))
|
||||
.await?;
|
||||
@@ -1163,132 +1176,103 @@ mod tests {
|
||||
|
||||
// The relation was created at LSN 2, not visible at LSN 1 yet.
|
||||
assert_eq!(
|
||||
tline
|
||||
.get_rel_exists(TESTREL_A, Lsn(0x10), false)
|
||||
.no_ondemand_download()?,
|
||||
tline.get_rel_exists(TESTREL_A, Lsn(0x10), false).await?,
|
||||
false
|
||||
);
|
||||
assert!(tline
|
||||
.get_rel_size(TESTREL_A, Lsn(0x10), false)
|
||||
.no_ondemand_download()
|
||||
.await
|
||||
.is_err());
|
||||
|
||||
assert_eq!(
|
||||
tline
|
||||
.get_rel_exists(TESTREL_A, Lsn(0x20), false)
|
||||
.no_ondemand_download()?,
|
||||
tline.get_rel_exists(TESTREL_A, Lsn(0x20), false).await?,
|
||||
true
|
||||
);
|
||||
assert_eq!(
|
||||
tline
|
||||
.get_rel_size(TESTREL_A, Lsn(0x20), false)
|
||||
.no_ondemand_download()?,
|
||||
1
|
||||
);
|
||||
assert_eq!(
|
||||
tline
|
||||
.get_rel_size(TESTREL_A, Lsn(0x50), false)
|
||||
.no_ondemand_download()?,
|
||||
3
|
||||
);
|
||||
assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x20), false).await?, 1);
|
||||
assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x50), false).await?, 3);
|
||||
|
||||
// Check page contents at each LSN
|
||||
assert_eq!(
|
||||
tline
|
||||
.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x20), false)
|
||||
.no_ondemand_download()?,
|
||||
.await?,
|
||||
TEST_IMG("foo blk 0 at 2")
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
tline
|
||||
.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x30), false)
|
||||
.no_ondemand_download()?,
|
||||
.await?,
|
||||
TEST_IMG("foo blk 0 at 3")
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
tline
|
||||
.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x40), false)
|
||||
.no_ondemand_download()?,
|
||||
.await?,
|
||||
TEST_IMG("foo blk 0 at 3")
|
||||
);
|
||||
assert_eq!(
|
||||
tline
|
||||
.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x40), false)
|
||||
.no_ondemand_download()?,
|
||||
.await?,
|
||||
TEST_IMG("foo blk 1 at 4")
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
tline
|
||||
.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x50), false)
|
||||
.no_ondemand_download()?,
|
||||
.await?,
|
||||
TEST_IMG("foo blk 0 at 3")
|
||||
);
|
||||
assert_eq!(
|
||||
tline
|
||||
.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x50), false)
|
||||
.no_ondemand_download()?,
|
||||
.await?,
|
||||
TEST_IMG("foo blk 1 at 4")
|
||||
);
|
||||
assert_eq!(
|
||||
tline
|
||||
.get_rel_page_at_lsn(TESTREL_A, 2, Lsn(0x50), false)
|
||||
.no_ondemand_download()?,
|
||||
.await?,
|
||||
TEST_IMG("foo blk 2 at 5")
|
||||
);
|
||||
|
||||
// Truncate last block
|
||||
let mut m = tline.begin_modification(Lsn(0x60));
|
||||
walingest.put_rel_truncation(&mut m, TESTREL_A, 2)?;
|
||||
walingest.put_rel_truncation(&mut m, TESTREL_A, 2).await?;
|
||||
m.commit()?;
|
||||
assert_current_logical_size(&tline, Lsn(0x60));
|
||||
|
||||
// Check reported size and contents after truncation
|
||||
assert_eq!(
|
||||
tline
|
||||
.get_rel_size(TESTREL_A, Lsn(0x60), false)
|
||||
.no_ondemand_download()?,
|
||||
2
|
||||
);
|
||||
assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x60), false).await?, 2);
|
||||
assert_eq!(
|
||||
tline
|
||||
.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x60), false)
|
||||
.no_ondemand_download()?,
|
||||
.await?,
|
||||
TEST_IMG("foo blk 0 at 3")
|
||||
);
|
||||
assert_eq!(
|
||||
tline
|
||||
.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x60), false)
|
||||
.no_ondemand_download()?,
|
||||
.await?,
|
||||
TEST_IMG("foo blk 1 at 4")
|
||||
);
|
||||
|
||||
// should still see the truncated block with older LSN
|
||||
assert_eq!(
|
||||
tline
|
||||
.get_rel_size(TESTREL_A, Lsn(0x50), false)
|
||||
.no_ondemand_download()?,
|
||||
3
|
||||
);
|
||||
assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x50), false).await?, 3);
|
||||
assert_eq!(
|
||||
tline
|
||||
.get_rel_page_at_lsn(TESTREL_A, 2, Lsn(0x50), false)
|
||||
.no_ondemand_download()?,
|
||||
.await?,
|
||||
TEST_IMG("foo blk 2 at 5")
|
||||
);
|
||||
|
||||
// Truncate to zero length
|
||||
let mut m = tline.begin_modification(Lsn(0x68));
|
||||
walingest.put_rel_truncation(&mut m, TESTREL_A, 0)?;
|
||||
walingest.put_rel_truncation(&mut m, TESTREL_A, 0).await?;
|
||||
m.commit()?;
|
||||
assert_eq!(
|
||||
tline
|
||||
.get_rel_size(TESTREL_A, Lsn(0x68), false)
|
||||
.no_ondemand_download()?,
|
||||
0
|
||||
);
|
||||
assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x68), false).await?, 0);
|
||||
|
||||
// Extend from 0 to 2 blocks, leaving a gap
|
||||
let mut m = tline.begin_modification(Lsn(0x70));
|
||||
@@ -1296,22 +1280,17 @@ mod tests {
|
||||
.put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1"))
|
||||
.await?;
|
||||
m.commit()?;
|
||||
assert_eq!(
|
||||
tline
|
||||
.get_rel_size(TESTREL_A, Lsn(0x70), false)
|
||||
.no_ondemand_download()?,
|
||||
2
|
||||
);
|
||||
assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x70), false).await?, 2);
|
||||
assert_eq!(
|
||||
tline
|
||||
.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x70), false)
|
||||
.no_ondemand_download()?,
|
||||
.await?,
|
||||
ZERO_PAGE
|
||||
);
|
||||
assert_eq!(
|
||||
tline
|
||||
.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x70), false)
|
||||
.no_ondemand_download()?,
|
||||
.await?,
|
||||
TEST_IMG("foo blk 1")
|
||||
);
|
||||
|
||||
@@ -1321,24 +1300,19 @@ mod tests {
|
||||
.put_rel_page_image(&mut m, TESTREL_A, 1500, TEST_IMG("foo blk 1500"))
|
||||
.await?;
|
||||
m.commit()?;
|
||||
assert_eq!(
|
||||
tline
|
||||
.get_rel_size(TESTREL_A, Lsn(0x80), false)
|
||||
.no_ondemand_download()?,
|
||||
1501
|
||||
);
|
||||
assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x80), false).await?, 1501);
|
||||
for blk in 2..1500 {
|
||||
assert_eq!(
|
||||
tline
|
||||
.get_rel_page_at_lsn(TESTREL_A, blk, Lsn(0x80), false)
|
||||
.no_ondemand_download()?,
|
||||
.await?,
|
||||
ZERO_PAGE
|
||||
);
|
||||
}
|
||||
assert_eq!(
|
||||
tline
|
||||
.get_rel_page_at_lsn(TESTREL_A, 1500, Lsn(0x80), false)
|
||||
.no_ondemand_download()?,
|
||||
.await?,
|
||||
TEST_IMG("foo blk 1500")
|
||||
);
|
||||
|
||||
@@ -1361,28 +1335,19 @@ mod tests {
|
||||
|
||||
// Check that rel exists and size is correct
|
||||
assert_eq!(
|
||||
tline
|
||||
.get_rel_exists(TESTREL_A, Lsn(0x20), false)
|
||||
.no_ondemand_download()?,
|
||||
tline.get_rel_exists(TESTREL_A, Lsn(0x20), false).await?,
|
||||
true
|
||||
);
|
||||
assert_eq!(
|
||||
tline
|
||||
.get_rel_size(TESTREL_A, Lsn(0x20), false)
|
||||
.no_ondemand_download()?,
|
||||
1
|
||||
);
|
||||
assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x20), false).await?, 1);
|
||||
|
||||
// Drop rel
|
||||
let mut m = tline.begin_modification(Lsn(0x30));
|
||||
walingest.put_rel_drop(&mut m, TESTREL_A)?;
|
||||
walingest.put_rel_drop(&mut m, TESTREL_A).await?;
|
||||
m.commit()?;
|
||||
|
||||
// Check that rel is not visible anymore
|
||||
assert_eq!(
|
||||
tline
|
||||
.get_rel_exists(TESTREL_A, Lsn(0x30), false)
|
||||
.no_ondemand_download()?,
|
||||
tline.get_rel_exists(TESTREL_A, Lsn(0x30), false).await?,
|
||||
false
|
||||
);
|
||||
|
||||
@@ -1398,17 +1363,10 @@ mod tests {
|
||||
|
||||
// Check that rel exists and size is correct
|
||||
assert_eq!(
|
||||
tline
|
||||
.get_rel_exists(TESTREL_A, Lsn(0x40), false)
|
||||
.no_ondemand_download()?,
|
||||
tline.get_rel_exists(TESTREL_A, Lsn(0x40), false).await?,
|
||||
true
|
||||
);
|
||||
assert_eq!(
|
||||
tline
|
||||
.get_rel_size(TESTREL_A, Lsn(0x40), false)
|
||||
.no_ondemand_download()?,
|
||||
1
|
||||
);
|
||||
assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x40), false).await?, 1);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -1435,26 +1393,20 @@ mod tests {
|
||||
|
||||
// The relation was created at LSN 20, not visible at LSN 1 yet.
|
||||
assert_eq!(
|
||||
tline
|
||||
.get_rel_exists(TESTREL_A, Lsn(0x10), false)
|
||||
.no_ondemand_download()?,
|
||||
tline.get_rel_exists(TESTREL_A, Lsn(0x10), false).await?,
|
||||
false
|
||||
);
|
||||
assert!(tline
|
||||
.get_rel_size(TESTREL_A, Lsn(0x10), false)
|
||||
.no_ondemand_download()
|
||||
.await
|
||||
.is_err());
|
||||
|
||||
assert_eq!(
|
||||
tline
|
||||
.get_rel_exists(TESTREL_A, Lsn(0x20), false)
|
||||
.no_ondemand_download()?,
|
||||
tline.get_rel_exists(TESTREL_A, Lsn(0x20), false).await?,
|
||||
true
|
||||
);
|
||||
assert_eq!(
|
||||
tline
|
||||
.get_rel_size(TESTREL_A, Lsn(0x20), false)
|
||||
.no_ondemand_download()?,
|
||||
tline.get_rel_size(TESTREL_A, Lsn(0x20), false).await?,
|
||||
relsize
|
||||
);
|
||||
|
||||
@@ -1465,7 +1417,7 @@ mod tests {
|
||||
assert_eq!(
|
||||
tline
|
||||
.get_rel_page_at_lsn(TESTREL_A, blkno, lsn, false)
|
||||
.no_ondemand_download()?,
|
||||
.await?,
|
||||
TEST_IMG(&data)
|
||||
);
|
||||
}
|
||||
@@ -1473,16 +1425,11 @@ mod tests {
|
||||
// Truncate relation so that second segment was dropped
|
||||
// - only leave one page
|
||||
let mut m = tline.begin_modification(Lsn(0x60));
|
||||
walingest.put_rel_truncation(&mut m, TESTREL_A, 1)?;
|
||||
walingest.put_rel_truncation(&mut m, TESTREL_A, 1).await?;
|
||||
m.commit()?;
|
||||
|
||||
// Check reported size and contents after truncation
|
||||
assert_eq!(
|
||||
tline
|
||||
.get_rel_size(TESTREL_A, Lsn(0x60), false)
|
||||
.no_ondemand_download()?,
|
||||
1
|
||||
);
|
||||
assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x60), false).await?, 1);
|
||||
|
||||
for blkno in 0..1 {
|
||||
let lsn = Lsn(0x20);
|
||||
@@ -1490,16 +1437,14 @@ mod tests {
|
||||
assert_eq!(
|
||||
tline
|
||||
.get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x60), false)
|
||||
.no_ondemand_download()?,
|
||||
.await?,
|
||||
TEST_IMG(&data)
|
||||
);
|
||||
}
|
||||
|
||||
// should still see all blocks with older LSN
|
||||
assert_eq!(
|
||||
tline
|
||||
.get_rel_size(TESTREL_A, Lsn(0x50), false)
|
||||
.no_ondemand_download()?,
|
||||
tline.get_rel_size(TESTREL_A, Lsn(0x50), false).await?,
|
||||
relsize
|
||||
);
|
||||
for blkno in 0..relsize {
|
||||
@@ -1508,7 +1453,7 @@ mod tests {
|
||||
assert_eq!(
|
||||
tline
|
||||
.get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x50), false)
|
||||
.no_ondemand_download()?,
|
||||
.await?,
|
||||
TEST_IMG(&data)
|
||||
);
|
||||
}
|
||||
@@ -1526,15 +1471,11 @@ mod tests {
|
||||
m.commit()?;
|
||||
|
||||
assert_eq!(
|
||||
tline
|
||||
.get_rel_exists(TESTREL_A, Lsn(0x80), false)
|
||||
.no_ondemand_download()?,
|
||||
tline.get_rel_exists(TESTREL_A, Lsn(0x80), false).await?,
|
||||
true
|
||||
);
|
||||
assert_eq!(
|
||||
tline
|
||||
.get_rel_size(TESTREL_A, Lsn(0x80), false)
|
||||
.no_ondemand_download()?,
|
||||
tline.get_rel_size(TESTREL_A, Lsn(0x80), false).await?,
|
||||
relsize
|
||||
);
|
||||
// Check relation content
|
||||
@@ -1544,7 +1485,7 @@ mod tests {
|
||||
assert_eq!(
|
||||
tline
|
||||
.get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x80), false)
|
||||
.no_ondemand_download()?,
|
||||
.await?,
|
||||
TEST_IMG(&data)
|
||||
);
|
||||
}
|
||||
@@ -1574,21 +1515,19 @@ mod tests {
|
||||
assert_current_logical_size(&tline, Lsn(lsn));
|
||||
|
||||
assert_eq!(
|
||||
tline
|
||||
.get_rel_size(TESTREL_A, Lsn(lsn), false)
|
||||
.no_ondemand_download()?,
|
||||
tline.get_rel_size(TESTREL_A, Lsn(lsn), false).await?,
|
||||
RELSEG_SIZE + 1
|
||||
);
|
||||
|
||||
// Truncate one block
|
||||
lsn += 0x10;
|
||||
let mut m = tline.begin_modification(Lsn(lsn));
|
||||
walingest.put_rel_truncation(&mut m, TESTREL_A, RELSEG_SIZE)?;
|
||||
walingest
|
||||
.put_rel_truncation(&mut m, TESTREL_A, RELSEG_SIZE)
|
||||
.await?;
|
||||
m.commit()?;
|
||||
assert_eq!(
|
||||
tline
|
||||
.get_rel_size(TESTREL_A, Lsn(lsn), false)
|
||||
.no_ondemand_download()?,
|
||||
tline.get_rel_size(TESTREL_A, Lsn(lsn), false).await?,
|
||||
RELSEG_SIZE
|
||||
);
|
||||
assert_current_logical_size(&tline, Lsn(lsn));
|
||||
@@ -1596,12 +1535,12 @@ mod tests {
|
||||
// Truncate another block
|
||||
lsn += 0x10;
|
||||
let mut m = tline.begin_modification(Lsn(lsn));
|
||||
walingest.put_rel_truncation(&mut m, TESTREL_A, RELSEG_SIZE - 1)?;
|
||||
walingest
|
||||
.put_rel_truncation(&mut m, TESTREL_A, RELSEG_SIZE - 1)
|
||||
.await?;
|
||||
m.commit()?;
|
||||
assert_eq!(
|
||||
tline
|
||||
.get_rel_size(TESTREL_A, Lsn(lsn), false)
|
||||
.no_ondemand_download()?,
|
||||
tline.get_rel_size(TESTREL_A, Lsn(lsn), false).await?,
|
||||
RELSEG_SIZE - 1
|
||||
);
|
||||
assert_current_logical_size(&tline, Lsn(lsn));
|
||||
@@ -1612,12 +1551,12 @@ mod tests {
|
||||
while size >= 0 {
|
||||
lsn += 0x10;
|
||||
let mut m = tline.begin_modification(Lsn(lsn));
|
||||
walingest.put_rel_truncation(&mut m, TESTREL_A, size as BlockNumber)?;
|
||||
walingest
|
||||
.put_rel_truncation(&mut m, TESTREL_A, size as BlockNumber)
|
||||
.await?;
|
||||
m.commit()?;
|
||||
assert_eq!(
|
||||
tline
|
||||
.get_rel_size(TESTREL_A, Lsn(lsn), false)
|
||||
.no_ondemand_download()?,
|
||||
tline.get_rel_size(TESTREL_A, Lsn(lsn), false).await?,
|
||||
size as BlockNumber
|
||||
);
|
||||
|
||||
|
||||
@@ -31,6 +31,7 @@ use once_cell::sync::OnceCell;
|
||||
use std::future::Future;
|
||||
use storage_broker::BrokerClientChannel;
|
||||
use tokio::sync::watch;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
|
||||
pub use connection_manager::spawn_connection_manager_task;
|
||||
@@ -76,7 +77,7 @@ pub fn is_broker_client_initialized() -> bool {
|
||||
|
||||
/// A handle of an asynchronous task.
|
||||
/// The task has a channel that it can use to communicate its lifecycle events in a certain form, see [`TaskEvent`]
|
||||
/// and a cancellation channel that it can listen to for earlier interrupts.
|
||||
/// and a cancellation token that it can listen to for earlier interrupts.
|
||||
///
|
||||
/// Note that the communication happens via the `watch` channel, that does not accumulate the events, replacing the old one with the newer one on submission.
|
||||
/// That may lead to certain events not being observed by the listener.
|
||||
@@ -84,7 +85,7 @@ pub fn is_broker_client_initialized() -> bool {
|
||||
pub struct TaskHandle<E> {
|
||||
join_handle: Option<tokio::task::JoinHandle<anyhow::Result<()>>>,
|
||||
events_receiver: watch::Receiver<TaskStateUpdate<E>>,
|
||||
cancellation: watch::Sender<()>,
|
||||
cancellation: CancellationToken,
|
||||
}
|
||||
|
||||
pub enum TaskEvent<E> {
|
||||
@@ -102,20 +103,23 @@ pub enum TaskStateUpdate<E> {
|
||||
impl<E: Clone> TaskHandle<E> {
|
||||
/// Initializes the task, starting it immediately after the creation.
|
||||
pub fn spawn<Fut>(
|
||||
task: impl FnOnce(watch::Sender<TaskStateUpdate<E>>, watch::Receiver<()>) -> Fut
|
||||
+ Send
|
||||
+ 'static,
|
||||
task: impl FnOnce(watch::Sender<TaskStateUpdate<E>>, CancellationToken) -> Fut + Send + 'static,
|
||||
) -> Self
|
||||
where
|
||||
Fut: Future<Output = anyhow::Result<()>> + Send,
|
||||
E: Send + Sync + 'static,
|
||||
{
|
||||
let (cancellation, cancellation_receiver) = watch::channel(());
|
||||
let cancellation = CancellationToken::new();
|
||||
let (events_sender, events_receiver) = watch::channel(TaskStateUpdate::Started);
|
||||
|
||||
let cancellation_clone = cancellation.clone();
|
||||
let join_handle = WALRECEIVER_RUNTIME.spawn(async move {
|
||||
events_sender.send(TaskStateUpdate::Started).ok();
|
||||
task(events_sender, cancellation_receiver).await
|
||||
task(events_sender, cancellation_clone).await
|
||||
// events_sender is dropped at some point during the .await above.
|
||||
// But the task is still running on WALRECEIVER_RUNTIME.
|
||||
// That is the window when `!jh.is_finished()`
|
||||
// is true inside `fn next_task_event()` below.
|
||||
});
|
||||
|
||||
TaskHandle {
|
||||
@@ -132,7 +136,23 @@ impl<E: Clone> TaskHandle<E> {
|
||||
TaskEvent::End(match self.join_handle.as_mut() {
|
||||
Some(jh) => {
|
||||
if !jh.is_finished() {
|
||||
warn!("sender is dropped while join handle is still alive");
|
||||
// Barring any implementation errors in this module, we can
|
||||
// only arrive here while the task that executes the future
|
||||
// passed to `Self::spawn()` is still executing. Cf. the comment
|
||||
// in Self::spawn().
|
||||
//
|
||||
// This was logging at warning level in earlier versions, presumably
|
||||
// to leave some breadcrumbs in case we had an implementation
|
||||
// error that would make us get stuck in `jh.await`.
|
||||
//
|
||||
// There hasn't been such a bug so far.
|
||||
// But in a busy system, e.g., during pageserver restart,
|
||||
// we arrive here often enough that the warning-level logs
|
||||
// became a distraction.
|
||||
// So, tone them down to info-level.
|
||||
//
|
||||
// XXX: rewrite this module to eliminate the race condition.
|
||||
info!("sender is dropped while join handle is still alive");
|
||||
}
|
||||
|
||||
let res = jh
|
||||
@@ -157,7 +177,7 @@ impl<E: Clone> TaskHandle<E> {
|
||||
/// Aborts current task, waiting for it to finish.
|
||||
pub async fn shutdown(self) {
|
||||
if let Some(jh) = self.join_handle {
|
||||
self.cancellation.send(()).ok();
|
||||
self.cancellation.cancel();
|
||||
match jh.await {
|
||||
Ok(Ok(())) => debug!("Shutdown success"),
|
||||
Ok(Err(e)) => error!("Shutdown task error: {e:?}"),
|
||||
|
||||
@@ -19,6 +19,7 @@ use postgres_protocol::message::backend::ReplicationMessage;
|
||||
use postgres_types::PgLsn;
|
||||
use tokio::{pin, select, sync::watch, time};
|
||||
use tokio_postgres::{replication::ReplicationStream, Client};
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{debug, error, info, trace, warn};
|
||||
|
||||
use crate::{metrics::LIVE_CONNECTIONS_COUNT, walreceiver::TaskStateUpdate};
|
||||
@@ -59,7 +60,7 @@ pub async fn handle_walreceiver_connection(
|
||||
timeline: Arc<Timeline>,
|
||||
wal_source_connconf: PgConnectionConfig,
|
||||
events_sender: watch::Sender<TaskStateUpdate<WalConnectionStatus>>,
|
||||
mut cancellation: watch::Receiver<()>,
|
||||
cancellation: CancellationToken,
|
||||
connect_timeout: Duration,
|
||||
) -> anyhow::Result<()> {
|
||||
// Connect to the database in replication mode.
|
||||
@@ -98,7 +99,7 @@ pub async fn handle_walreceiver_connection(
|
||||
|
||||
// The connection object performs the actual communication with the database,
|
||||
// so spawn it off to run on its own.
|
||||
let mut connection_cancellation = cancellation.clone();
|
||||
let connection_cancellation = cancellation.clone();
|
||||
task_mgr::spawn(
|
||||
WALRECEIVER_RUNTIME.handle(),
|
||||
TaskKind::WalReceiverConnection,
|
||||
@@ -117,7 +118,7 @@ pub async fn handle_walreceiver_connection(
|
||||
}
|
||||
},
|
||||
|
||||
_ = connection_cancellation.changed() => info!("Connection cancelled"),
|
||||
_ = connection_cancellation.cancelled() => info!("Connection cancelled"),
|
||||
}
|
||||
Ok(())
|
||||
},
|
||||
@@ -183,7 +184,7 @@ pub async fn handle_walreceiver_connection(
|
||||
|
||||
while let Some(replication_message) = {
|
||||
select! {
|
||||
_ = cancellation.changed() => {
|
||||
_ = cancellation.cancelled() => {
|
||||
info!("walreceiver interrupted");
|
||||
None
|
||||
}
|
||||
|
||||
@@ -1010,3 +1010,110 @@ fn build_get_page_msg(tag: BufferTag, buf: &mut Vec<u8>) {
|
||||
tag.ser_into(buf)
|
||||
.expect("serialize BufferTag should always succeed");
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::{PostgresRedoManager, WalRedoManager};
|
||||
use crate::repository::Key;
|
||||
use crate::{config::PageServerConf, walrecord::NeonWalRecord};
|
||||
use bytes::Bytes;
|
||||
use std::str::FromStr;
|
||||
use utils::{id::TenantId, lsn::Lsn};
|
||||
|
||||
/// Replays the records from `short_records()` through the redo manager and
/// checks that the resulting page matches the reference image stored in
/// `fixtures/short_v14_redo.page`.
#[test]
fn short_v14_redo() {
    let expected = std::fs::read("fixtures/short_v14_redo.page").unwrap();

    let harness = RedoHarness::new().unwrap();

    let key = Key {
        field1: 0,
        field2: 1663,
        field3: 13010,
        field4: 1259,
        field5: 0,
        field6: 0,
    };
    let lsn = Lsn::from_str("0/16E2408").unwrap();

    // The trailing 14 is presumably the Postgres major version (per the test name).
    let page = harness
        .manager
        .request_redo(key, lsn, None, short_records(), 14)
        .unwrap();

    assert_eq!(&expected, &*page);
}
|
||||
|
||||
/// Issues the same redo request as `short_v14_redo`, except that `field3` of
/// the key is 13130 rather than 13010, and expects the request to succeed
/// with a page equal to `crate::ZERO_PAGE`.
#[test]
fn short_v14_fails_for_wrong_key_but_returns_zero_page() {
    let harness = RedoHarness::new().unwrap();

    let wrong_key = Key {
        field1: 0,
        field2: 1663,
        // key should be 13010
        field3: 13130,
        field4: 1259,
        field5: 0,
        field6: 0,
    };
    let lsn = Lsn::from_str("0/16E2408").unwrap();

    let page = harness
        .manager
        .request_redo(wrong_key, lsn, None, short_records(), 14)
        .unwrap();

    // TODO: there will be some stderr printout, which is forwarded to tracing that could
    // perhaps be captured as long as it's in the same thread.
    assert_eq!(page, crate::ZERO_PAGE);
}
|
||||
|
||||
#[allow(clippy::octal_escapes)]
|
||||
fn short_records() -> Vec<(Lsn, NeonWalRecord)> {
|
||||
vec![
|
||||
(
|
||||
Lsn::from_str("0/16A9388").unwrap(),
|
||||
NeonWalRecord::Postgres {
|
||||
will_init: true,
|
||||
rec: Bytes::from_static(b"j\x03\0\0\0\x04\0\0\xe8\x7fj\x01\0\0\0\0\0\n\0\0\xd0\x16\x13Y\0\x10\0\04\x03\xd4\0\x05\x7f\x06\0\0\xd22\0\0\xeb\x04\0\0\0\0\0\0\xff\x03\0\0\0\0\x80\xeca\x01\0\0\x01\0\xd4\0\xa0\x1d\0 \x04 \0\0\0\0/\0\x01\0\xa0\x9dX\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0.\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\00\x9f\x9a\x01P\x9e\xb2\x01\0\x04\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x02\0!\0\x01\x08 \xff\xff\xff?\0\0\0\0\0\0@\0\0another_table\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x98\x08\0\0\x02@\0\0\0\0\0\0\n\0\0\0\x02\0\0\0\0@\0\0\0\0\0\0\0\0\0\0\0\0\x80\xbf\0\0\0\0\0\0\0\0\0\0pr\x01\0\0\0\0\0\0\0\0\x01d\0\0\0\0\0\0\x04\0\0\x01\0\0\0\0\0\0\0\x0c\x02\0\0\0\0\0\0\0\0\0\0\0\0\0\0/\0!\x80\x03+ \xff\xff\xff\x7f\0\0\0\0\0\xdf\x04\0\0pg_type\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0G\0\0\0\0\0\0\0\n\0\0\0\x02\0\0\0\0\0\0\0\0\0\0\0\x0e\0\0\0\0@\x16D\x0e\0\0\0K\x10\0\0\x01\0pr \0\0\0\0\0\0\0\0\x01n\0\0\0\0\0\xd6\x02\0\0\x01\0\0\0[\x01\0\0\0\0\0\0\0\t\x04\0\0\x02\0\0\0\x01\0\0\0\n\0\0\0\n\0\0\0\x7f\0\0\0\0\0\0\0\n\0\0\0\x02\0\0\0\0\0\0C\x01\0\0\x15\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0.\0!\x80\x03+ \xff\xff\xff\x7f\0\0\0\0\0;\n\0\0pg_statistic\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xfd.\0\0\0\0\0\0\n\0\0\0\x02\0\0\0;\n\0\0\0\0\0\0\x13\0\0\0\0\0\xcbC\x13\0\0\0\x18\x0b\0\0\x01\0pr\x1f\0\0\0\0\0\0\0\0\x01n\0\0\0\0\0\xd6\x02\0\0\x01\0\0\0C\x01\0\0\0\0\0\0\0\t\x04\0\0\x01\0\0\0\x01\0\0\0\n\0\0\0\n\0\0\0\x7f\0\0\0\0\0\0\x02\0\x01")
|
||||
}
|
||||
),
|
||||
(
|
||||
Lsn::from_str("0/16D4080").unwrap(),
|
||||
NeonWalRecord::Postgres {
|
||||
will_init: false,
|
||||
rec: Bytes::from_static(b"\xbc\0\0\0\0\0\0\0h?m\x01\0\0\0\0p\n\0\09\x08\xa3\xea\0 \x8c\0\x7f\x06\0\0\xd22\0\0\xeb\x04\0\0\0\0\0\0\xff\x02\0@\0\0another_table\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x98\x08\0\0\x02@\0\0\0\0\0\0\n\0\0\0\x02\0\0\0\0@\0\0\0\0\0\0\x05\0\0\0\0@zD\x05\0\0\0\0\0\0\0\0\0pr\x01\0\0\0\0\0\0\0\0\x01d\0\0\0\0\0\0\x04\0\0\x01\0\0\0\x02\0")
|
||||
}
|
||||
)
|
||||
]
|
||||
}
|
||||
|
||||
struct RedoHarness {
|
||||
// underscored because unused, except for removal at drop
|
||||
_repo_dir: tempfile::TempDir,
|
||||
manager: PostgresRedoManager,
|
||||
}
|
||||
|
||||
impl RedoHarness {
|
||||
fn new() -> anyhow::Result<Self> {
|
||||
let repo_dir = tempfile::tempdir()?;
|
||||
let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf());
|
||||
let conf = Box::leak(Box::new(conf));
|
||||
let tenant_id = TenantId::generate();
|
||||
|
||||
let manager = PostgresRedoManager::new(conf, tenant_id);
|
||||
|
||||
Ok(RedoHarness {
|
||||
_repo_dir: repo_dir,
|
||||
manager,
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -52,7 +52,7 @@ typedef struct
|
||||
#define NEON_TAG "[NEON_SMGR] "
|
||||
#define neon_log(tag, fmt, ...) ereport(tag, \
|
||||
(errmsg(NEON_TAG fmt, ##__VA_ARGS__), \
|
||||
errhidestmt(true), errhidecontext(true)))
|
||||
errhidestmt(true), errhidecontext(true), internalerrposition(0)))
|
||||
|
||||
/*
|
||||
* supertype of all the Neon*Request structs below
|
||||
|
||||
@@ -52,6 +52,7 @@
|
||||
#include "access/xlogdefs.h"
|
||||
#include "catalog/pg_class.h"
|
||||
#include "common/hashfn.h"
|
||||
#include "executor/instrument.h"
|
||||
#include "pagestore_client.h"
|
||||
#include "postmaster/interrupt.h"
|
||||
#include "postmaster/autovacuum.h"
|
||||
@@ -250,11 +251,6 @@ PrefetchState *MyPState;
|
||||
) \
|
||||
)
|
||||
|
||||
int n_prefetch_hits = 0;
|
||||
int n_prefetch_misses = 0;
|
||||
int n_prefetch_missed_caches = 0;
|
||||
int n_prefetch_dupes = 0;
|
||||
|
||||
XLogRecPtr prefetch_lsn = 0;
|
||||
|
||||
static bool compact_prefetch_buffers(void);
|
||||
@@ -291,12 +287,13 @@ compact_prefetch_buffers(void)
|
||||
|
||||
/*
|
||||
* Here we have established:
|
||||
* slots < search_ring_index may be unused (not scanned)
|
||||
* slots >= search_ring_index and <= empty_ring_index are unused
|
||||
* slots > empty_ring_index are in use, or outside our buffer's range.
|
||||
* slots < search_ring_index have an unknown state (not scanned)
|
||||
* slots >= search_ring_index and <= empty_ring_index are unused
|
||||
* slots > empty_ring_index are in use, or outside our buffer's range.
|
||||
* ... unless search_ring_index <= ring_last
|
||||
*
|
||||
* Therefore, there is a gap of at least one unused item between
|
||||
* search_ring_index and empty_ring_index, which grows as we hit
|
||||
* search_ring_index and empty_ring_index (both inclusive), which grows as we hit
|
||||
* more unused items while moving backwards through the array.
|
||||
*/
|
||||
|
||||
@@ -306,6 +303,7 @@ compact_prefetch_buffers(void)
|
||||
PrefetchRequest *target_slot;
|
||||
bool found;
|
||||
|
||||
/* update search index to an unprocessed entry */
|
||||
search_ring_index--;
|
||||
|
||||
source_slot = GetPrfSlot(search_ring_index);
|
||||
@@ -313,6 +311,7 @@ compact_prefetch_buffers(void)
|
||||
if (source_slot->status == PRFS_UNUSED)
|
||||
continue;
|
||||
|
||||
/* slot is used -- start moving slot */
|
||||
target_slot = GetPrfSlot(empty_ring_index);
|
||||
|
||||
Assert(source_slot->status == PRFS_RECEIVED);
|
||||
@@ -332,16 +331,22 @@ compact_prefetch_buffers(void)
|
||||
/* Adjust the location of our known-empty slot */
|
||||
empty_ring_index--;
|
||||
|
||||
/* empty the moved slot */
|
||||
source_slot->status = PRFS_UNUSED;
|
||||
source_slot->buftag = (BufferTag) {0};
|
||||
source_slot->response = NULL;
|
||||
source_slot->my_ring_index = 0;
|
||||
source_slot->effective_request_lsn = 0;
|
||||
|
||||
/* update bookkeeping */
|
||||
n_moved++;
|
||||
}
|
||||
|
||||
if (MyPState->ring_last != empty_ring_index)
|
||||
/*
|
||||
* Only when we've moved slots we can expect trailing unused slots,
|
||||
* so only then we clean up trailing unused slots.
|
||||
*/
|
||||
if (n_moved > 0)
|
||||
{
|
||||
prefetch_cleanup_trailing_unused();
|
||||
return true;
|
||||
@@ -770,7 +775,7 @@ prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_ls
|
||||
else
|
||||
{
|
||||
/* The buffered request is good enough, return that index */
|
||||
n_prefetch_dupes++;
|
||||
pgBufferUsage.prefetch.duplicates++;
|
||||
return ring_index;
|
||||
}
|
||||
}
|
||||
@@ -1845,7 +1850,7 @@ neon_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
|
||||
if (slot->effective_request_lsn >= request_lsn)
|
||||
{
|
||||
ring_index = slot->my_ring_index;
|
||||
n_prefetch_hits += 1;
|
||||
pgBufferUsage.prefetch.hits += 1;
|
||||
}
|
||||
else /* the current prefetch LSN is not large enough, so drop the prefetch */
|
||||
{
|
||||
@@ -1860,7 +1865,7 @@ neon_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
|
||||
}
|
||||
/* drop caches */
|
||||
prefetch_set_unused(slot->my_ring_index);
|
||||
n_prefetch_missed_caches += 1;
|
||||
pgBufferUsage.prefetch.expired += 1;
|
||||
/* make it look like a prefetch cache miss */
|
||||
entry = NULL;
|
||||
}
|
||||
@@ -1870,7 +1875,7 @@ neon_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
|
||||
{
|
||||
if (entry == NULL)
|
||||
{
|
||||
n_prefetch_misses += 1;
|
||||
pgBufferUsage.prefetch.misses += 1;
|
||||
|
||||
ring_index = prefetch_register_buffer(buftag, &request_latest,
|
||||
&request_lsn);
|
||||
|
||||
105
proxy/Cargo.toml
105
proxy/Cargo.toml
@@ -1,58 +1,63 @@
|
||||
[package]
|
||||
name = "proxy"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
license = "Apache-2.0"
|
||||
edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
anyhow = "1.0"
|
||||
atty = "0.2.14"
|
||||
base64 = "0.13.0"
|
||||
bstr = "1.0"
|
||||
bytes = { version = "1.0.1", features = ['serde'] }
|
||||
clap = "4.0"
|
||||
futures = "0.3.13"
|
||||
git-version = "0.3.5"
|
||||
hashbrown = "0.12"
|
||||
hex = "0.4.3"
|
||||
hmac = "0.12.1"
|
||||
hyper = "0.14"
|
||||
hyper-tungstenite = "0.8.1"
|
||||
itertools = "0.10.3"
|
||||
md5 = "0.7.0"
|
||||
once_cell = "1.13.0"
|
||||
parking_lot = "0.12"
|
||||
pin-project-lite = "0.2.7"
|
||||
rand = "0.8.3"
|
||||
regex = "1.4.5"
|
||||
reqwest = { version = "0.11", default-features = false, features = [ "json", "rustls-tls" ] }
|
||||
routerify = "3"
|
||||
rustls = "0.20.0"
|
||||
rustls-pemfile = "1"
|
||||
scopeguard = "1.1.0"
|
||||
serde = "1"
|
||||
serde_json = "1"
|
||||
sha2 = "0.10.2"
|
||||
socket2 = "0.4.4"
|
||||
thiserror = "1.0.30"
|
||||
tokio = { version = "1.17", features = ["macros"] }
|
||||
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
|
||||
tokio-rustls = "0.23.0"
|
||||
tls-listener = { version = "0.5.1", features = ["rustls", "hyper-h1"] }
|
||||
tracing = "0.1.36"
|
||||
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
|
||||
url = "2.2.2"
|
||||
uuid = { version = "1.2", features = ["v4", "serde"] }
|
||||
webpki-roots = "0.22.5"
|
||||
x509-parser = "0.14"
|
||||
anyhow.workspace = true
|
||||
atty.workspace = true
|
||||
base64.workspace = true
|
||||
bstr.workspace = true
|
||||
bytes = {workspace = true, features = ['serde'] }
|
||||
clap.workspace = true
|
||||
chrono.workspace = true
|
||||
consumption_metrics.workspace = true
|
||||
futures.workspace = true
|
||||
git-version.workspace = true
|
||||
hashbrown.workspace = true
|
||||
hex.workspace = true
|
||||
hmac.workspace = true
|
||||
hyper.workspace = true
|
||||
hyper-tungstenite.workspace = true
|
||||
itertools.workspace = true
|
||||
md5.workspace = true
|
||||
once_cell.workspace = true
|
||||
parking_lot.workspace = true
|
||||
pin-project-lite.workspace = true
|
||||
rand.workspace = true
|
||||
regex.workspace = true
|
||||
reqwest = { workspace = true, features = [ "json" ] }
|
||||
routerify.workspace = true
|
||||
rustls.workspace = true
|
||||
rustls-pemfile.workspace = true
|
||||
scopeguard.workspace = true
|
||||
serde.workspace = true
|
||||
serde_json.workspace = true
|
||||
sha2.workspace = true
|
||||
socket2.workspace = true
|
||||
thiserror.workspace = true
|
||||
tokio.workspace = true
|
||||
tokio-postgres.workspace = true
|
||||
tokio-rustls.workspace = true
|
||||
tls-listener.workspace = true
|
||||
tracing.workspace = true
|
||||
tracing-subscriber.workspace = true
|
||||
url.workspace = true
|
||||
uuid.workspace = true
|
||||
webpki-roots.workspace = true
|
||||
x509-parser.workspace = true
|
||||
metrics.workspace = true
|
||||
pq_proto.workspace = true
|
||||
utils.workspace = true
|
||||
prometheus.workspace = true
|
||||
humantime.workspace = true
|
||||
hostname.workspace = true
|
||||
|
||||
metrics = { path = "../libs/metrics" }
|
||||
pq_proto = { path = "../libs/pq_proto" }
|
||||
utils = { path = "../libs/utils" }
|
||||
workspace_hack = { version = "0.1", path = "../workspace_hack" }
|
||||
workspace_hack.workspace = true
|
||||
|
||||
[dev-dependencies]
|
||||
async-trait = "0.1"
|
||||
rcgen = "0.10"
|
||||
rstest = "0.15"
|
||||
tokio-postgres-rustls = "0.9.0"
|
||||
async-trait.workspace = true
|
||||
rcgen.workspace = true
|
||||
rstest.workspace = true
|
||||
tokio-postgres-rustls.workspace = true
|
||||
|
||||
@@ -42,9 +42,9 @@ pub enum AuthErrorImpl {
|
||||
MalformedPassword(&'static str),
|
||||
|
||||
#[error(
|
||||
"Project ID is not specified. \
|
||||
"Endpoint ID is not specified. \
|
||||
Either please upgrade the postgres client library (libpq) for SNI support \
|
||||
or pass the project ID (first part of the domain name) as a parameter: '?options=project%3D<project-id>'. \
|
||||
or pass the endpoint ID (first part of the domain name) as a parameter: '?options=project%3D<endpoint-id>'. \
|
||||
See more at https://neon.tech/sni"
|
||||
)]
|
||||
MissingProjectName,
|
||||
|
||||
@@ -5,6 +5,12 @@ use std::sync::Arc;
|
||||
pub struct ProxyConfig {
|
||||
pub tls_config: Option<TlsConfig>,
|
||||
pub auth_backend: auth::BackendType<'static, ()>,
|
||||
pub metric_collection_config: Option<MetricCollectionConfig>,
|
||||
}
|
||||
|
||||
pub struct MetricCollectionConfig {
|
||||
pub endpoint: reqwest::Url,
|
||||
pub interval: std::time::Duration,
|
||||
}
|
||||
|
||||
pub struct TlsConfig {
|
||||
|
||||
@@ -13,7 +13,7 @@ use std::convert::Infallible;
|
||||
use std::future::ready;
|
||||
use std::pin::Pin;
|
||||
use std::sync::Arc;
|
||||
use std::task::{Context, Poll};
|
||||
use std::task::{ready, Context, Poll};
|
||||
use tls_listener::TlsListener;
|
||||
|
||||
use tokio::io::{self, AsyncBufRead, AsyncRead, AsyncWrite, ReadBuf};
|
||||
@@ -104,10 +104,9 @@ impl AsyncRead for WebSocketRW {
|
||||
return Poll::Ready(Ok(()));
|
||||
}
|
||||
|
||||
let inner_buf = match self.as_mut().poll_fill_buf(cx) {
|
||||
Poll::Ready(Ok(buf)) => buf,
|
||||
Poll::Ready(Err(err)) => return Poll::Ready(Err(err)),
|
||||
Poll::Pending => return Poll::Pending,
|
||||
let inner_buf = match ready!(self.as_mut().poll_fill_buf(cx)) {
|
||||
Ok(buf) => buf,
|
||||
Err(err) => return Poll::Ready(Err(err)),
|
||||
};
|
||||
let len = std::cmp::min(inner_buf.len(), buf.remaining());
|
||||
buf.put_slice(&inner_buf[..len]);
|
||||
@@ -124,8 +123,8 @@ impl AsyncBufRead for WebSocketRW {
|
||||
let buf = self.project().chunk.as_ref().unwrap().chunk();
|
||||
return Poll::Ready(Ok(buf));
|
||||
} else {
|
||||
match self.as_mut().project().stream.poll_next(cx) {
|
||||
Poll::Ready(Some(Ok(message))) => match message {
|
||||
match ready!(self.as_mut().project().stream.poll_next(cx)) {
|
||||
Some(Ok(message)) => match message {
|
||||
Message::Text(_) => {}
|
||||
Message::Binary(chunk) => {
|
||||
*self.as_mut().project().chunk = Some(Bytes::from(chunk));
|
||||
@@ -142,9 +141,8 @@ impl AsyncBufRead for WebSocketRW {
|
||||
unreachable!();
|
||||
}
|
||||
},
|
||||
Poll::Ready(Some(Err(err))) => return Poll::Ready(Err(ws_err_into(err))),
|
||||
Poll::Ready(None) => return Poll::Ready(Ok(&[])),
|
||||
Poll::Pending => return Poll::Pending,
|
||||
Some(Err(err)) => return Poll::Ready(Err(ws_err_into(err))),
|
||||
None => return Poll::Ready(Ok(&[])),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -11,6 +11,7 @@ mod config;
|
||||
mod console;
|
||||
mod error;
|
||||
mod http;
|
||||
mod metrics;
|
||||
mod mgmt;
|
||||
mod parse;
|
||||
mod proxy;
|
||||
@@ -20,14 +21,14 @@ mod stream;
|
||||
mod url;
|
||||
mod waiters;
|
||||
|
||||
use ::metrics::set_build_info_metric;
|
||||
use anyhow::{bail, Context};
|
||||
use clap::{self, Arg};
|
||||
use config::ProxyConfig;
|
||||
use futures::FutureExt;
|
||||
use metrics::set_build_info_metric;
|
||||
use std::{borrow::Cow, future::Future, net::SocketAddr};
|
||||
use tokio::{net::TcpListener, task::JoinError};
|
||||
use tracing::info;
|
||||
use tracing::{info, info_span, Instrument};
|
||||
use utils::project_git_version;
|
||||
use utils::sentry_init::{init_sentry, release_name};
|
||||
|
||||
@@ -65,6 +66,22 @@ async fn main() -> anyhow::Result<()> {
|
||||
let mgmt_address: SocketAddr = arg_matches.get_one::<String>("mgmt").unwrap().parse()?;
|
||||
let http_address: SocketAddr = arg_matches.get_one::<String>("http").unwrap().parse()?;
|
||||
|
||||
let metric_collection_config = match
|
||||
(
|
||||
arg_matches.get_one::<String>("metric-collection-endpoint"),
|
||||
arg_matches.get_one::<String>("metric-collection-interval"),
|
||||
) {
|
||||
|
||||
(Some(endpoint), Some(interval)) => {
|
||||
Some(config::MetricCollectionConfig {
|
||||
endpoint: endpoint.parse()?,
|
||||
interval: humantime::parse_duration(interval)?,
|
||||
})
|
||||
}
|
||||
(None, None) => None,
|
||||
_ => bail!("either both or neither metric-collection-endpoint and metric-collection-interval must be specified"),
|
||||
};
|
||||
|
||||
let auth_backend = match arg_matches
|
||||
.get_one::<String>("auth-backend")
|
||||
.unwrap()
|
||||
@@ -95,6 +112,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
let config: &ProxyConfig = Box::leak(Box::new(ProxyConfig {
|
||||
tls_config,
|
||||
auth_backend,
|
||||
metric_collection_config,
|
||||
}));
|
||||
|
||||
info!("Version: {GIT_VERSION}");
|
||||
@@ -126,6 +144,21 @@ async fn main() -> anyhow::Result<()> {
|
||||
)));
|
||||
}
|
||||
|
||||
if let Some(metric_collection_config) = &config.metric_collection_config {
|
||||
let hostname = hostname::get()?
|
||||
.into_string()
|
||||
.map_err(|e| anyhow::anyhow!("failed to get hostname {e:?}"))?;
|
||||
|
||||
tasks.push(tokio::spawn(
|
||||
metrics::collect_metrics(
|
||||
&metric_collection_config.endpoint,
|
||||
metric_collection_config.interval,
|
||||
hostname,
|
||||
)
|
||||
.instrument(info_span!("collect_metrics")),
|
||||
));
|
||||
}
|
||||
|
||||
let tasks = tasks.into_iter().map(flatten_err);
|
||||
|
||||
set_build_info_metric(GIT_VERSION);
|
||||
@@ -199,6 +232,16 @@ fn cli() -> clap::Command {
|
||||
.alias("ssl-cert") // backwards compatibility
|
||||
.help("path to TLS cert for client postgres connections"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("metric-collection-endpoint")
|
||||
.long("metric-collection-endpoint")
|
||||
.help("metric collection HTTP endpoint"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("metric-collection-interval")
|
||||
.long("metric-collection-interval")
|
||||
.help("metric collection interval"),
|
||||
)
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
196
proxy/src/metrics.rs
Normal file
196
proxy/src/metrics.rs
Normal file
@@ -0,0 +1,196 @@
|
||||
//!
|
||||
//! Periodically collect proxy consumption metrics
|
||||
//! and push them to a HTTP endpoint.
|
||||
//!
|
||||
use chrono::{DateTime, Utc};
|
||||
use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_SIZE};
|
||||
use serde::Serialize;
|
||||
use std::{collections::HashMap, time::Duration};
|
||||
use tracing::{debug, error, log::info, trace};
|
||||
|
||||
const PROXY_IO_BYTES_PER_CLIENT: &str = "proxy_io_bytes_per_client";
|
||||
|
||||
///
|
||||
/// Key that uniquely identifies the object this metric describes.
|
||||
/// Currently, endpoint_id is enough, but this may change later,
|
||||
/// so keep it in a named struct.
|
||||
///
|
||||
/// Both the proxy and the ingestion endpoint will live in the same region (or cell)
|
||||
/// so while the project-id is unique across regions the whole pipeline will work correctly
|
||||
/// because we enrich the event with project_id in the control-plane endpoint.
|
||||
///
|
||||
#[derive(Eq, Hash, PartialEq, Serialize)]
|
||||
pub struct Ids {
|
||||
pub endpoint_id: String,
|
||||
}
|
||||
|
||||
pub async fn collect_metrics(
|
||||
metric_collection_endpoint: &reqwest::Url,
|
||||
metric_collection_interval: Duration,
|
||||
hostname: String,
|
||||
) -> anyhow::Result<()> {
|
||||
scopeguard::defer! {
|
||||
info!("collect_metrics has shut down");
|
||||
}
|
||||
|
||||
let mut ticker = tokio::time::interval(metric_collection_interval);
|
||||
|
||||
info!(
|
||||
"starting collect_metrics. metric_collection_endpoint: {}",
|
||||
metric_collection_endpoint
|
||||
);
|
||||
|
||||
// define client here to reuse it for all requests
|
||||
let client = reqwest::Client::new();
|
||||
let mut cached_metrics: HashMap<Ids, (u64, DateTime<Utc>)> = HashMap::new();
|
||||
|
||||
loop {
|
||||
tokio::select! {
|
||||
_ = ticker.tick() => {
|
||||
|
||||
match collect_metrics_iteration(&client, &mut cached_metrics, metric_collection_endpoint, hostname.clone()).await
|
||||
{
|
||||
Err(e) => {
|
||||
error!("Failed to send consumption metrics: {} ", e);
|
||||
},
|
||||
Ok(_) => { trace!("collect_metrics_iteration completed successfully") },
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn gather_proxy_io_bytes_per_client() -> Vec<(Ids, (u64, DateTime<Utc>))> {
|
||||
let mut current_metrics: Vec<(Ids, (u64, DateTime<Utc>))> = Vec::new();
|
||||
let metrics = prometheus::default_registry().gather();
|
||||
|
||||
for m in metrics {
|
||||
if m.get_name() == "proxy_io_bytes_per_client" {
|
||||
for ms in m.get_metric() {
|
||||
let direction = ms
|
||||
.get_label()
|
||||
.iter()
|
||||
.find(|l| l.get_name() == "direction")
|
||||
.unwrap()
|
||||
.get_value();
|
||||
|
||||
// Only collect metric for outbound traffic
|
||||
if direction == "tx" {
|
||||
let endpoint_id = ms
|
||||
.get_label()
|
||||
.iter()
|
||||
.find(|l| l.get_name() == "endpoint_id")
|
||||
.unwrap()
|
||||
.get_value();
|
||||
let value = ms.get_counter().get_value() as u64;
|
||||
|
||||
debug!("endpoint_id:val - {}: {}", endpoint_id, value);
|
||||
current_metrics.push((
|
||||
Ids {
|
||||
endpoint_id: endpoint_id.to_string(),
|
||||
},
|
||||
(value, Utc::now()),
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
current_metrics
|
||||
}
|
||||
|
||||
pub async fn collect_metrics_iteration(
|
||||
client: &reqwest::Client,
|
||||
cached_metrics: &mut HashMap<Ids, (u64, DateTime<Utc>)>,
|
||||
metric_collection_endpoint: &reqwest::Url,
|
||||
hostname: String,
|
||||
) -> anyhow::Result<()> {
|
||||
info!(
|
||||
"starting collect_metrics_iteration. metric_collection_endpoint: {}",
|
||||
metric_collection_endpoint
|
||||
);
|
||||
|
||||
let current_metrics = gather_proxy_io_bytes_per_client();
|
||||
|
||||
let metrics_to_send: Vec<Event<Ids>> = current_metrics
|
||||
.iter()
|
||||
.filter_map(|(curr_key, (curr_val, curr_time))| {
|
||||
let mut start_time = *curr_time;
|
||||
let mut value = *curr_val;
|
||||
|
||||
if let Some((prev_val, prev_time)) = cached_metrics.get(curr_key) {
|
||||
// Only send metrics updates if the metric has changed
|
||||
if curr_val - prev_val > 0 {
|
||||
value = curr_val - prev_val;
|
||||
start_time = *prev_time;
|
||||
} else {
|
||||
return None;
|
||||
}
|
||||
};
|
||||
|
||||
Some(Event {
|
||||
kind: EventType::Incremental {
|
||||
start_time,
|
||||
stop_time: *curr_time,
|
||||
},
|
||||
metric: PROXY_IO_BYTES_PER_CLIENT,
|
||||
idempotency_key: idempotency_key(hostname.clone()),
|
||||
value,
|
||||
extra: Ids {
|
||||
endpoint_id: curr_key.endpoint_id.clone(),
|
||||
},
|
||||
})
|
||||
})
|
||||
.collect();
|
||||
|
||||
if metrics_to_send.is_empty() {
|
||||
trace!("no new metrics to send");
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// Send metrics.
|
||||
// Split into chunks of 1000 metrics to avoid exceeding the max request size
|
||||
for chunk in metrics_to_send.chunks(CHUNK_SIZE) {
|
||||
let chunk_json = serde_json::value::to_raw_value(&EventChunk { events: chunk })
|
||||
.expect("ProxyConsumptionMetric should not fail serialization");
|
||||
|
||||
let res = client
|
||||
.post(metric_collection_endpoint.clone())
|
||||
.json(&chunk_json)
|
||||
.send()
|
||||
.await;
|
||||
|
||||
let res = match res {
|
||||
Ok(x) => x,
|
||||
Err(err) => {
|
||||
error!("failed to send metrics: {:?}", err);
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
if res.status().is_success() {
|
||||
// update cached metrics after they were sent successfully
|
||||
for send_metric in chunk {
|
||||
let stop_time = match send_metric.kind {
|
||||
EventType::Incremental { stop_time, .. } => stop_time,
|
||||
_ => unreachable!(),
|
||||
};
|
||||
|
||||
cached_metrics
|
||||
.entry(Ids {
|
||||
endpoint_id: send_metric.extra.endpoint_id.clone(),
|
||||
})
|
||||
// update cached value (add delta) and time
|
||||
.and_modify(|e| {
|
||||
e.0 += send_metric.value;
|
||||
e.1 = stop_time
|
||||
})
|
||||
// cache new metric
|
||||
.or_insert((send_metric.value, stop_time));
|
||||
}
|
||||
} else {
|
||||
error!("metrics endpoint refused the sent metrics: {:?}", res);
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
@@ -1,10 +1,5 @@
|
||||
[toolchain]
|
||||
# We try to stick to a toolchain version that is widely available on popular distributions, so that most people
|
||||
# can use the toolchain that comes with their operating system. But if there's a feature we miss badly from a later
|
||||
# version, we can consider updating.
|
||||
# See https://tracker.debian.org/pkg/rustc for more details on Debian rustc package,
|
||||
# we use "unstable" version number as the highest version used in the project by default.
|
||||
channel = "1.62.1"
|
||||
channel = "1.66.1"
|
||||
profile = "default"
|
||||
# The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy.
|
||||
# https://rust-lang.github.io/rustup/concepts/profiles.html
|
||||
|
||||
@@ -1,48 +1,48 @@
|
||||
[package]
|
||||
name = "safekeeper"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
license = "Apache-2.0"
|
||||
edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
async-stream = "0.3"
|
||||
anyhow = "1.0"
|
||||
async-trait = "0.1"
|
||||
byteorder = "1.4.3"
|
||||
bytes = "1.0.1"
|
||||
clap = { version = "4.0", features = ["derive"] }
|
||||
const_format = "0.2.21"
|
||||
crc32c = "0.6.0"
|
||||
fs2 = "0.4.3"
|
||||
git-version = "0.3.5"
|
||||
hex = "0.4.3"
|
||||
humantime = "2.1.0"
|
||||
hyper = "0.14"
|
||||
nix = "0.25"
|
||||
once_cell = "1.13.0"
|
||||
parking_lot = "0.12.1"
|
||||
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
|
||||
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
|
||||
regex = "1.4.5"
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
serde_json = "1"
|
||||
serde_with = "2.0"
|
||||
signal-hook = "0.3.10"
|
||||
thiserror = "1"
|
||||
tokio = { version = "1.17", features = ["macros", "fs"] }
|
||||
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" }
|
||||
toml_edit = { version = "0.14", features = ["easy"] }
|
||||
tracing = "0.1.27"
|
||||
url = "2.2.2"
|
||||
async-stream.workspace = true
|
||||
anyhow.workspace = true
|
||||
async-trait.workspace = true
|
||||
byteorder.workspace = true
|
||||
bytes.workspace = true
|
||||
clap = { workspace = true, features = ["derive"] }
|
||||
const_format.workspace = true
|
||||
crc32c.workspace = true
|
||||
fs2.workspace = true
|
||||
git-version.workspace = true
|
||||
hex.workspace = true
|
||||
humantime.workspace = true
|
||||
hyper.workspace = true
|
||||
nix.workspace = true
|
||||
once_cell.workspace = true
|
||||
parking_lot.workspace = true
|
||||
postgres.workspace = true
|
||||
postgres-protocol.workspace = true
|
||||
regex.workspace = true
|
||||
serde.workspace = true
|
||||
serde_json.workspace = true
|
||||
serde_with.workspace = true
|
||||
signal-hook.workspace = true
|
||||
thiserror.workspace = true
|
||||
tokio = { workspace = true, features = ["fs"] }
|
||||
tokio-postgres.workspace = true
|
||||
toml_edit.workspace = true
|
||||
tracing.workspace = true
|
||||
url.workspace = true
|
||||
metrics.workspace = true
|
||||
postgres_ffi.workspace = true
|
||||
pq_proto.workspace = true
|
||||
remote_storage.workspace = true
|
||||
safekeeper_api.workspace = true
|
||||
storage_broker.workspace = true
|
||||
utils.workspace = true
|
||||
|
||||
metrics = { path = "../libs/metrics" }
|
||||
postgres_ffi = { path = "../libs/postgres_ffi" }
|
||||
pq_proto = { path = "../libs/pq_proto" }
|
||||
remote_storage = { path = "../libs/remote_storage" }
|
||||
safekeeper_api = { path = "../libs/safekeeper_api" }
|
||||
storage_broker = { version = "0.1", path = "../storage_broker" }
|
||||
utils = { path = "../libs/utils" }
|
||||
workspace_hack = { version = "0.1", path = "../workspace_hack" }
|
||||
workspace_hack.workspace = true
|
||||
|
||||
[dev-dependencies]
|
||||
tempfile = "3.2"
|
||||
tempfile.workspace = true
|
||||
|
||||
@@ -466,7 +466,7 @@ impl Timeline {
|
||||
Ok(_) => Ok(()),
|
||||
Err(e) => {
|
||||
// Bootstrap failed, cancel timeline and remove timeline directory.
|
||||
self.cancel();
|
||||
self.cancel(shared_state);
|
||||
|
||||
if let Err(fs_err) = std::fs::remove_dir_all(&self.timeline_dir) {
|
||||
warn!(
|
||||
@@ -487,20 +487,23 @@ impl Timeline {
|
||||
shared_state: &mut MutexGuard<SharedState>,
|
||||
) -> Result<(bool, bool)> {
|
||||
let was_active = shared_state.active;
|
||||
self.cancel();
|
||||
self.cancel(shared_state);
|
||||
let dir_existed = delete_dir(&self.timeline_dir)?;
|
||||
Ok((dir_existed, was_active))
|
||||
}
|
||||
|
||||
/// Cancel timeline to prevent further usage. Background tasks will stop
|
||||
/// eventually after receiving cancellation signal.
|
||||
fn cancel(&self) {
|
||||
info!("Timeline {} is cancelled", self.ttid);
|
||||
fn cancel(&self, shared_state: &mut MutexGuard<SharedState>) {
|
||||
info!("timeline {} is cancelled", self.ttid);
|
||||
let _ = self.cancellation_tx.send(true);
|
||||
let res = self.wal_backup_launcher_tx.blocking_send(self.ttid);
|
||||
if let Err(e) = res {
|
||||
error!("Failed to send stop signal to wal_backup_launcher: {}", e);
|
||||
}
|
||||
// Close associated FDs. Nobody will be able to touch timeline data once
|
||||
// it is cancelled, so WAL storage won't be opened again.
|
||||
shared_state.sk.wal_store.close();
|
||||
}
|
||||
|
||||
/// Returns if timeline is cancelled.
|
||||
@@ -537,10 +540,6 @@ impl Timeline {
|
||||
/// De-register compute connection, shutting down timeline activity if
|
||||
/// pageserver doesn't need catchup.
|
||||
pub fn on_compute_disconnect(&self) -> Result<()> {
|
||||
if self.is_cancelled() {
|
||||
bail!(TimelineError::Cancelled(self.ttid));
|
||||
}
|
||||
|
||||
let is_wal_backup_action_pending: bool;
|
||||
{
|
||||
let mut shared_state = self.write_shared_state();
|
||||
|
||||
@@ -279,7 +279,9 @@ impl GlobalTimelines {
|
||||
let (dir_existed, was_active) = timeline.delete_from_disk(&mut shared_state)?;
|
||||
|
||||
// Remove timeline from the map.
|
||||
TIMELINES_STATE.lock().unwrap().timelines.remove(ttid);
|
||||
// FIXME: re-enable it once we fix the issue with recreation of deleted timelines
|
||||
// https://github.com/neondatabase/neon/issues/3146
|
||||
// TIMELINES_STATE.lock().unwrap().timelines.remove(ttid);
|
||||
|
||||
Ok(TimelineDeleteForceResult {
|
||||
dir_existed,
|
||||
@@ -346,15 +348,16 @@ impl GlobalTimelines {
|
||||
.tenant_dir(tenant_id),
|
||||
)?;
|
||||
|
||||
let tlis_after_delete = Self::get_all_for_tenant(*tenant_id);
|
||||
if !tlis_after_delete.is_empty() {
|
||||
// Some timelines were created while we were deleting them, returning error
|
||||
// to the caller, so it can retry later.
|
||||
bail!(
|
||||
"failed to delete all timelines for tenant {}: some timelines were created while we were deleting them",
|
||||
tenant_id
|
||||
);
|
||||
}
|
||||
// FIXME: we temporarily disabled removing timelines from the map, see `delete_force`
|
||||
// let tlis_after_delete = Self::get_all_for_tenant(*tenant_id);
|
||||
// if !tlis_after_delete.is_empty() {
|
||||
// // Some timelines were created while we were deleting them, returning error
|
||||
// // to the caller, so it can retry later.
|
||||
// bail!(
|
||||
// "failed to delete all timelines for tenant {}: some timelines were created while we were deleting them",
|
||||
// tenant_id
|
||||
// );
|
||||
// }
|
||||
|
||||
Ok(deleted)
|
||||
}
|
||||
|
||||
@@ -55,6 +55,12 @@ pub trait Storage {
|
||||
/// that without timeline lock.
|
||||
fn remove_up_to(&self) -> Box<dyn Fn(XLogSegNo) -> Result<()>>;
|
||||
|
||||
/// Release resources associated with the storage -- technically, close FDs.
|
||||
/// Currently we don't remove timelines until restart (#3146), so need to
|
||||
/// spare descriptors. This would be useful for temporary tli detach as
|
||||
/// well.
|
||||
fn close(&mut self) {}
|
||||
|
||||
/// Get metrics for this timeline.
|
||||
fn get_metrics(&self) -> WalStorageMetrics;
|
||||
}
|
||||
@@ -401,6 +407,11 @@ impl Storage for PhysicalStorage {
|
||||
})
|
||||
}
|
||||
|
||||
fn close(&mut self) {
|
||||
// close happens in destructor
|
||||
let _open_file = self.file.take();
|
||||
}
|
||||
|
||||
fn get_metrics(&self) -> WalStorageMetrics {
|
||||
self.metrics.clone()
|
||||
}
|
||||
|
||||
@@ -1,38 +1,38 @@
|
||||
[package]
|
||||
name = "storage_broker"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
license = "Apache-2.0"
|
||||
edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[features]
|
||||
bench = []
|
||||
|
||||
[dependencies]
|
||||
anyhow = "1.0"
|
||||
async-stream = "0.3"
|
||||
bytes = "1.0"
|
||||
clap = { version = "4.0", features = ["derive"] }
|
||||
const_format = "0.2.21"
|
||||
futures = "0.3"
|
||||
futures-core = "0.3"
|
||||
futures-util = "0.3"
|
||||
git-version = "0.3.5"
|
||||
humantime = "2.1.0"
|
||||
hyper = {version = "0.14.14", features = ["full"]}
|
||||
once_cell = "1.13.0"
|
||||
parking_lot = "0.12"
|
||||
prost = "0.11"
|
||||
tonic = {version = "0.8", features = ["tls", "tls-roots"]}
|
||||
tokio = { version = "1.0", features = ["macros", "rt-multi-thread"] }
|
||||
tokio-stream = "0.1"
|
||||
tracing = "0.1.27"
|
||||
anyhow.workspace = true
|
||||
async-stream.workspace = true
|
||||
bytes.workspace = true
|
||||
clap = { workspace = true, features = ["derive"] }
|
||||
const_format.workspace = true
|
||||
futures.workspace = true
|
||||
futures-core.workspace = true
|
||||
futures-util.workspace = true
|
||||
git-version.workspace = true
|
||||
humantime.workspace = true
|
||||
hyper = { workspace = true, features = ["full"] }
|
||||
once_cell.workspace = true
|
||||
parking_lot.workspace = true
|
||||
prost.workspace = true
|
||||
tonic.workspace = true
|
||||
tokio = { workspace = true, features = ["rt-multi-thread"] }
|
||||
tokio-stream.workspace = true
|
||||
tracing.workspace = true
|
||||
metrics.workspace = true
|
||||
utils.workspace = true
|
||||
|
||||
metrics = { path = "../libs/metrics" }
|
||||
utils = { path = "../libs/utils" }
|
||||
workspace_hack = { version = "0.1", path = "../workspace_hack" }
|
||||
workspace_hack.workspace = true
|
||||
|
||||
[build-dependencies]
|
||||
tonic-build = "0.8"
|
||||
tonic-build.workspace = true
|
||||
|
||||
[[bench]]
|
||||
name = "rps"
|
||||
|
||||
@@ -22,6 +22,7 @@ from itertools import chain, product
|
||||
from pathlib import Path
|
||||
from types import TracebackType
|
||||
from typing import Any, Dict, Iterator, List, Optional, Tuple, Type, Union, cast
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import asyncpg
|
||||
import backoff # type: ignore
|
||||
@@ -1350,11 +1351,18 @@ class PageserverHttpClient(requests.Session):
|
||||
assert res_json is None
|
||||
|
||||
def timeline_spawn_download_remote_layers(
|
||||
self, tenant_id: TenantId, timeline_id: TimelineId
|
||||
self,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
max_concurrent_downloads: int,
|
||||
) -> dict[str, Any]:
|
||||
|
||||
body = {
|
||||
"max_concurrent_downloads": max_concurrent_downloads,
|
||||
}
|
||||
res = self.post(
|
||||
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/download_remote_layers",
|
||||
json=body,
|
||||
)
|
||||
self.verbose_error(res)
|
||||
res_json = res.json()
|
||||
@@ -1388,10 +1396,13 @@ class PageserverHttpClient(requests.Session):
|
||||
self,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
max_concurrent_downloads: int,
|
||||
errors_ok=False,
|
||||
at_least_one_download=True,
|
||||
):
|
||||
res = self.timeline_spawn_download_remote_layers(tenant_id, timeline_id)
|
||||
res = self.timeline_spawn_download_remote_layers(
|
||||
tenant_id, timeline_id, max_concurrent_downloads
|
||||
)
|
||||
while True:
|
||||
completed = self.timeline_poll_download_remote_layers_status(
|
||||
tenant_id, timeline_id, res, poll_state="Completed"
|
||||
@@ -2323,6 +2334,8 @@ class NeonProxy(PgProtocol):
|
||||
http_port: int,
|
||||
mgmt_port: int,
|
||||
auth_backend: NeonProxy.AuthBackend,
|
||||
metric_collection_endpoint: Optional[str] = None,
|
||||
metric_collection_interval: Optional[str] = None,
|
||||
):
|
||||
host = "127.0.0.1"
|
||||
super().__init__(dsn=auth_backend.default_conn_url, host=host, port=proxy_port)
|
||||
@@ -2333,6 +2346,8 @@ class NeonProxy(PgProtocol):
|
||||
self.proxy_port = proxy_port
|
||||
self.mgmt_port = mgmt_port
|
||||
self.auth_backend = auth_backend
|
||||
self.metric_collection_endpoint = metric_collection_endpoint
|
||||
self.metric_collection_interval = metric_collection_interval
|
||||
self._popen: Optional[subprocess.Popen[bytes]] = None
|
||||
|
||||
def start(self) -> NeonProxy:
|
||||
@@ -2344,6 +2359,16 @@ class NeonProxy(PgProtocol):
|
||||
*["--mgmt", f"{self.host}:{self.mgmt_port}"],
|
||||
*self.auth_backend.extra_args(),
|
||||
]
|
||||
|
||||
if (
|
||||
self.metric_collection_endpoint is not None
|
||||
and self.metric_collection_interval is not None
|
||||
):
|
||||
args += [
|
||||
*["--metric-collection-endpoint", self.metric_collection_endpoint],
|
||||
*["--metric-collection-interval", self.metric_collection_interval],
|
||||
]
|
||||
|
||||
self._popen = subprocess.Popen(args)
|
||||
self._wait_until_ready()
|
||||
return self
|
||||
@@ -2357,6 +2382,25 @@ class NeonProxy(PgProtocol):
|
||||
request_result.raise_for_status()
|
||||
return request_result.text
|
||||
|
||||
@staticmethod
def get_session_id(uri_prefix, uri_line):
    """Return the session id carried in the path of ``uri_line``.

    ``uri_line`` must contain ``uri_prefix``. The session id is the URL path
    with its first character dropped, and it must be purely alphanumeric.
    """
    assert uri_prefix in uri_line

    parsed_path = urlparse(uri_line).path
    session_id = parsed_path[1:]
    assert session_id.isalnum(), "session_id should only contain alphanumeric chars"

    return session_id
|
||||
|
||||
@staticmethod
async def find_auth_link(link_auth_uri, proc):
    """Scan the stderr of ``proc`` for a line containing ``link_auth_uri``.

    Reads at most 100 lines, logging each one. Returns the first matching
    line (decoded as UTF-8 and stripped), or ``None`` if none of the lines
    read contains ``link_auth_uri``.
    """
    lines_left = 100
    while lines_left > 0:
        lines_left -= 1

        raw = await proc.stderr.readline()
        line = raw.decode("utf-8").strip()
        log.info(f"psql line: {line}")

        if link_auth_uri not in line:
            continue

        log.info(f"SUCCESS, found auth url: {line}")
        return line

    return None
|
||||
|
||||
def __enter__(self) -> NeonProxy:
|
||||
return self
|
||||
|
||||
@@ -2371,6 +2415,46 @@ class NeonProxy(PgProtocol):
|
||||
# it's a child process. This is mostly to clean up in between different tests.
|
||||
self._popen.kill()
|
||||
|
||||
@staticmethod
async def activate_link_auth(
    local_vanilla_pg, proxy_with_metric_collector, psql_session_id, create_user=True
):
    """Send a session activation message to the proxy's mgmt port.

    When ``create_user`` is true, ``local_vanilla_pg`` is started first and a
    superuser named ``proxy`` is created in it. A JSON "Success" result that
    points at ``local_vanilla_pg`` is then sent through ``PSQL`` to the host
    and mgmt port of ``proxy_with_metric_collector``; the reply read from
    stdout must be ``ok``.
    """
    pg_user = "proxy"

    if create_user:
        log.info("creating a new user for link auth test")
        local_vanilla_pg.start()
        local_vanilla_pg.safe_psql(f"create user {pg_user} with login superuser")

    pg_options = local_vanilla_pg.default_options
    aux_ids = {
        "project_id": "test_project_id",
        "endpoint_id": "test_endpoint_id",
        "branch_id": "test_branch_id",
    }
    connection_info = {
        "host": pg_options["host"],
        "port": pg_options["port"],
        "dbname": pg_options["dbname"],
        "user": pg_user,
        "aux": aux_ids,
    }
    activation_message = {
        "session_id": psql_session_id,
        "result": {"Success": connection_info},
    }
    db_info = json.dumps(activation_message)

    log.info("sending session activation message")
    mgmt_psql = PSQL(
        host=proxy_with_metric_collector.host,
        port=proxy_with_metric_collector.mgmt_port,
    )
    psql = await mgmt_psql.run(db_info)
    assert psql.stdout is not None

    reply = await psql.stdout.read()
    out = reply.decode("utf-8").strip()
    assert out == "ok"
|
||||
|
||||
|
||||
@pytest.fixture(scope="function")
|
||||
def link_proxy(port_distributor: PortDistributor, neon_binpath: Path) -> Iterator[NeonProxy]:
|
||||
|
||||
@@ -144,3 +144,42 @@ def test_tpch(query: LabelledQuery, remote_compare: RemoteCompare):
|
||||
"""
|
||||
|
||||
run_psql(remote_compare, query, times=1)
|
||||
|
||||
|
||||
@pytest.mark.remote_cluster
|
||||
def test_user_examples(remote_compare: RemoteCompare):
|
||||
query = LabelledQuery(
|
||||
"Q1",
|
||||
r"""
|
||||
SELECT
|
||||
v20.c2263 AS v1,
|
||||
v19.c2484 AS v2,
|
||||
DATE_TRUNC('month', v18.c37)::DATE AS v3,
|
||||
(ARRAY_AGG(c1840 order by v18.c37))[1] AS v4,
|
||||
(ARRAY_AGG(c1841 order by v18.c37 DESC))[1] AS v5,
|
||||
SUM(v17.c1843) AS v6,
|
||||
SUM(v17.c1844) AS v7,
|
||||
SUM(v17.c1848) AS v8,
|
||||
SUM(v17.c1845) AS v9,
|
||||
SUM(v17.c1846) AS v10,
|
||||
SUM(v17.c1861) AS v11,
|
||||
SUM(v17.c1860) AS v12,
|
||||
SUM(v17.c1869) AS v13,
|
||||
SUM(v17.c1856) AS v14,
|
||||
SUM(v17.c1855) AS v15,
|
||||
SUM(v17.c1854) AS v16
|
||||
FROM
|
||||
s3.t266 v17
|
||||
INNER JOIN s1.t41 v18 ON v18.c34 = v17.c1836
|
||||
INNER JOIN s3.t571 v19 ON v19.c2482 = v17.c1834
|
||||
INNER JOIN s3.t331 v20 ON v20.c2261 = v17.c1835
|
||||
WHERE
|
||||
(v17.c1835 = 4) AND
|
||||
(v18.c37 >= '2019-03-01') AND
|
||||
(v17.c1833 = 2)
|
||||
GROUP BY v1, v2, v3
|
||||
ORDER BY v1, v2, v3
|
||||
LIMIT 199;
|
||||
""",
|
||||
)
|
||||
run_psql(remote_compare, query, times=3)
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
FROM mcr.microsoft.com/dotnet/sdk:6.0 AS build
|
||||
FROM mcr.microsoft.com/dotnet/sdk:7.0 AS build
|
||||
WORKDIR /source
|
||||
|
||||
COPY *.csproj .
|
||||
@@ -7,7 +7,7 @@ RUN dotnet restore
|
||||
COPY . .
|
||||
RUN dotnet publish -c release -o /app --no-restore
|
||||
|
||||
FROM mcr.microsoft.com/dotnet/runtime:6.0
|
||||
FROM mcr.microsoft.com/dotnet/runtime:7.0
|
||||
WORKDIR /app
|
||||
COPY --from=build /app .
|
||||
|
||||
|
||||
@@ -2,13 +2,13 @@
|
||||
|
||||
<PropertyGroup>
|
||||
<OutputType>Exe</OutputType>
|
||||
<TargetFramework>net6.0</TargetFramework>
|
||||
<TargetFramework>net7.0</TargetFramework>
|
||||
<ImplicitUsings>enable</ImplicitUsings>
|
||||
<Nullable>enable</Nullable>
|
||||
</PropertyGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<PackageReference Include="Npgsql" Version="6.0.5" />
|
||||
<PackageReference Include="Npgsql" Version="7.0.0" />
|
||||
</ItemGroup>
|
||||
|
||||
</Project>
|
||||
|
||||
@@ -4,7 +4,7 @@ WORKDIR /source
|
||||
COPY . .
|
||||
|
||||
WORKDIR /app
|
||||
RUN curl --output postgresql.jar https://jdbc.postgresql.org/download/postgresql-42.4.0.jar && \
|
||||
RUN curl --output postgresql.jar https://jdbc.postgresql.org/download/postgresql-42.5.1.jar && \
|
||||
javac -d /app /source/Example.java
|
||||
|
||||
CMD ["java", "-cp", "/app/postgresql.jar:.", "Example"]
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
FROM python:3.10
|
||||
FROM python:3.11
|
||||
WORKDIR /source
|
||||
|
||||
COPY . .
|
||||
|
||||
@@ -1 +1 @@
|
||||
asyncpg==0.25.0
|
||||
asyncpg==0.27.0
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
FROM python:3.10
|
||||
FROM python:3.11
|
||||
WORKDIR /source
|
||||
|
||||
COPY . .
|
||||
|
||||
@@ -1 +1 @@
|
||||
pg8000==1.29.1
|
||||
pg8000==1.29.4
|
||||
|
||||
@@ -1,11 +1,11 @@
|
||||
FROM swift:5.6 AS build
|
||||
FROM swift:5.7 AS build
|
||||
RUN apt-get -q update && apt-get -q install -y libssl-dev
|
||||
WORKDIR /source
|
||||
|
||||
COPY . .
|
||||
RUN swift build --configuration release
|
||||
|
||||
FROM swift:5.6
|
||||
FROM swift:5.7
|
||||
WORKDIR /app
|
||||
COPY --from=build /source/.build/release/release .
|
||||
COPY --from=build /source/.build/release .
|
||||
CMD ["/app/PostgresClientKitExample"]
|
||||
|
||||
@@ -3,19 +3,28 @@
|
||||
{
|
||||
"identity" : "bluesocket",
|
||||
"kind" : "remoteSourceControl",
|
||||
"location" : "https://github.com/IBM-Swift/BlueSocket.git",
|
||||
"location" : "https://github.com/Kitura/BlueSocket.git",
|
||||
"state" : {
|
||||
"revision" : "dd924c3bc2c1c144c42b8dda3896f1a03115ded4",
|
||||
"version" : "2.0.2"
|
||||
"revision" : "7b23a867008e0027bfd6f4d398d44720707bc8ca",
|
||||
"version" : "2.0.4"
|
||||
}
|
||||
},
|
||||
{
|
||||
"identity" : "bluesslservice",
|
||||
"kind" : "remoteSourceControl",
|
||||
"location" : "https://github.com/IBM-Swift/BlueSSLService",
|
||||
"location" : "https://github.com/Kitura/BlueSSLService",
|
||||
"state" : {
|
||||
"revision" : "c249988fb748749739144e7f554710552acdc0bd",
|
||||
"version" : "2.0.1"
|
||||
"revision" : "b27a94d063962dfa1bba9f79814c4ef202cf33a4",
|
||||
"version" : "2.0.2"
|
||||
}
|
||||
},
|
||||
{
|
||||
"identity" : "openssl",
|
||||
"kind" : "remoteSourceControl",
|
||||
"location" : "https://github.com/Kitura/OpenSSL.git",
|
||||
"state" : {
|
||||
"revision" : "5dc8cb4f971135c17343e3c6df4f28904a0600e2",
|
||||
"version" : "2.3.1"
|
||||
}
|
||||
},
|
||||
{
|
||||
@@ -23,17 +32,8 @@
|
||||
"kind" : "remoteSourceControl",
|
||||
"location" : "https://github.com/codewinsdotcom/PostgresClientKit.git",
|
||||
"state" : {
|
||||
"branch" : "v1.4.3",
|
||||
"revision" : "beafedaea6dc9f04712e9a8547b77f47c406a47e"
|
||||
}
|
||||
},
|
||||
{
|
||||
"identity" : "swift-argument-parser",
|
||||
"kind" : "remoteSourceControl",
|
||||
"location" : "https://github.com/apple/swift-argument-parser",
|
||||
"state" : {
|
||||
"revision" : "6b2aa2748a7881eebb9f84fb10c01293e15b52ca",
|
||||
"version" : "0.5.0"
|
||||
"branch" : "v1.5.0",
|
||||
"revision" : "356ffe0c43722f192d796300557b3b530a0baebc"
|
||||
}
|
||||
}
|
||||
],
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
// swift-tools-version:5.6
|
||||
// swift-tools-version:5.7
|
||||
import PackageDescription
|
||||
|
||||
let package = Package(
|
||||
@@ -6,11 +6,11 @@ let package = Package(
|
||||
dependencies: [
|
||||
.package(
|
||||
url: "https://github.com/codewinsdotcom/PostgresClientKit.git",
|
||||
revision: "v1.4.3"
|
||||
revision: "v1.5.0"
|
||||
)
|
||||
],
|
||||
targets: [
|
||||
.target(
|
||||
.executableTarget(
|
||||
name: "PostgresClientKitExample",
|
||||
dependencies: [ "PostgresClientKit" ])
|
||||
]
|
||||
|
||||
@@ -16,7 +16,7 @@ do {
|
||||
configuration.user = user
|
||||
}
|
||||
if let password = env["NEON_PASSWORD"] {
|
||||
configuration.credential = .scramSHA256(password: password)
|
||||
configuration.credential = .cleartextPassword(password: password)
|
||||
}
|
||||
|
||||
let connection = try PostgresClientKit.Connection(configuration: configuration)
|
||||
|
||||
@@ -14,12 +14,9 @@ from fixtures.utils import subprocess_capture
|
||||
"csharp/npgsql",
|
||||
"java/jdbc",
|
||||
"python/asyncpg",
|
||||
"python/pg8000",
|
||||
pytest.param(
|
||||
"python/pg8000", # See https://github.com/neondatabase/neon/pull/2008#discussion_r912264281
|
||||
marks=pytest.mark.xfail(reason="Handles SSL in incompatible with Neon way"),
|
||||
),
|
||||
pytest.param(
|
||||
"swift/PostgresClientKit", # See https://github.com/neondatabase/neon/pull/2008#discussion_r911896592
|
||||
"swift/PostgresClientKitExample", # See https://github.com/neondatabase/neon/pull/2008#discussion_r911896592
|
||||
marks=pytest.mark.xfail(reason="Neither SNI nor parameters is supported"),
|
||||
),
|
||||
"typescript/postgresql-client",
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
FROM node:16
|
||||
FROM node:18
|
||||
WORKDIR /source
|
||||
|
||||
COPY . .
|
||||
RUN npm clean-install
|
||||
|
||||
CMD ["/source/index.js"]
|
||||
CMD ["/source/index.js"]
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user