mirror of
https://github.com/neondatabase/neon.git
synced 2026-06-02 04:50:38 +00:00
Compare commits
10 Commits
skyzh/pass
...
vlad/perf-
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
c4ce5dda5d | ||
|
|
74c555fd06 | ||
|
|
716bb3c361 | ||
|
|
b90b945c02 | ||
|
|
ac1159cb33 | ||
|
|
74a7f68da0 | ||
|
|
2acca9fe33 | ||
|
|
a2b9ff0d40 | ||
|
|
a7f60dd5d0 | ||
|
|
2a1cbca4e5 |
2
.github/actionlint.yml
vendored
2
.github/actionlint.yml
vendored
@@ -33,5 +33,3 @@ config-variables:
|
||||
- NEON_PROD_AWS_ACCOUNT_ID
|
||||
- AWS_ECR_REGION
|
||||
- BENCHMARK_LARGE_OLTP_PROJECTID
|
||||
- SLACK_ON_CALL_DEVPROD_STREAM
|
||||
- SLACK_RUST_CHANNEL_ID
|
||||
|
||||
6
.github/workflows/_meta.yml
vendored
6
.github/workflows/_meta.yml
vendored
@@ -19,7 +19,7 @@ on:
|
||||
description: "Tag of the last compute release"
|
||||
value: ${{ jobs.tags.outputs.compute }}
|
||||
run-kind:
|
||||
description: "The kind of run we're currently in. Will be one of `push-main`, `storage-release`, `compute-release`, `proxy-release`, `storage-rc-pr`, `compute-rc-pr`, `proxy-rc-pr`, `pr`, or `workflow-dispatch`"
|
||||
description: "The kind of run we're currently in. Will be one of `pr`, `push-main`, `storage-rc`, `storage-release`, `proxy-rc`, `proxy-release`, `compute-rc`, `compute-release` or `merge_queue`"
|
||||
value: ${{ jobs.tags.outputs.run-kind }}
|
||||
|
||||
permissions: {}
|
||||
@@ -55,7 +55,6 @@ jobs:
|
||||
|| (inputs.github-event-name == 'pull_request' && github.base_ref == 'release-compute') && 'compute-rc-pr'
|
||||
|| (inputs.github-event-name == 'pull_request' && github.base_ref == 'release-proxy') && 'proxy-rc-pr'
|
||||
|| (inputs.github-event-name == 'pull_request') && 'pr'
|
||||
|| (inputs.github-event-name == 'workflow_dispatch') && 'workflow-dispatch'
|
||||
|| 'unknown'
|
||||
}}
|
||||
run: |
|
||||
@@ -86,9 +85,6 @@ jobs:
|
||||
BUILD_AND_TEST_RUN_ID=$(gh run list -b $CURRENT_BRANCH -c $CURRENT_SHA -w 'Build and Test' -L 1 --json databaseId --jq '.[].databaseId')
|
||||
echo "tag=$BUILD_AND_TEST_RUN_ID" | tee -a $GITHUB_OUTPUT
|
||||
;;
|
||||
workflow-dispatch)
|
||||
echo "tag=$GITHUB_RUN_ID" | tee -a $GITHUB_OUTPUT
|
||||
;;
|
||||
*)
|
||||
echo "Unexpected RUN_KIND ('${RUN_KIND}'), failing to assign build-tag!"
|
||||
exit 1
|
||||
|
||||
2
.github/workflows/build_and_test.yml
vendored
2
.github/workflows/build_and_test.yml
vendored
@@ -1231,7 +1231,7 @@ jobs:
|
||||
payload: |
|
||||
channel: ${{ vars.SLACK_STORAGE_CHANNEL_ID }}
|
||||
text: |
|
||||
🔴 <!subteam^S06CJ87UMNY|@oncall-storage>: deploy job on release branch had unexpected status "${{ needs.deploy.result }}" <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>.
|
||||
🔴 @oncall-storage: deploy job on release branch had unexpected status "${{ needs.deploy.result }}" <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>.
|
||||
|
||||
# The job runs on `release` branch and copies compatibility data and Neon artifact from the last *release PR* to the latest directory
|
||||
promote-compatibility-data:
|
||||
|
||||
7
.github/workflows/cargo-deny.yml
vendored
7
.github/workflows/cargo-deny.yml
vendored
@@ -7,7 +7,7 @@ on:
|
||||
required: false
|
||||
type: string
|
||||
schedule:
|
||||
- cron: '0 10 * * *'
|
||||
- cron: '0 0 * * *'
|
||||
|
||||
jobs:
|
||||
cargo-deny:
|
||||
@@ -50,9 +50,8 @@ jobs:
|
||||
method: chat.postMessage
|
||||
token: ${{ secrets.SLACK_BOT_TOKEN }}
|
||||
payload: |
|
||||
channel: ${{ vars.SLACK_ON_CALL_DEVPROD_STREAM }}
|
||||
channel: ${{ vars.SLACK_CICD_CHANNEL_ID }}
|
||||
text: |
|
||||
Periodic cargo-deny on ${{ matrix.ref }}: ${{ job.status }}
|
||||
<${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>
|
||||
Fixing the problem should be fairly straight forward from the logs. If not, <#${{ vars.SLACK_RUST_CHANNEL_ID }}> is there to help.
|
||||
Pinging <!subteam^S0838JPSH32|@oncall-devprod>.
|
||||
Pinging @oncall-devprod.
|
||||
|
||||
12
.github/workflows/periodic_pagebench.yml
vendored
12
.github/workflows/periodic_pagebench.yml
vendored
@@ -3,12 +3,12 @@ name: Periodic pagebench performance test on dedicated EC2 machine in eu-central
|
||||
on:
|
||||
schedule:
|
||||
# * is a special character in YAML so you have to quote this string
|
||||
# ┌───────────── minute (0 - 59)
|
||||
# │ ┌───────────── hour (0 - 23)
|
||||
# │ │ ┌───────────── day of the month (1 - 31)
|
||||
# │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
|
||||
# │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
|
||||
- cron: '0 */3 * * *' # Runs every 3 hours
|
||||
# ┌───────────── minute (0 - 59)
|
||||
# │ ┌───────────── hour (0 - 23)
|
||||
# │ │ ┌───────────── day of the month (1 - 31)
|
||||
# │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
|
||||
# │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
|
||||
- cron: '0 18 * * *' # Runs at 6 PM UTC every day
|
||||
workflow_dispatch: # Allows manual triggering of the workflow
|
||||
inputs:
|
||||
commit_hash:
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
# Autoscaling
|
||||
/libs/vm_monitor/ @neondatabase/autoscaling
|
||||
|
||||
# DevProd & PerfCorr
|
||||
/.github/ @neondatabase/developer-productivity @neondatabase/performance-correctness
|
||||
# DevProd
|
||||
/.github/ @neondatabase/developer-productivity
|
||||
|
||||
# Compute
|
||||
/pgxn/ @neondatabase/compute
|
||||
|
||||
139
Cargo.lock
generated
139
Cargo.lock
generated
@@ -1127,9 +1127,9 @@ checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5"
|
||||
|
||||
[[package]]
|
||||
name = "cc"
|
||||
version = "1.2.16"
|
||||
version = "1.1.30"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "be714c154be609ec7f5dad223a33bf1482fff90472de28f7362806e6d4832b8c"
|
||||
checksum = "b16803a61b81d9eabb7eae2588776c4c1e584b738ede45fdbb4c972cec1e9945"
|
||||
dependencies = [
|
||||
"jobserver",
|
||||
"libc",
|
||||
@@ -2848,7 +2848,6 @@ dependencies = [
|
||||
"anyhow",
|
||||
"bytes",
|
||||
"fail",
|
||||
"futures",
|
||||
"hyper 0.14.30",
|
||||
"itertools 0.10.5",
|
||||
"jemalloc_pprof",
|
||||
@@ -2862,7 +2861,6 @@ dependencies = [
|
||||
"serde_path_to_error",
|
||||
"thiserror 1.0.69",
|
||||
"tokio",
|
||||
"tokio-rustls 0.26.0",
|
||||
"tokio-stream",
|
||||
"tokio-util",
|
||||
"tracing",
|
||||
@@ -4064,7 +4062,7 @@ dependencies = [
|
||||
"opentelemetry-http",
|
||||
"opentelemetry-proto",
|
||||
"opentelemetry_sdk",
|
||||
"prost 0.13.3",
|
||||
"prost",
|
||||
"reqwest",
|
||||
"thiserror 1.0.69",
|
||||
]
|
||||
@@ -4077,7 +4075,7 @@ checksum = "a6e05acbfada5ec79023c85368af14abd0b307c015e9064d249b2a950ef459a6"
|
||||
dependencies = [
|
||||
"opentelemetry",
|
||||
"opentelemetry_sdk",
|
||||
"prost 0.13.3",
|
||||
"prost",
|
||||
"tonic",
|
||||
]
|
||||
|
||||
@@ -4191,7 +4189,6 @@ dependencies = [
|
||||
"pageserver_api",
|
||||
"pageserver_client",
|
||||
"rand 0.8.5",
|
||||
"reqwest",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"tokio",
|
||||
@@ -4281,9 +4278,6 @@ dependencies = [
|
||||
"remote_storage",
|
||||
"reqwest",
|
||||
"rpds",
|
||||
"rustls 0.23.18",
|
||||
"rustls-pemfile 2.1.1",
|
||||
"rustls-pki-types",
|
||||
"scopeguard",
|
||||
"send-future",
|
||||
"serde",
|
||||
@@ -4302,15 +4296,14 @@ dependencies = [
|
||||
"tokio-epoll-uring",
|
||||
"tokio-io-timeout",
|
||||
"tokio-postgres",
|
||||
"tokio-rustls 0.26.0",
|
||||
"tokio-stream",
|
||||
"tokio-tar",
|
||||
"tokio-util",
|
||||
"toml_edit",
|
||||
"tracing",
|
||||
"tracing-utils",
|
||||
"url",
|
||||
"utils",
|
||||
"uuid",
|
||||
"wal_decoder",
|
||||
"walkdir",
|
||||
"workspace_hack",
|
||||
@@ -4345,6 +4338,7 @@ dependencies = [
|
||||
"strum",
|
||||
"strum_macros",
|
||||
"thiserror 1.0.69",
|
||||
"tracing-utils",
|
||||
"utils",
|
||||
]
|
||||
|
||||
@@ -4837,10 +4831,8 @@ dependencies = [
|
||||
"nix 0.26.4",
|
||||
"once_cell",
|
||||
"parking_lot 0.12.1",
|
||||
"prost 0.12.6",
|
||||
"prost-build 0.12.6",
|
||||
"prost-derive 0.12.6",
|
||||
"sha2",
|
||||
"protobuf",
|
||||
"protobuf-codegen-pure",
|
||||
"smallvec",
|
||||
"symbolic-demangle",
|
||||
"tempfile",
|
||||
@@ -4859,7 +4851,7 @@ dependencies = [
|
||||
"inferno 0.12.0",
|
||||
"num",
|
||||
"paste",
|
||||
"prost 0.13.3",
|
||||
"prost",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -4952,16 +4944,6 @@ dependencies = [
|
||||
"thiserror 1.0.69",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "prost"
|
||||
version = "0.12.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "deb1435c188b76130da55f17a466d252ff7b1418b2ad3e037d127b94e3411f29"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"prost-derive 0.12.6",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "prost"
|
||||
version = "0.13.3"
|
||||
@@ -4969,28 +4951,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7b0487d90e047de87f984913713b85c601c05609aad5b0df4b4573fbf69aa13f"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"prost-derive 0.13.3",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "prost-build"
|
||||
version = "0.12.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "22505a5c94da8e3b7c2996394d1c933236c4d743e81a410bcca4e6989fc066a4"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"heck",
|
||||
"itertools 0.10.5",
|
||||
"log",
|
||||
"multimap",
|
||||
"once_cell",
|
||||
"petgraph",
|
||||
"prettyplease",
|
||||
"prost 0.12.6",
|
||||
"prost-types 0.12.6",
|
||||
"regex",
|
||||
"syn 2.0.90",
|
||||
"tempfile",
|
||||
"prost-derive",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -5007,26 +4968,13 @@ dependencies = [
|
||||
"once_cell",
|
||||
"petgraph",
|
||||
"prettyplease",
|
||||
"prost 0.13.3",
|
||||
"prost-types 0.13.3",
|
||||
"prost",
|
||||
"prost-types",
|
||||
"regex",
|
||||
"syn 2.0.90",
|
||||
"tempfile",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "prost-derive"
|
||||
version = "0.12.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "81bddcdb20abf9501610992b6759a4c888aef7d1a7247ef75e2404275ac24af1"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"itertools 0.10.5",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.90",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "prost-derive"
|
||||
version = "0.13.3"
|
||||
@@ -5040,22 +4988,38 @@ dependencies = [
|
||||
"syn 2.0.90",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "prost-types"
|
||||
version = "0.12.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9091c90b0a32608e984ff2fa4091273cbdd755d54935c51d520887f4a1dbd5b0"
|
||||
dependencies = [
|
||||
"prost 0.12.6",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "prost-types"
|
||||
version = "0.13.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4759aa0d3a6232fb8dbdb97b61de2c20047c68aca932c7ed76da9d788508d670"
|
||||
dependencies = [
|
||||
"prost 0.13.3",
|
||||
"prost",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "protobuf"
|
||||
version = "2.28.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "106dd99e98437432fed6519dedecfade6a06a73bb7b2a1e019fdd2bee5778d94"
|
||||
|
||||
[[package]]
|
||||
name = "protobuf-codegen"
|
||||
version = "2.28.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "033460afb75cf755fcfc16dfaed20b86468082a2ea24e05ac35ab4a099a017d6"
|
||||
dependencies = [
|
||||
"protobuf",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "protobuf-codegen-pure"
|
||||
version = "2.28.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "95a29399fc94bcd3eeaa951c715f7bea69409b2445356b00519740bcd6ddd865"
|
||||
dependencies = [
|
||||
"protobuf",
|
||||
"protobuf-codegen",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -5664,16 +5628,16 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "ring"
|
||||
version = "0.17.13"
|
||||
version = "0.17.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "70ac5d832aa16abd7d1def883a8545280c20a60f523a370aa3a9617c2b8550ee"
|
||||
checksum = "684d5e6e18f669ccebf64a92236bb7db9a34f07be010e3627368182027180866"
|
||||
dependencies = [
|
||||
"cc",
|
||||
"cfg-if",
|
||||
"getrandom 0.2.11",
|
||||
"libc",
|
||||
"spin",
|
||||
"untrusted",
|
||||
"windows-sys 0.52.0",
|
||||
"windows-sys 0.48.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -5915,9 +5879,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "rustls-pki-types"
|
||||
version = "1.11.0"
|
||||
version = "1.10.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "917ce264624a4b4db1c364dcc35bfca9ded014d0a958cd47ad3e960e988ea51c"
|
||||
checksum = "16f1201b3c9a7ee8039bcadc17b7e605e2945b27eee7631788c1bd2b0643674b"
|
||||
|
||||
[[package]]
|
||||
name = "rustls-webpki"
|
||||
@@ -6603,7 +6567,7 @@ dependencies = [
|
||||
"metrics",
|
||||
"once_cell",
|
||||
"parking_lot 0.12.1",
|
||||
"prost 0.13.3",
|
||||
"prost",
|
||||
"rustls 0.23.18",
|
||||
"tokio",
|
||||
"tonic",
|
||||
@@ -7375,7 +7339,7 @@ dependencies = [
|
||||
"hyper-util",
|
||||
"percent-encoding",
|
||||
"pin-project",
|
||||
"prost 0.13.3",
|
||||
"prost",
|
||||
"rustls-native-certs 0.8.0",
|
||||
"rustls-pemfile 2.1.1",
|
||||
"tokio",
|
||||
@@ -7395,8 +7359,8 @@ checksum = "9557ce109ea773b399c9b9e5dca39294110b74f1f342cb347a80d1fce8c26a11"
|
||||
dependencies = [
|
||||
"prettyplease",
|
||||
"proc-macro2",
|
||||
"prost-build 0.13.3",
|
||||
"prost-types 0.13.3",
|
||||
"prost-build",
|
||||
"prost-types",
|
||||
"quote",
|
||||
"syn 2.0.90",
|
||||
]
|
||||
@@ -7604,6 +7568,7 @@ dependencies = [
|
||||
"opentelemetry-otlp",
|
||||
"opentelemetry-semantic-conventions",
|
||||
"opentelemetry_sdk",
|
||||
"pin-project-lite",
|
||||
"tokio",
|
||||
"tracing",
|
||||
"tracing-opentelemetry",
|
||||
@@ -7844,6 +7809,7 @@ dependencies = [
|
||||
"tracing",
|
||||
"tracing-error",
|
||||
"tracing-subscriber",
|
||||
"tracing-utils",
|
||||
"walkdir",
|
||||
]
|
||||
|
||||
@@ -7929,7 +7895,7 @@ dependencies = [
|
||||
"pageserver_api",
|
||||
"postgres_ffi",
|
||||
"pprof",
|
||||
"prost 0.13.3",
|
||||
"prost",
|
||||
"remote_storage",
|
||||
"serde",
|
||||
"serde_json",
|
||||
@@ -8386,7 +8352,6 @@ dependencies = [
|
||||
"hyper-util",
|
||||
"indexmap 1.9.3",
|
||||
"indexmap 2.0.1",
|
||||
"itertools 0.10.5",
|
||||
"itertools 0.12.1",
|
||||
"lazy_static",
|
||||
"libc",
|
||||
@@ -8405,7 +8370,7 @@ dependencies = [
|
||||
"parquet",
|
||||
"prettyplease",
|
||||
"proc-macro2",
|
||||
"prost 0.13.3",
|
||||
"prost",
|
||||
"quote",
|
||||
"rand 0.8.5",
|
||||
"regex",
|
||||
|
||||
@@ -139,7 +139,7 @@ parquet = { version = "53", default-features = false, features = ["zstd"] }
|
||||
parquet_derive = "53"
|
||||
pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
|
||||
pin-project-lite = "0.2"
|
||||
pprof = { version = "0.14", features = ["criterion", "flamegraph", "frame-pointer", "prost-codec"] }
|
||||
pprof = { version = "0.14", features = ["criterion", "flamegraph", "frame-pointer", "protobuf", "protobuf-codec"] }
|
||||
procfs = "0.16"
|
||||
prometheus = {version = "0.13", default-features=false, features = ["process"]} # removes protobuf dependency
|
||||
prost = "0.13"
|
||||
@@ -155,7 +155,6 @@ rpds = "0.13"
|
||||
rustc-hash = "1.1.0"
|
||||
rustls = { version = "0.23.16", default-features = false }
|
||||
rustls-pemfile = "2"
|
||||
rustls-pki-types = "1.11"
|
||||
scopeguard = "1.1"
|
||||
sysinfo = "0.29.2"
|
||||
sd-notify = "0.4.1"
|
||||
|
||||
@@ -1980,10 +1980,12 @@ COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/neo
|
||||
RUN echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig
|
||||
|
||||
# rsyslog config permissions
|
||||
# directory for rsyslogd pid file
|
||||
RUN mkdir /var/run/rsyslogd && \
|
||||
chown -R postgres:postgres /var/run/rsyslogd && \
|
||||
chown -R postgres:postgres /etc/rsyslog.d/
|
||||
RUN chown postgres:postgres /etc/rsyslog.conf && \
|
||||
touch /etc/compute_rsyslog.conf && \
|
||||
chown -R postgres:postgres /etc/compute_rsyslog.conf && \
|
||||
# directory for rsyslogd pid file
|
||||
mkdir /var/run/rsyslogd && \
|
||||
chown -R postgres:postgres /var/run/rsyslogd
|
||||
|
||||
|
||||
ENV LANG=en_US.utf8
|
||||
|
||||
@@ -1,5 +1 @@
|
||||
SELECT sum(pg_database_size(datname)) AS total
|
||||
FROM pg_database
|
||||
-- Ignore invalid databases, as we will likely have problems with
|
||||
-- getting their size from the Pageserver.
|
||||
WHERE datconnlimit != -2;
|
||||
SELECT sum(pg_database_size(datname)) AS total FROM pg_database;
|
||||
|
||||
@@ -1,20 +1,10 @@
|
||||
-- We export stats for 10 non-system databases. Without this limit it is too
|
||||
-- easy to abuse the system by creating lots of databases.
|
||||
|
||||
SELECT pg_database_size(datname) AS db_size,
|
||||
deadlocks,
|
||||
tup_inserted AS inserted,
|
||||
tup_updated AS updated,
|
||||
tup_deleted AS deleted,
|
||||
datname
|
||||
SELECT pg_database_size(datname) AS db_size, deadlocks, tup_inserted AS inserted,
|
||||
tup_updated AS updated, tup_deleted AS deleted, datname
|
||||
FROM pg_stat_database
|
||||
WHERE datname IN (
|
||||
SELECT datname FROM pg_database
|
||||
-- Ignore invalid databases, as we will likely have problems with
|
||||
-- getting their size from the Pageserver.
|
||||
WHERE datconnlimit != -2
|
||||
AND datname <> 'postgres'
|
||||
AND NOT datistemplate
|
||||
ORDER BY oid
|
||||
LIMIT 10
|
||||
WHERE datname <> 'postgres' AND NOT datistemplate ORDER BY oid LIMIT 10
|
||||
);
|
||||
|
||||
@@ -39,10 +39,6 @@ commands:
|
||||
user: nobody
|
||||
sysvInitAction: respawn
|
||||
shell: '/bin/sql_exporter -config.file=/etc/sql_exporter_autoscaling.yml -web.listen-address=:9499'
|
||||
- name: rsyslogd
|
||||
user: postgres
|
||||
sysvInitAction: respawn
|
||||
shell: '/usr/sbin/rsyslogd -n -i /var/run/rsyslogd/rsyslogd.pid -f /etc/compute_rsyslog.conf'
|
||||
shutdownHook: |
|
||||
su -p postgres --session-command '/usr/local/bin/pg_ctl stop -D /var/db/postgres/compute/pgdata -m fast --wait -t 10'
|
||||
files:
|
||||
@@ -73,12 +69,6 @@ files:
|
||||
}
|
||||
memory {}
|
||||
}
|
||||
# Create dummy rsyslog config, because it refuses to start without at least one action configured.
|
||||
# compute_ctl will rewrite this file with the actual configuration, if needed.
|
||||
- filename: compute_rsyslog.conf
|
||||
content: |
|
||||
*.* /dev/null
|
||||
$IncludeConfig /etc/rsyslog.d/*.conf
|
||||
build: |
|
||||
# Build cgroup-tools
|
||||
#
|
||||
@@ -142,12 +132,6 @@ merge: |
|
||||
RUN set -e \
|
||||
&& chmod 0644 /etc/cgconfig.conf
|
||||
|
||||
|
||||
COPY compute_rsyslog.conf /etc/compute_rsyslog.conf
|
||||
RUN chmod 0666 /etc/compute_rsyslog.conf
|
||||
RUN chmod 0666 /var/log/
|
||||
|
||||
|
||||
COPY --from=libcgroup-builder /libcgroup-install/bin/* /usr/bin/
|
||||
COPY --from=libcgroup-builder /libcgroup-install/lib/* /usr/lib/
|
||||
COPY --from=libcgroup-builder /libcgroup-install/sbin/* /usr/sbin/
|
||||
|
||||
@@ -39,10 +39,6 @@ commands:
|
||||
user: nobody
|
||||
sysvInitAction: respawn
|
||||
shell: '/bin/sql_exporter -config.file=/etc/sql_exporter_autoscaling.yml -web.listen-address=:9499'
|
||||
- name: rsyslogd
|
||||
user: postgres
|
||||
sysvInitAction: respawn
|
||||
shell: '/usr/sbin/rsyslogd -n -i /var/run/rsyslogd/rsyslogd.pid -f /etc/compute_rsyslog.conf'
|
||||
shutdownHook: |
|
||||
su -p postgres --session-command '/usr/local/bin/pg_ctl stop -D /var/db/postgres/compute/pgdata -m fast --wait -t 10'
|
||||
files:
|
||||
@@ -73,12 +69,6 @@ files:
|
||||
}
|
||||
memory {}
|
||||
}
|
||||
# Create dummy rsyslog config, because it refuses to start without at least one action configured.
|
||||
# compute_ctl will rewrite this file with the actual configuration, if needed.
|
||||
- filename: compute_rsyslog.conf
|
||||
content: |
|
||||
*.* /dev/null
|
||||
$IncludeConfig /etc/rsyslog.d/*.conf
|
||||
build: |
|
||||
# Build cgroup-tools
|
||||
#
|
||||
@@ -138,11 +128,6 @@ merge: |
|
||||
RUN set -e \
|
||||
&& chmod 0644 /etc/cgconfig.conf
|
||||
|
||||
COPY compute_rsyslog.conf /etc/compute_rsyslog.conf
|
||||
RUN chmod 0666 /etc/compute_rsyslog.conf
|
||||
RUN chmod 0666 /var/log/
|
||||
|
||||
|
||||
COPY --from=libcgroup-builder /libcgroup-install/bin/* /usr/bin/
|
||||
COPY --from=libcgroup-builder /libcgroup-install/lib/* /usr/lib/
|
||||
COPY --from=libcgroup-builder /libcgroup-install/sbin/* /usr/sbin/
|
||||
|
||||
@@ -592,6 +592,7 @@ pub(crate) async fn main() -> anyhow::Result<()> {
|
||||
utils::logging::init(
|
||||
utils::logging::LogFormat::Json,
|
||||
utils::logging::TracingErrorLayerEnablement::EnableWithRustLogFilter,
|
||||
utils::logging::OtelEnablement::Disabled,
|
||||
utils::logging::Output::Stdout,
|
||||
)?;
|
||||
|
||||
|
||||
@@ -37,7 +37,7 @@ use crate::logger::startup_context_from_env;
|
||||
use crate::lsn_lease::launch_lsn_lease_bg_task_for_static;
|
||||
use crate::monitor::launch_monitor;
|
||||
use crate::pg_helpers::*;
|
||||
use crate::rsyslog::configure_audit_rsyslog;
|
||||
use crate::rsyslog::configure_and_start_rsyslog;
|
||||
use crate::spec::*;
|
||||
use crate::swap::resize_swap;
|
||||
use crate::sync_sk::{check_if_synced, ping_safekeeper};
|
||||
@@ -297,6 +297,79 @@ struct StartVmMonitorResult {
|
||||
vm_monitor: Option<tokio::task::JoinHandle<Result<()>>>,
|
||||
}
|
||||
|
||||
pub(crate) fn construct_superuser_query(spec: &ComputeSpec) -> String {
|
||||
let roles = spec
|
||||
.cluster
|
||||
.roles
|
||||
.iter()
|
||||
.map(|r| escape_literal(&r.name))
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let dbs = spec
|
||||
.cluster
|
||||
.databases
|
||||
.iter()
|
||||
.map(|db| escape_literal(&db.name))
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let roles_decl = if roles.is_empty() {
|
||||
String::from("roles text[] := NULL;")
|
||||
} else {
|
||||
format!(
|
||||
r#"
|
||||
roles text[] := ARRAY(SELECT rolname
|
||||
FROM pg_catalog.pg_roles
|
||||
WHERE rolname IN ({}));"#,
|
||||
roles.join(", ")
|
||||
)
|
||||
};
|
||||
|
||||
let database_decl = if dbs.is_empty() {
|
||||
String::from("dbs text[] := NULL;")
|
||||
} else {
|
||||
format!(
|
||||
r#"
|
||||
dbs text[] := ARRAY(SELECT datname
|
||||
FROM pg_catalog.pg_database
|
||||
WHERE datname IN ({}));"#,
|
||||
dbs.join(", ")
|
||||
)
|
||||
};
|
||||
|
||||
// ALL PRIVILEGES grants CREATE, CONNECT, and TEMPORARY on all databases
|
||||
// (see https://www.postgresql.org/docs/current/ddl-priv.html)
|
||||
let query = format!(
|
||||
r#"
|
||||
DO $$
|
||||
DECLARE
|
||||
r text;
|
||||
{}
|
||||
{}
|
||||
BEGIN
|
||||
IF NOT EXISTS (
|
||||
SELECT FROM pg_catalog.pg_roles WHERE rolname = 'neon_superuser')
|
||||
THEN
|
||||
CREATE ROLE neon_superuser CREATEDB CREATEROLE NOLOGIN REPLICATION BYPASSRLS IN ROLE pg_read_all_data, pg_write_all_data;
|
||||
IF array_length(roles, 1) IS NOT NULL THEN
|
||||
EXECUTE format('GRANT neon_superuser TO %s',
|
||||
array_to_string(ARRAY(SELECT quote_ident(x) FROM unnest(roles) as x), ', '));
|
||||
FOREACH r IN ARRAY roles LOOP
|
||||
EXECUTE format('ALTER ROLE %s CREATEROLE CREATEDB', quote_ident(r));
|
||||
END LOOP;
|
||||
END IF;
|
||||
IF array_length(dbs, 1) IS NOT NULL THEN
|
||||
EXECUTE format('GRANT ALL PRIVILEGES ON DATABASE %s TO neon_superuser',
|
||||
array_to_string(ARRAY(SELECT quote_ident(x) FROM unnest(dbs) as x), ', '));
|
||||
END IF;
|
||||
END IF;
|
||||
END
|
||||
$$;"#,
|
||||
roles_decl, database_decl,
|
||||
);
|
||||
|
||||
query
|
||||
}
|
||||
|
||||
impl ComputeNode {
|
||||
pub fn new(
|
||||
params: ComputeNodeParams,
|
||||
@@ -616,7 +689,7 @@ impl ComputeNode {
|
||||
let log_directory_path = Path::new(&self.params.pgdata).join("log");
|
||||
// TODO: make this more robust
|
||||
// now rsyslog starts once and there is no monitoring or restart if it fails
|
||||
configure_audit_rsyslog(
|
||||
configure_and_start_rsyslog(
|
||||
log_directory_path.to_str().unwrap(),
|
||||
"hipaa",
|
||||
&remote_endpoint,
|
||||
|
||||
@@ -202,24 +202,8 @@ pub async fn download_extension(
|
||||
// move contents of the libdir / sharedir in unzipped archive to the correct local paths
|
||||
for paths in [sharedir_paths, libdir_paths] {
|
||||
let (zip_dir, real_dir) = paths;
|
||||
|
||||
let dir = match std::fs::read_dir(&zip_dir) {
|
||||
Ok(dir) => dir,
|
||||
Err(e) => match e.kind() {
|
||||
// In the event of a SQL-only extension, there would be nothing
|
||||
// to move from the lib/ directory, so note that in the log and
|
||||
// move on.
|
||||
std::io::ErrorKind::NotFound => {
|
||||
info!("nothing to move from {}", zip_dir);
|
||||
continue;
|
||||
}
|
||||
_ => return Err(anyhow::anyhow!(e)),
|
||||
},
|
||||
};
|
||||
|
||||
info!("mv {zip_dir:?}/* {real_dir:?}");
|
||||
|
||||
for file in dir {
|
||||
for file in std::fs::read_dir(zip_dir)? {
|
||||
let old_file = file?.path();
|
||||
let new_file =
|
||||
Path::new(&real_dir).join(old_file.file_name().context("error parsing file")?);
|
||||
|
||||
@@ -24,7 +24,8 @@ pub async fn init_tracing_and_logging(default_log_level: &str) -> anyhow::Result
|
||||
.with_writer(std::io::stderr);
|
||||
|
||||
// Initialize OpenTelemetry
|
||||
let otlp_layer = tracing_utils::init_tracing("compute_ctl").await;
|
||||
let otlp_layer =
|
||||
tracing_utils::init_tracing("compute_ctl", tracing_utils::ExportConfig::default()).await;
|
||||
|
||||
// Put it all together
|
||||
tracing_subscriber::registry()
|
||||
|
||||
@@ -186,40 +186,15 @@ impl DatabaseExt for Database {
|
||||
/// Postgres SQL queries and DATABASE_URL.
|
||||
pub trait Escaping {
|
||||
fn pg_quote(&self) -> String;
|
||||
fn pg_quote_dollar(&self) -> (String, String);
|
||||
}
|
||||
|
||||
impl Escaping for PgIdent {
|
||||
/// This is intended to mimic Postgres quote_ident(), but for simplicity it
|
||||
/// always quotes provided string with `""` and escapes every `"`.
|
||||
/// **Not idempotent**, i.e. if string is already escaped it will be escaped again.
|
||||
/// N.B. it's not useful for escaping identifiers that are used inside WHERE
|
||||
/// clause, use `escape_literal()` instead.
|
||||
fn pg_quote(&self) -> String {
|
||||
format!("\"{}\"", self.replace('"', "\"\""))
|
||||
}
|
||||
|
||||
/// This helper is intended to be used for dollar-escaping strings for usage
|
||||
/// inside PL/pgSQL procedures. In addition to dollar-escaping the string,
|
||||
/// it also returns a tag that is intended to be used inside the outer
|
||||
/// PL/pgSQL procedure. If you do not need an outer tag, just discard it.
|
||||
/// Here we somewhat mimic the logic of Postgres' `pg_get_functiondef()`,
|
||||
/// <https://github.com/postgres/postgres/blob/8b49392b270b4ac0b9f5c210e2a503546841e832/src/backend/utils/adt/ruleutils.c#L2924>
|
||||
fn pg_quote_dollar(&self) -> (String, String) {
|
||||
let mut tag: String = "".to_string();
|
||||
let mut outer_tag = "x".to_string();
|
||||
|
||||
// Find the first suitable tag that is not present in the string.
|
||||
// Postgres' max role/DB name length is 63 bytes, so even in the
|
||||
// worst case it won't take long.
|
||||
while self.contains(&format!("${tag}$")) || self.contains(&format!("${outer_tag}$")) {
|
||||
tag += "x";
|
||||
outer_tag = tag.clone() + "x";
|
||||
}
|
||||
|
||||
let escaped = format!("${tag}${self}${tag}$");
|
||||
|
||||
(escaped, outer_tag)
|
||||
let result = format!("\"{}\"", self.replace('"', "\"\""));
|
||||
result
|
||||
}
|
||||
}
|
||||
|
||||
@@ -251,13 +226,10 @@ pub async fn get_existing_dbs_async(
|
||||
// invalid state. See:
|
||||
// https://github.com/postgres/postgres/commit/a4b4cc1d60f7e8ccfcc8ff8cb80c28ee411ad9a9
|
||||
let rowstream = client
|
||||
// We use a subquery instead of a fancy `datdba::regrole::text AS owner`,
|
||||
// because the latter automatically wraps the result in double quotes,
|
||||
// if the role name contains special characters.
|
||||
.query_raw::<str, &String, &[String; 0]>(
|
||||
"SELECT
|
||||
datname AS name,
|
||||
(SELECT rolname FROM pg_roles WHERE oid = datdba) AS owner,
|
||||
datdba::regrole::text AS owner,
|
||||
NOT datallowconn AS restrict_conn,
|
||||
datconnlimit = - 2 AS invalid
|
||||
FROM
|
||||
|
||||
@@ -21,34 +21,40 @@ fn get_rsyslog_pid() -> Option<String> {
|
||||
}
|
||||
}
|
||||
|
||||
// Restart rsyslogd to apply the new configuration.
|
||||
// This is necessary, because there is no other way to reload the rsyslog configuration.
|
||||
//
|
||||
// Rsyslogd shouldn't lose any messages, because of the restart,
|
||||
// because it tracks the last read position in the log files
|
||||
// and will continue reading from that position.
|
||||
// TODO: test it properly
|
||||
//
|
||||
fn restart_rsyslog() -> Result<()> {
|
||||
let old_pid = get_rsyslog_pid().context("rsyslogd is not running")?;
|
||||
info!("rsyslogd is running with pid: {}, restart it", old_pid);
|
||||
// Start rsyslogd with the specified configuration file
|
||||
// If it is already running, do nothing.
|
||||
fn start_rsyslog(rsyslog_conf_path: &str) -> Result<()> {
|
||||
let pid = get_rsyslog_pid();
|
||||
if let Some(pid) = pid {
|
||||
info!("rsyslogd is already running with pid: {}", pid);
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// kill it to restart
|
||||
let _ = Command::new("pkill")
|
||||
.arg("rsyslogd")
|
||||
let _ = Command::new("/usr/sbin/rsyslogd")
|
||||
.arg("-f")
|
||||
.arg(rsyslog_conf_path)
|
||||
.arg("-i")
|
||||
.arg("/var/run/rsyslogd/rsyslogd.pid")
|
||||
.output()
|
||||
.context("Failed to stop rsyslogd")?;
|
||||
.context("Failed to start rsyslogd")?;
|
||||
|
||||
// Check that rsyslogd is running
|
||||
if let Some(pid) = get_rsyslog_pid() {
|
||||
info!("rsyslogd started successfully with pid: {}", pid);
|
||||
} else {
|
||||
return Err(anyhow::anyhow!("Failed to start rsyslogd"));
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn configure_audit_rsyslog(
|
||||
pub fn configure_and_start_rsyslog(
|
||||
log_directory: &str,
|
||||
tag: &str,
|
||||
remote_endpoint: &str,
|
||||
) -> Result<()> {
|
||||
let config_content: String = format!(
|
||||
include_str!("config_template/compute_audit_rsyslog_template.conf"),
|
||||
include_str!("config_template/compute_rsyslog_template.conf"),
|
||||
log_directory = log_directory,
|
||||
tag = tag,
|
||||
remote_endpoint = remote_endpoint
|
||||
@@ -56,7 +62,7 @@ pub fn configure_audit_rsyslog(
|
||||
|
||||
info!("rsyslog config_content: {}", config_content);
|
||||
|
||||
let rsyslog_conf_path = "/etc/rsyslog.d/compute_audit_rsyslog.conf";
|
||||
let rsyslog_conf_path = "/etc/compute_rsyslog.conf";
|
||||
let mut file = OpenOptions::new()
|
||||
.create(true)
|
||||
.write(true)
|
||||
@@ -65,13 +71,10 @@ pub fn configure_audit_rsyslog(
|
||||
|
||||
file.write_all(config_content.as_bytes())?;
|
||||
|
||||
info!(
|
||||
"rsyslog configuration file {} added successfully. Starting rsyslogd",
|
||||
rsyslog_conf_path
|
||||
);
|
||||
info!("rsyslog configuration added successfully. Starting rsyslogd");
|
||||
|
||||
// start the service, using the configuration
|
||||
restart_rsyslog()?;
|
||||
start_rsyslog(rsyslog_conf_path)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -13,17 +13,16 @@ use tokio_postgres::Client;
|
||||
use tokio_postgres::error::SqlState;
|
||||
use tracing::{Instrument, debug, error, info, info_span, instrument, warn};
|
||||
|
||||
use crate::compute::{ComputeNode, ComputeState};
|
||||
use crate::compute::{ComputeNode, ComputeState, construct_superuser_query};
|
||||
use crate::pg_helpers::{
|
||||
DatabaseExt, Escaping, GenericOptionsSearch, RoleExt, get_existing_dbs_async,
|
||||
DatabaseExt, Escaping, GenericOptionsSearch, RoleExt, escape_literal, get_existing_dbs_async,
|
||||
get_existing_roles_async,
|
||||
};
|
||||
use crate::spec_apply::ApplySpecPhase::{
|
||||
CreateAndAlterDatabases, CreateAndAlterRoles, CreateAvailabilityCheck, CreateNeonSuperuser,
|
||||
CreatePgauditExtension, CreatePgauditlogtofileExtension, CreateSchemaNeon,
|
||||
DisablePostgresDBPgAudit, DropInvalidDatabases, DropRoles, FinalizeDropLogicalSubscriptions,
|
||||
HandleNeonExtension, HandleOtherExtensions, RenameAndDeleteDatabases, RenameRoles,
|
||||
RunInEachDatabase,
|
||||
CreateAndAlterDatabases, CreateAndAlterRoles, CreateAvailabilityCheck, CreatePgauditExtension,
|
||||
CreatePgauditlogtofileExtension, CreateSchemaNeon, CreateSuperUser, DisablePostgresDBPgAudit,
|
||||
DropInvalidDatabases, DropRoles, FinalizeDropLogicalSubscriptions, HandleNeonExtension,
|
||||
HandleOtherExtensions, RenameAndDeleteDatabases, RenameRoles, RunInEachDatabase,
|
||||
};
|
||||
use crate::spec_apply::PerDatabasePhase::{
|
||||
ChangeSchemaPerms, DeleteDBRoleReferences, DropLogicalSubscriptions, HandleAnonExtension,
|
||||
@@ -188,7 +187,7 @@ impl ComputeNode {
|
||||
}
|
||||
|
||||
for phase in [
|
||||
CreateNeonSuperuser,
|
||||
CreateSuperUser,
|
||||
DropInvalidDatabases,
|
||||
RenameRoles,
|
||||
CreateAndAlterRoles,
|
||||
@@ -469,7 +468,7 @@ pub enum PerDatabasePhase {
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub enum ApplySpecPhase {
|
||||
CreateNeonSuperuser,
|
||||
CreateSuperUser,
|
||||
DropInvalidDatabases,
|
||||
RenameRoles,
|
||||
CreateAndAlterRoles,
|
||||
@@ -596,10 +595,14 @@ async fn get_operations<'a>(
|
||||
apply_spec_phase: &'a ApplySpecPhase,
|
||||
) -> Result<Box<dyn Iterator<Item = Operation> + 'a + Send>> {
|
||||
match apply_spec_phase {
|
||||
ApplySpecPhase::CreateNeonSuperuser => Ok(Box::new(once(Operation {
|
||||
query: include_str!("sql/create_neon_superuser.sql").to_string(),
|
||||
comment: None,
|
||||
}))),
|
||||
ApplySpecPhase::CreateSuperUser => {
|
||||
let query = construct_superuser_query(spec);
|
||||
|
||||
Ok(Box::new(once(Operation {
|
||||
query,
|
||||
comment: None,
|
||||
})))
|
||||
}
|
||||
ApplySpecPhase::DropInvalidDatabases => {
|
||||
let mut ctx = ctx.write().await;
|
||||
let databases = &mut ctx.dbs;
|
||||
@@ -733,15 +736,14 @@ async fn get_operations<'a>(
|
||||
// We do not check whether the DB exists or not,
|
||||
// Postgres will take care of it for us
|
||||
"delete_db" => {
|
||||
let (db_name, outer_tag) = op.name.pg_quote_dollar();
|
||||
// In Postgres we can't drop a database if it is a template.
|
||||
// So we need to unset the template flag first, but it could
|
||||
// be a retry, so we could've already dropped the database.
|
||||
// Check that database exists first to make it idempotent.
|
||||
let unset_template_query: String = format!(
|
||||
include_str!("sql/unset_template_for_drop_dbs.sql"),
|
||||
datname = db_name,
|
||||
outer_tag = outer_tag,
|
||||
datname_str = escape_literal(&op.name),
|
||||
datname = &op.name.pg_quote()
|
||||
);
|
||||
|
||||
// Use FORCE to drop database even if there are active connections.
|
||||
@@ -848,8 +850,6 @@ async fn get_operations<'a>(
|
||||
comment: None,
|
||||
},
|
||||
Operation {
|
||||
// ALL PRIVILEGES grants CREATE, CONNECT, and TEMPORARY on the database
|
||||
// (see https://www.postgresql.org/docs/current/ddl-priv.html)
|
||||
query: format!(
|
||||
"GRANT ALL PRIVILEGES ON DATABASE {} TO neon_superuser",
|
||||
db.name.pg_quote()
|
||||
@@ -909,11 +909,9 @@ async fn get_operations<'a>(
|
||||
PerDatabasePhase::DropLogicalSubscriptions => {
|
||||
match &db {
|
||||
DB::UserDB(db) => {
|
||||
let (db_name, outer_tag) = db.name.pg_quote_dollar();
|
||||
let drop_subscription_query: String = format!(
|
||||
include_str!("sql/drop_subscriptions.sql"),
|
||||
datname_str = db_name,
|
||||
outer_tag = outer_tag,
|
||||
datname_str = escape_literal(&db.name),
|
||||
);
|
||||
|
||||
let operations = vec![Operation {
|
||||
@@ -952,7 +950,6 @@ async fn get_operations<'a>(
|
||||
DB::SystemDB => PgIdent::from("cloud_admin").pg_quote(),
|
||||
DB::UserDB(db) => db.owner.pg_quote(),
|
||||
};
|
||||
let (escaped_role, outer_tag) = op.name.pg_quote_dollar();
|
||||
|
||||
Some(vec![
|
||||
// This will reassign all dependent objects to the db owner
|
||||
@@ -967,9 +964,7 @@ async fn get_operations<'a>(
|
||||
Operation {
|
||||
query: format!(
|
||||
include_str!("sql/pre_drop_role_revoke_privileges.sql"),
|
||||
// N.B. this has to be properly dollar-escaped with `pg_quote_dollar()`
|
||||
role_name = escaped_role,
|
||||
outer_tag = outer_tag,
|
||||
role_name = quoted,
|
||||
),
|
||||
comment: None,
|
||||
},
|
||||
@@ -994,14 +989,12 @@ async fn get_operations<'a>(
|
||||
DB::SystemDB => return Ok(Box::new(empty())),
|
||||
DB::UserDB(db) => db,
|
||||
};
|
||||
let (db_owner, outer_tag) = db.owner.pg_quote_dollar();
|
||||
|
||||
let operations = vec![
|
||||
Operation {
|
||||
query: format!(
|
||||
include_str!("sql/set_public_schema_owner.sql"),
|
||||
db_owner = db_owner,
|
||||
outer_tag = outer_tag,
|
||||
db_owner = db.owner.pg_quote()
|
||||
),
|
||||
comment: None,
|
||||
},
|
||||
|
||||
@@ -1,8 +0,0 @@
|
||||
DO $$
|
||||
BEGIN
|
||||
IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = 'neon_superuser')
|
||||
THEN
|
||||
CREATE ROLE neon_superuser CREATEDB CREATEROLE NOLOGIN REPLICATION BYPASSRLS IN ROLE pg_read_all_data, pg_write_all_data;
|
||||
END IF;
|
||||
END
|
||||
$$;
|
||||
@@ -1,4 +1,4 @@
|
||||
DO ${outer_tag}$
|
||||
DO $$
|
||||
DECLARE
|
||||
subname TEXT;
|
||||
BEGIN
|
||||
@@ -9,4 +9,4 @@ BEGIN
|
||||
EXECUTE format('DROP SUBSCRIPTION %I;', subname);
|
||||
END LOOP;
|
||||
END;
|
||||
${outer_tag}$;
|
||||
$$;
|
||||
|
||||
@@ -1,7 +1,8 @@
|
||||
DO ${outer_tag}$
|
||||
SET SESSION ROLE neon_superuser;
|
||||
|
||||
DO $$
|
||||
DECLARE
|
||||
schema TEXT;
|
||||
grantor TEXT;
|
||||
revoke_query TEXT;
|
||||
BEGIN
|
||||
FOR schema IN
|
||||
@@ -14,25 +15,14 @@ BEGIN
|
||||
-- ii) it's easy to add more schemas to the list if needed.
|
||||
WHERE schema_name IN ('public')
|
||||
LOOP
|
||||
FOR grantor IN EXECUTE
|
||||
format(
|
||||
'SELECT DISTINCT rtg.grantor FROM information_schema.role_table_grants AS rtg WHERE grantee = %s',
|
||||
-- N.B. this has to be properly dollar-escaped with `pg_quote_dollar()`
|
||||
quote_literal({role_name})
|
||||
)
|
||||
LOOP
|
||||
EXECUTE format('SET LOCAL ROLE %I', grantor);
|
||||
revoke_query := format(
|
||||
'REVOKE ALL PRIVILEGES ON ALL TABLES IN SCHEMA %I FROM {role_name} GRANTED BY neon_superuser;',
|
||||
schema
|
||||
);
|
||||
|
||||
revoke_query := format(
|
||||
'REVOKE ALL PRIVILEGES ON ALL TABLES IN SCHEMA %I FROM %I GRANTED BY %I',
|
||||
schema,
|
||||
-- N.B. this has to be properly dollar-escaped with `pg_quote_dollar()`
|
||||
{role_name},
|
||||
grantor
|
||||
);
|
||||
|
||||
EXECUTE revoke_query;
|
||||
END LOOP;
|
||||
EXECUTE revoke_query;
|
||||
END LOOP;
|
||||
END;
|
||||
${outer_tag}$;
|
||||
$$;
|
||||
|
||||
RESET ROLE;
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
DO ${outer_tag}$
|
||||
DO
|
||||
$$
|
||||
DECLARE
|
||||
schema_owner TEXT;
|
||||
BEGIN
|
||||
@@ -15,8 +16,8 @@ DO ${outer_tag}$
|
||||
|
||||
IF schema_owner = 'cloud_admin' OR schema_owner = 'zenith_admin'
|
||||
THEN
|
||||
EXECUTE format('ALTER SCHEMA public OWNER TO %I', {db_owner});
|
||||
ALTER SCHEMA public OWNER TO {db_owner};
|
||||
END IF;
|
||||
END IF;
|
||||
END
|
||||
${outer_tag}$;
|
||||
$$;
|
||||
@@ -1,12 +1,12 @@
|
||||
DO ${outer_tag}$
|
||||
DO $$
|
||||
BEGIN
|
||||
IF EXISTS(
|
||||
SELECT 1
|
||||
FROM pg_catalog.pg_database
|
||||
WHERE datname = {datname}
|
||||
WHERE datname = {datname_str}
|
||||
)
|
||||
THEN
|
||||
EXECUTE format('ALTER DATABASE %I is_template false', {datname});
|
||||
ALTER DATABASE {datname} is_template false;
|
||||
END IF;
|
||||
END
|
||||
${outer_tag}$;
|
||||
$$;
|
||||
@@ -61,23 +61,6 @@ test.escaping = 'here''s a backslash \\ and a quote '' and a double-quote " hoor
|
||||
assert_eq!(ident.pg_quote(), "\"\"\"name\"\";\\n select 1;\"");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn ident_pg_quote_dollar() {
|
||||
let test_cases = vec![
|
||||
("name", ("$$name$$", "x")),
|
||||
("name$$", ("$x$name$$$x$", "xx")),
|
||||
("name$$$", ("$x$name$$$$x$", "xx")),
|
||||
("name$$$$", ("$x$name$$$$$x$", "xx")),
|
||||
("name$x$", ("$xx$name$x$$xx$", "xxx")),
|
||||
];
|
||||
|
||||
for (input, expected) in test_cases {
|
||||
let (escaped, tag) = PgIdent::from(input).pg_quote_dollar();
|
||||
assert_eq!(escaped, expected.0);
|
||||
assert_eq!(tag, expected.1);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn generic_options_search() {
|
||||
let generic_options: GenericOptions = Some(vec![
|
||||
|
||||
@@ -963,7 +963,6 @@ fn handle_init(args: &InitCmdArgs) -> anyhow::Result<LocalEnv> {
|
||||
id: pageserver_id,
|
||||
listen_pg_addr: format!("127.0.0.1:{pg_port}"),
|
||||
listen_http_addr: format!("127.0.0.1:{http_port}"),
|
||||
listen_https_addr: None,
|
||||
pg_auth_type: AuthType::Trust,
|
||||
http_auth_type: AuthType::Trust,
|
||||
other: Default::default(),
|
||||
@@ -978,7 +977,6 @@ fn handle_init(args: &InitCmdArgs) -> anyhow::Result<LocalEnv> {
|
||||
default_tenant_id: TenantId::from_array(std::array::from_fn(|_| 0)),
|
||||
storage_controller: None,
|
||||
control_plane_compute_hook_api: None,
|
||||
generate_local_ssl_certs: false,
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -81,10 +81,6 @@ pub struct LocalEnv {
|
||||
// but deserialization into a generic toml object as `toml::Value::try_from` fails with an error.
|
||||
// https://toml.io/en/v1.0.0 does not contain a concept of "a table inside another table".
|
||||
pub branch_name_mappings: HashMap<String, Vec<(TenantId, TimelineId)>>,
|
||||
|
||||
/// Flag to generate SSL certificates for components that need it.
|
||||
/// Also generates root CA certificate that is used to sign all other certificates.
|
||||
pub generate_local_ssl_certs: bool,
|
||||
}
|
||||
|
||||
/// On-disk state stored in `.neon/config`.
|
||||
@@ -106,10 +102,6 @@ pub struct OnDiskConfig {
|
||||
pub control_plane_api: Option<Url>,
|
||||
pub control_plane_compute_hook_api: Option<Url>,
|
||||
branch_name_mappings: HashMap<String, Vec<(TenantId, TimelineId)>>,
|
||||
// Note: skip serializing because in compat tests old storage controller fails
|
||||
// to load new config file. May be removed after this field is in release branch.
|
||||
#[serde(skip_serializing_if = "std::ops::Not::not")]
|
||||
pub generate_local_ssl_certs: bool,
|
||||
}
|
||||
|
||||
fn fail_if_pageservers_field_specified<'de, D>(_: D) -> Result<Vec<PageServerConf>, D::Error>
|
||||
@@ -137,7 +129,6 @@ pub struct NeonLocalInitConf {
|
||||
pub safekeepers: Vec<SafekeeperConf>,
|
||||
pub control_plane_api: Option<Url>,
|
||||
pub control_plane_compute_hook_api: Option<Option<Url>>,
|
||||
pub generate_local_ssl_certs: bool,
|
||||
}
|
||||
|
||||
/// Broker config for cluster internal communication.
|
||||
@@ -174,9 +165,6 @@ pub struct NeonStorageControllerConf {
|
||||
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub long_reconcile_threshold: Option<Duration>,
|
||||
|
||||
#[serde(default)]
|
||||
pub use_https_pageserver_api: bool,
|
||||
}
|
||||
|
||||
impl NeonStorageControllerConf {
|
||||
@@ -200,7 +188,6 @@ impl Default for NeonStorageControllerConf {
|
||||
max_secondary_lag_bytes: None,
|
||||
heartbeat_interval: Self::DEFAULT_HEARTBEAT_INTERVAL,
|
||||
long_reconcile_threshold: None,
|
||||
use_https_pageserver_api: false,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -230,7 +217,6 @@ pub struct PageServerConf {
|
||||
pub id: NodeId,
|
||||
pub listen_pg_addr: String,
|
||||
pub listen_http_addr: String,
|
||||
pub listen_https_addr: Option<String>,
|
||||
pub pg_auth_type: AuthType,
|
||||
pub http_auth_type: AuthType,
|
||||
pub no_sync: bool,
|
||||
@@ -242,7 +228,6 @@ impl Default for PageServerConf {
|
||||
id: NodeId(0),
|
||||
listen_pg_addr: String::new(),
|
||||
listen_http_addr: String::new(),
|
||||
listen_https_addr: None,
|
||||
pg_auth_type: AuthType::Trust,
|
||||
http_auth_type: AuthType::Trust,
|
||||
no_sync: false,
|
||||
@@ -258,7 +243,6 @@ pub struct NeonLocalInitPageserverConf {
|
||||
pub id: NodeId,
|
||||
pub listen_pg_addr: String,
|
||||
pub listen_http_addr: String,
|
||||
pub listen_https_addr: Option<String>,
|
||||
pub pg_auth_type: AuthType,
|
||||
pub http_auth_type: AuthType,
|
||||
#[serde(default, skip_serializing_if = "std::ops::Not::not")]
|
||||
@@ -273,7 +257,6 @@ impl From<&NeonLocalInitPageserverConf> for PageServerConf {
|
||||
id,
|
||||
listen_pg_addr,
|
||||
listen_http_addr,
|
||||
listen_https_addr,
|
||||
pg_auth_type,
|
||||
http_auth_type,
|
||||
no_sync,
|
||||
@@ -283,7 +266,6 @@ impl From<&NeonLocalInitPageserverConf> for PageServerConf {
|
||||
id: *id,
|
||||
listen_pg_addr: listen_pg_addr.clone(),
|
||||
listen_http_addr: listen_http_addr.clone(),
|
||||
listen_https_addr: listen_https_addr.clone(),
|
||||
pg_auth_type: *pg_auth_type,
|
||||
http_auth_type: *http_auth_type,
|
||||
no_sync: *no_sync,
|
||||
@@ -428,41 +410,6 @@ impl LocalEnv {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn ssl_ca_cert_path(&self) -> Option<PathBuf> {
|
||||
if self.generate_local_ssl_certs {
|
||||
Some(self.base_data_dir.join("rootCA.crt"))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
pub fn ssl_ca_key_path(&self) -> Option<PathBuf> {
|
||||
if self.generate_local_ssl_certs {
|
||||
Some(self.base_data_dir.join("rootCA.key"))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
pub fn generate_ssl_ca_cert(&self) -> anyhow::Result<()> {
|
||||
let cert_path = self.ssl_ca_cert_path().unwrap();
|
||||
let key_path = self.ssl_ca_key_path().unwrap();
|
||||
if !fs::exists(cert_path.as_path())? {
|
||||
generate_ssl_ca_cert(cert_path.as_path(), key_path.as_path())?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn generate_ssl_cert(&self, cert_path: &Path, key_path: &Path) -> anyhow::Result<()> {
|
||||
self.generate_ssl_ca_cert()?;
|
||||
generate_ssl_cert(
|
||||
cert_path,
|
||||
key_path,
|
||||
self.ssl_ca_cert_path().unwrap().as_path(),
|
||||
self.ssl_ca_key_path().unwrap().as_path(),
|
||||
)
|
||||
}
|
||||
|
||||
/// Inspect the base data directory and extract the instance id and instance directory path
|
||||
/// for all storage controller instances
|
||||
pub async fn storage_controller_instances(&self) -> std::io::Result<Vec<(u8, PathBuf)>> {
|
||||
@@ -572,7 +519,6 @@ impl LocalEnv {
|
||||
control_plane_api,
|
||||
control_plane_compute_hook_api,
|
||||
branch_name_mappings,
|
||||
generate_local_ssl_certs,
|
||||
} = on_disk_config;
|
||||
LocalEnv {
|
||||
base_data_dir: repopath.to_owned(),
|
||||
@@ -587,7 +533,6 @@ impl LocalEnv {
|
||||
control_plane_api: control_plane_api.unwrap(),
|
||||
control_plane_compute_hook_api,
|
||||
branch_name_mappings,
|
||||
generate_local_ssl_certs,
|
||||
}
|
||||
};
|
||||
|
||||
@@ -623,7 +568,6 @@ impl LocalEnv {
|
||||
struct PageserverConfigTomlSubset {
|
||||
listen_pg_addr: String,
|
||||
listen_http_addr: String,
|
||||
listen_https_addr: Option<String>,
|
||||
pg_auth_type: AuthType,
|
||||
http_auth_type: AuthType,
|
||||
#[serde(default)]
|
||||
@@ -648,7 +592,6 @@ impl LocalEnv {
|
||||
let PageserverConfigTomlSubset {
|
||||
listen_pg_addr,
|
||||
listen_http_addr,
|
||||
listen_https_addr,
|
||||
pg_auth_type,
|
||||
http_auth_type,
|
||||
no_sync,
|
||||
@@ -666,7 +609,6 @@ impl LocalEnv {
|
||||
},
|
||||
listen_pg_addr,
|
||||
listen_http_addr,
|
||||
listen_https_addr,
|
||||
pg_auth_type,
|
||||
http_auth_type,
|
||||
no_sync,
|
||||
@@ -694,7 +636,6 @@ impl LocalEnv {
|
||||
control_plane_api: Some(self.control_plane_api.clone()),
|
||||
control_plane_compute_hook_api: self.control_plane_compute_hook_api.clone(),
|
||||
branch_name_mappings: self.branch_name_mappings.clone(),
|
||||
generate_local_ssl_certs: self.generate_local_ssl_certs,
|
||||
},
|
||||
)
|
||||
}
|
||||
@@ -777,7 +718,6 @@ impl LocalEnv {
|
||||
safekeepers,
|
||||
control_plane_api,
|
||||
control_plane_compute_hook_api,
|
||||
generate_local_ssl_certs,
|
||||
} = conf;
|
||||
|
||||
// Find postgres binaries.
|
||||
@@ -826,13 +766,8 @@ impl LocalEnv {
|
||||
control_plane_api: control_plane_api.unwrap(),
|
||||
control_plane_compute_hook_api: control_plane_compute_hook_api.unwrap_or_default(),
|
||||
branch_name_mappings: Default::default(),
|
||||
generate_local_ssl_certs,
|
||||
};
|
||||
|
||||
if generate_local_ssl_certs {
|
||||
env.generate_ssl_ca_cert()?;
|
||||
}
|
||||
|
||||
// create endpoints dir
|
||||
fs::create_dir_all(env.endpoints_path())?;
|
||||
|
||||
@@ -916,80 +851,3 @@ fn generate_auth_keys(private_key_path: &Path, public_key_path: &Path) -> anyhow
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn generate_ssl_ca_cert(cert_path: &Path, key_path: &Path) -> anyhow::Result<()> {
|
||||
// openssl req -x509 -newkey rsa:2048 -nodes -subj "/CN=Neon Local CA" -days 36500 \
|
||||
// -out rootCA.crt -keyout rootCA.key
|
||||
let keygen_output = Command::new("openssl")
|
||||
.args([
|
||||
"req", "-x509", "-newkey", "rsa:2048", "-nodes", "-days", "36500",
|
||||
])
|
||||
.args(["-subj", "/CN=Neon Local CA"])
|
||||
.args(["-out", cert_path.to_str().unwrap()])
|
||||
.args(["-keyout", key_path.to_str().unwrap()])
|
||||
.output()
|
||||
.context("failed to generate CA certificate")?;
|
||||
if !keygen_output.status.success() {
|
||||
bail!(
|
||||
"openssl failed: '{}'",
|
||||
String::from_utf8_lossy(&keygen_output.stderr)
|
||||
);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn generate_ssl_cert(
|
||||
cert_path: &Path,
|
||||
key_path: &Path,
|
||||
ca_cert_path: &Path,
|
||||
ca_key_path: &Path,
|
||||
) -> anyhow::Result<()> {
|
||||
// Generate Certificate Signing Request (CSR).
|
||||
let mut csr_path = cert_path.to_path_buf();
|
||||
csr_path.set_extension(".csr");
|
||||
|
||||
// openssl req -new -nodes -newkey rsa:2048 -keyout server.key -out server.csr \
|
||||
// -subj "/CN=localhost" -addext "subjectAltName=DNS:localhost,IP:127.0.0.1"
|
||||
let keygen_output = Command::new("openssl")
|
||||
.args(["req", "-new", "-nodes"])
|
||||
.args(["-newkey", "rsa:2048"])
|
||||
.args(["-subj", "/CN=localhost"])
|
||||
.args(["-addext", "subjectAltName=DNS:localhost,IP:127.0.0.1"])
|
||||
.args(["-keyout", key_path.to_str().unwrap()])
|
||||
.args(["-out", csr_path.to_str().unwrap()])
|
||||
.output()
|
||||
.context("failed to generate CSR")?;
|
||||
if !keygen_output.status.success() {
|
||||
bail!(
|
||||
"openssl failed: '{}'",
|
||||
String::from_utf8_lossy(&keygen_output.stderr)
|
||||
);
|
||||
}
|
||||
|
||||
// Sign CSR with CA key.
|
||||
//
|
||||
// openssl x509 -req -in server.csr -CA rootCA.crt -CAkey rootCA.key -CAcreateserial \
|
||||
// -out server.crt -days 36500 -copy_extensions copyall
|
||||
let keygen_output = Command::new("openssl")
|
||||
.args(["x509", "-req"])
|
||||
.args(["-in", csr_path.to_str().unwrap()])
|
||||
.args(["-CA", ca_cert_path.to_str().unwrap()])
|
||||
.args(["-CAkey", ca_key_path.to_str().unwrap()])
|
||||
.arg("-CAcreateserial")
|
||||
.args(["-out", cert_path.to_str().unwrap()])
|
||||
.args(["-days", "36500"])
|
||||
.args(["-copy_extensions", "copyall"])
|
||||
.output()
|
||||
.context("failed to sign CSR")?;
|
||||
if !keygen_output.status.success() {
|
||||
bail!(
|
||||
"openssl failed: '{}'",
|
||||
String::from_utf8_lossy(&keygen_output.stderr)
|
||||
);
|
||||
}
|
||||
|
||||
// Remove CSR file as it's not needed anymore.
|
||||
fs::remove_file(csr_path)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -21,7 +21,6 @@ use pageserver_api::shard::TenantShardId;
|
||||
use pageserver_client::mgmt_api;
|
||||
use postgres_backend::AuthType;
|
||||
use postgres_connection::{PgConnectionConfig, parse_host_port};
|
||||
use reqwest::Certificate;
|
||||
use utils::auth::{Claims, Scope};
|
||||
use utils::id::{NodeId, TenantId, TimelineId};
|
||||
use utils::lsn::Lsn;
|
||||
@@ -50,29 +49,12 @@ impl PageServerNode {
|
||||
let (host, port) =
|
||||
parse_host_port(&conf.listen_pg_addr).expect("Unable to parse listen_pg_addr");
|
||||
let port = port.unwrap_or(5432);
|
||||
|
||||
let ssl_ca_cert = env.ssl_ca_cert_path().map(|ssl_ca_file| {
|
||||
let buf = std::fs::read(ssl_ca_file).expect("SSL root CA file should exist");
|
||||
Certificate::from_pem(&buf).expect("CA certificate should be valid")
|
||||
});
|
||||
|
||||
let endpoint = if env.storage_controller.use_https_pageserver_api {
|
||||
format!(
|
||||
"https://{}",
|
||||
conf.listen_https_addr.as_ref().expect(
|
||||
"listen https address should be specified if use_https_pageserver_api is on"
|
||||
)
|
||||
)
|
||||
} else {
|
||||
format!("http://{}", conf.listen_http_addr)
|
||||
};
|
||||
|
||||
Self {
|
||||
pg_connection_config: PgConnectionConfig::new_host_port(host, port),
|
||||
conf: conf.clone(),
|
||||
env: env.clone(),
|
||||
http_client: mgmt_api::Client::new(
|
||||
endpoint,
|
||||
format!("http://{}", conf.listen_http_addr),
|
||||
{
|
||||
match conf.http_auth_type {
|
||||
AuthType::Trust => None,
|
||||
@@ -83,9 +65,7 @@ impl PageServerNode {
|
||||
}
|
||||
}
|
||||
.as_deref(),
|
||||
ssl_ca_cert,
|
||||
)
|
||||
.expect("Client constructs with no errors"),
|
||||
),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -240,13 +220,6 @@ impl PageServerNode {
|
||||
.context("write identity toml")?;
|
||||
drop(identity_toml);
|
||||
|
||||
if self.env.generate_local_ssl_certs {
|
||||
self.env.generate_ssl_cert(
|
||||
datadir.join("server.crt").as_path(),
|
||||
datadir.join("server.key").as_path(),
|
||||
)?;
|
||||
}
|
||||
|
||||
// TODO: invoke a TBD config-check command to validate that pageserver will start with the written config
|
||||
|
||||
// Write metadata file, used by pageserver on startup to register itself with
|
||||
@@ -257,15 +230,6 @@ impl PageServerNode {
|
||||
parse_host_port(&self.conf.listen_http_addr).expect("Unable to parse listen_http_addr");
|
||||
let http_port = http_port.unwrap_or(9898);
|
||||
|
||||
let https_port = match self.conf.listen_https_addr.as_ref() {
|
||||
Some(https_addr) => {
|
||||
let (_https_host, https_port) =
|
||||
parse_host_port(https_addr).expect("Unable to parse listen_https_addr");
|
||||
Some(https_port.unwrap_or(9899))
|
||||
}
|
||||
None => None,
|
||||
};
|
||||
|
||||
// Intentionally hand-craft JSON: this acts as an implicit format compat test
|
||||
// in case the pageserver-side structure is edited, and reflects the real life
|
||||
// situation: the metadata is written by some other script.
|
||||
@@ -276,7 +240,6 @@ impl PageServerNode {
|
||||
postgres_port: self.pg_connection_config.port(),
|
||||
http_host: "localhost".to_string(),
|
||||
http_port,
|
||||
https_port,
|
||||
other: HashMap::from([(
|
||||
"availability_zone_id".to_string(),
|
||||
serde_json::json!(az_id),
|
||||
|
||||
@@ -12,10 +12,13 @@ use hyper0::Uri;
|
||||
use nix::unistd::Pid;
|
||||
use pageserver_api::controller_api::{
|
||||
NodeConfigureRequest, NodeDescribeResponse, NodeRegisterRequest, TenantCreateRequest,
|
||||
TenantCreateResponse, TenantLocateResponse,
|
||||
TenantCreateResponse, TenantLocateResponse, TenantShardMigrateRequest,
|
||||
TenantShardMigrateResponse,
|
||||
};
|
||||
use pageserver_api::models::{TimelineCreateRequest, TimelineInfo};
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use pageserver_api::models::{
|
||||
TenantShardSplitRequest, TenantShardSplitResponse, TimelineCreateRequest, TimelineInfo,
|
||||
};
|
||||
use pageserver_api::shard::{ShardStripeSize, TenantShardId};
|
||||
use pageserver_client::mgmt_api::ResponseErrorMessageExt;
|
||||
use postgres_backend::AuthType;
|
||||
use reqwest::Method;
|
||||
@@ -534,14 +537,6 @@ impl StorageController {
|
||||
args.push("--start-as-candidate".to_string());
|
||||
}
|
||||
|
||||
if self.config.use_https_pageserver_api {
|
||||
args.push("--use-https-pageserver-api".to_string());
|
||||
}
|
||||
|
||||
if let Some(ssl_ca_file) = self.env.ssl_ca_cert_path() {
|
||||
args.push(format!("--ssl-ca-file={}", ssl_ca_file.to_str().unwrap()));
|
||||
}
|
||||
|
||||
if let Some(private_key) = &self.private_key {
|
||||
let claims = Claims::new(None, Scope::PageServerApi);
|
||||
let jwt_token =
|
||||
@@ -830,6 +825,41 @@ impl StorageController {
|
||||
.await
|
||||
}
|
||||
|
||||
#[instrument(skip(self))]
|
||||
pub async fn tenant_migrate(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
node_id: NodeId,
|
||||
) -> anyhow::Result<TenantShardMigrateResponse> {
|
||||
self.dispatch(
|
||||
Method::PUT,
|
||||
format!("control/v1/tenant/{tenant_shard_id}/migrate"),
|
||||
Some(TenantShardMigrateRequest {
|
||||
node_id,
|
||||
migration_config: None,
|
||||
}),
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
#[instrument(skip(self), fields(%tenant_id, %new_shard_count))]
|
||||
pub async fn tenant_split(
|
||||
&self,
|
||||
tenant_id: TenantId,
|
||||
new_shard_count: u8,
|
||||
new_stripe_size: Option<ShardStripeSize>,
|
||||
) -> anyhow::Result<TenantShardSplitResponse> {
|
||||
self.dispatch(
|
||||
Method::PUT,
|
||||
format!("control/v1/tenant/{tenant_id}/shard_split"),
|
||||
Some(TenantShardSplitRequest {
|
||||
new_shard_count,
|
||||
new_stripe_size,
|
||||
}),
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
#[instrument(skip_all, fields(node_id=%req.node_id))]
|
||||
pub async fn node_register(&self, req: NodeRegisterRequest) -> anyhow::Result<()> {
|
||||
self.dispatch::<_, ()>(Method::POST, "control/v1/node".to_string(), Some(req))
|
||||
|
||||
@@ -1,17 +1,16 @@
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::path::PathBuf;
|
||||
use std::str::FromStr;
|
||||
use std::time::Duration;
|
||||
|
||||
use clap::{Parser, Subcommand};
|
||||
use futures::StreamExt;
|
||||
use pageserver_api::controller_api::{
|
||||
AvailabilityZone, MigrationConfig, NodeAvailabilityWrapper, NodeConfigureRequest,
|
||||
NodeDescribeResponse, NodeRegisterRequest, NodeSchedulingPolicy, NodeShardResponse,
|
||||
PlacementPolicy, SafekeeperDescribeResponse, SafekeeperSchedulingPolicyRequest,
|
||||
ShardSchedulingPolicy, ShardsPreferredAzsRequest, ShardsPreferredAzsResponse,
|
||||
SkSchedulingPolicy, TenantCreateRequest, TenantDescribeResponse, TenantPolicyRequest,
|
||||
TenantShardMigrateRequest, TenantShardMigrateResponse,
|
||||
AvailabilityZone, NodeAvailabilityWrapper, NodeConfigureRequest, NodeDescribeResponse,
|
||||
NodeRegisterRequest, NodeSchedulingPolicy, NodeShardResponse, PlacementPolicy,
|
||||
SafekeeperDescribeResponse, SafekeeperSchedulingPolicyRequest, ShardSchedulingPolicy,
|
||||
ShardsPreferredAzsRequest, ShardsPreferredAzsResponse, SkSchedulingPolicy, TenantCreateRequest,
|
||||
TenantDescribeResponse, TenantPolicyRequest, TenantShardMigrateRequest,
|
||||
TenantShardMigrateResponse,
|
||||
};
|
||||
use pageserver_api::models::{
|
||||
EvictionPolicy, EvictionPolicyLayerAccessThreshold, LocationConfigSecondary, ShardParameters,
|
||||
@@ -113,15 +112,6 @@ enum Command {
|
||||
tenant_shard_id: TenantShardId,
|
||||
#[arg(long)]
|
||||
node: NodeId,
|
||||
#[arg(long, default_value_t = true, action = clap::ArgAction::Set)]
|
||||
prewarm: bool,
|
||||
#[arg(long, default_value_t = false, action = clap::ArgAction::Set)]
|
||||
override_scheduler: bool,
|
||||
},
|
||||
/// Watch the location of a tenant shard evolve, e.g. while expecting it to migrate
|
||||
TenantShardWatch {
|
||||
#[arg(long)]
|
||||
tenant_shard_id: TenantShardId,
|
||||
},
|
||||
/// Migrate the secondary location for a tenant shard to a specific pageserver.
|
||||
TenantShardMigrateSecondary {
|
||||
@@ -279,10 +269,6 @@ struct Cli {
|
||||
/// a token with both scopes to use with this tool.
|
||||
jwt: Option<String>,
|
||||
|
||||
#[arg(long)]
|
||||
/// Trusted root CA certificate to use in https APIs.
|
||||
ssl_ca_file: Option<PathBuf>,
|
||||
|
||||
#[command(subcommand)]
|
||||
command: Command,
|
||||
}
|
||||
@@ -393,17 +379,9 @@ async fn main() -> anyhow::Result<()> {
|
||||
|
||||
let storcon_client = Client::new(cli.api.clone(), cli.jwt.clone());
|
||||
|
||||
let ssl_ca_cert = match &cli.ssl_ca_file {
|
||||
Some(ssl_ca_file) => {
|
||||
let buf = tokio::fs::read(ssl_ca_file).await?;
|
||||
Some(reqwest::Certificate::from_pem(&buf)?)
|
||||
}
|
||||
None => None,
|
||||
};
|
||||
|
||||
let mut trimmed = cli.api.to_string();
|
||||
trimmed.pop();
|
||||
let vps_client = mgmt_api::Client::new(trimmed, cli.jwt.as_deref(), ssl_ca_cert)?;
|
||||
let vps_client = mgmt_api::Client::new(trimmed, cli.jwt.as_deref());
|
||||
|
||||
match cli.command {
|
||||
Command::NodeRegister {
|
||||
@@ -641,43 +619,19 @@ async fn main() -> anyhow::Result<()> {
|
||||
Command::TenantShardMigrate {
|
||||
tenant_shard_id,
|
||||
node,
|
||||
prewarm,
|
||||
override_scheduler,
|
||||
} => {
|
||||
let migration_config = MigrationConfig {
|
||||
prewarm,
|
||||
override_scheduler,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let req = TenantShardMigrateRequest {
|
||||
node_id: node,
|
||||
origin_node_id: None,
|
||||
migration_config,
|
||||
migration_config: None,
|
||||
};
|
||||
|
||||
match storcon_client
|
||||
storcon_client
|
||||
.dispatch::<TenantShardMigrateRequest, TenantShardMigrateResponse>(
|
||||
Method::PUT,
|
||||
format!("control/v1/tenant/{tenant_shard_id}/migrate"),
|
||||
Some(req),
|
||||
)
|
||||
.await
|
||||
{
|
||||
Err(mgmt_api::Error::ApiError(StatusCode::PRECONDITION_FAILED, msg)) => {
|
||||
anyhow::bail!(
|
||||
"Migration to {node} rejected, may require `--force` ({}) ",
|
||||
msg
|
||||
);
|
||||
}
|
||||
Err(e) => return Err(e.into()),
|
||||
Ok(_) => {}
|
||||
}
|
||||
|
||||
watch_tenant_shard(storcon_client, tenant_shard_id, Some(node)).await?;
|
||||
}
|
||||
Command::TenantShardWatch { tenant_shard_id } => {
|
||||
watch_tenant_shard(storcon_client, tenant_shard_id, None).await?;
|
||||
.await?;
|
||||
}
|
||||
Command::TenantShardMigrateSecondary {
|
||||
tenant_shard_id,
|
||||
@@ -685,8 +639,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
} => {
|
||||
let req = TenantShardMigrateRequest {
|
||||
node_id: node,
|
||||
origin_node_id: None,
|
||||
migration_config: MigrationConfig::default(),
|
||||
migration_config: None,
|
||||
};
|
||||
|
||||
storcon_client
|
||||
@@ -1152,8 +1105,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
format!("control/v1/tenant/{}/migrate", mv.tenant_shard_id),
|
||||
Some(TenantShardMigrateRequest {
|
||||
node_id: mv.to,
|
||||
origin_node_id: Some(mv.from),
|
||||
migration_config: MigrationConfig::default(),
|
||||
migration_config: None,
|
||||
}),
|
||||
)
|
||||
.await
|
||||
@@ -1332,68 +1284,3 @@ async fn main() -> anyhow::Result<()> {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
static WATCH_INTERVAL: Duration = Duration::from_secs(5);
|
||||
|
||||
async fn watch_tenant_shard(
|
||||
storcon_client: Client,
|
||||
tenant_shard_id: TenantShardId,
|
||||
until_migrated_to: Option<NodeId>,
|
||||
) -> anyhow::Result<()> {
|
||||
if let Some(until_migrated_to) = until_migrated_to {
|
||||
println!(
|
||||
"Waiting for tenant shard {} to be migrated to node {}",
|
||||
tenant_shard_id, until_migrated_to
|
||||
);
|
||||
}
|
||||
|
||||
loop {
|
||||
let desc = storcon_client
|
||||
.dispatch::<(), TenantDescribeResponse>(
|
||||
Method::GET,
|
||||
format!("control/v1/tenant/{}", tenant_shard_id.tenant_id),
|
||||
None,
|
||||
)
|
||||
.await?;
|
||||
|
||||
// Output the current state of the tenant shard
|
||||
let shard = desc
|
||||
.shards
|
||||
.iter()
|
||||
.find(|s| s.tenant_shard_id == tenant_shard_id)
|
||||
.ok_or(anyhow::anyhow!("Tenant shard not found"))?;
|
||||
let summary = format!(
|
||||
"attached: {} secondary: {} {}",
|
||||
shard
|
||||
.node_attached
|
||||
.map(|n| format!("{}", n))
|
||||
.unwrap_or("none".to_string()),
|
||||
shard
|
||||
.node_secondary
|
||||
.iter()
|
||||
.map(|n| n.to_string())
|
||||
.collect::<Vec<_>>()
|
||||
.join(","),
|
||||
if shard.is_reconciling {
|
||||
"(reconciler active)"
|
||||
} else {
|
||||
"(reconciler idle)"
|
||||
}
|
||||
);
|
||||
println!("{}", summary);
|
||||
|
||||
// Maybe drop out if we finished migration
|
||||
if let Some(until_migrated_to) = until_migrated_to {
|
||||
if shard.node_attached == Some(until_migrated_to) && !shard.is_reconciling {
|
||||
println!(
|
||||
"Tenant shard {} is now on node {}",
|
||||
tenant_shard_id, until_migrated_to
|
||||
);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
tokio::time::sleep(WATCH_INTERVAL).await;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -27,10 +27,6 @@ yanked = "warn"
|
||||
id = "RUSTSEC-2023-0071"
|
||||
reason = "the marvin attack only affects private key decryption, not public key signature verification"
|
||||
|
||||
[[advisories.ignore]]
|
||||
id = "RUSTSEC-2024-0436"
|
||||
reason = "The paste crate is a build-only dependency with no runtime components. It is unlikely to have any security impact."
|
||||
|
||||
# This section is considered when running `cargo deny check licenses`
|
||||
# More documentation for the licenses section can be found here:
|
||||
# https://embarkstudios.github.io/cargo-deny/checks/licenses/cfg.html
|
||||
|
||||
@@ -158,6 +158,7 @@ mod reliable_copy_test {
|
||||
utils::logging::init(
|
||||
utils::logging::LogFormat::Test,
|
||||
utils::logging::TracingErrorLayerEnablement::Disabled,
|
||||
utils::logging::OtelEnablement::Disabled,
|
||||
utils::logging::Output::Stdout,
|
||||
)
|
||||
.expect("logging init failed");
|
||||
|
||||
@@ -8,7 +8,6 @@ license.workspace = true
|
||||
anyhow.workspace = true
|
||||
bytes.workspace = true
|
||||
fail.workspace = true
|
||||
futures.workspace = true
|
||||
hyper0.workspace = true
|
||||
itertools.workspace = true
|
||||
jemalloc_pprof.workspace = true
|
||||
@@ -22,7 +21,6 @@ serde_path_to_error.workspace = true
|
||||
thiserror.workspace = true
|
||||
tracing.workspace = true
|
||||
tokio.workspace = true
|
||||
tokio-rustls.workspace = true
|
||||
tokio-util.workspace = true
|
||||
url.workspace = true
|
||||
uuid.workspace = true
|
||||
|
||||
@@ -399,10 +399,12 @@ pub async fn profile_cpu_handler(req: Request<Body>) -> Result<Response<Body>, A
|
||||
// Return the report in the requested format.
|
||||
match format {
|
||||
Format::Pprof => {
|
||||
let body = report
|
||||
let mut body = Vec::new();
|
||||
report
|
||||
.pprof()
|
||||
.map_err(|err| ApiError::InternalServerError(err.into()))?
|
||||
.encode_to_vec();
|
||||
.write_to_vec(&mut body)
|
||||
.map_err(|err| ApiError::InternalServerError(err.into()))?;
|
||||
|
||||
Response::builder()
|
||||
.status(200)
|
||||
|
||||
@@ -3,10 +3,9 @@ pub mod error;
|
||||
pub mod failpoints;
|
||||
pub mod json;
|
||||
pub mod request;
|
||||
pub mod server;
|
||||
|
||||
extern crate hyper0 as hyper;
|
||||
|
||||
/// Current fast way to apply simple http routing in various Neon binaries.
|
||||
/// Re-exported for sake of uniform approach, that could be later replaced with better alternatives, if needed.
|
||||
pub use routerify::{RequestServiceBuilder, RouterBuilder, RouterService, ext::RequestExt};
|
||||
pub use routerify::{RouterBuilder, RouterService, ext::RequestExt};
|
||||
|
||||
@@ -1,155 +0,0 @@
|
||||
use std::{error::Error, sync::Arc};
|
||||
|
||||
use futures::StreamExt;
|
||||
use futures::stream::FuturesUnordered;
|
||||
use hyper0::Body;
|
||||
use hyper0::server::conn::Http;
|
||||
use routerify::{RequestService, RequestServiceBuilder};
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
use tokio_rustls::TlsAcceptor;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{error, info};
|
||||
|
||||
use crate::error::ApiError;
|
||||
|
||||
/// A simple HTTP server over hyper library.
|
||||
/// You may want to use it instead of [`hyper0::server::Server`] because:
|
||||
/// 1. hyper0's Server was removed from hyper v1.
|
||||
/// It's recommended to replace hyepr0's Server with a manual loop, which is done here.
|
||||
/// 2. hyper0's Server doesn't support TLS out of the box, and there is no way
|
||||
/// to support it efficiently with the Accept trait that hyper0's Server uses.
|
||||
/// That's one of the reasons why it was removed from v1.
|
||||
/// <https://github.com/hyperium/hyper/blob/115339d3df50f20c8717680aa35f48858e9a6205/docs/ROADMAP.md#higher-level-client-and-server-problems>
|
||||
pub struct Server {
|
||||
request_service: Arc<RequestServiceBuilder<Body, ApiError>>,
|
||||
listener: tokio::net::TcpListener,
|
||||
tls_acceptor: Option<TlsAcceptor>,
|
||||
}
|
||||
|
||||
impl Server {
|
||||
pub fn new(
|
||||
request_service: Arc<RequestServiceBuilder<Body, ApiError>>,
|
||||
listener: std::net::TcpListener,
|
||||
tls_acceptor: Option<TlsAcceptor>,
|
||||
) -> anyhow::Result<Self> {
|
||||
// Note: caller of from_std is responsible for setting nonblocking mode.
|
||||
listener.set_nonblocking(true)?;
|
||||
let listener = tokio::net::TcpListener::from_std(listener)?;
|
||||
|
||||
Ok(Self {
|
||||
request_service,
|
||||
listener,
|
||||
tls_acceptor,
|
||||
})
|
||||
}
|
||||
|
||||
pub async fn serve(self, cancel: CancellationToken) -> anyhow::Result<()> {
|
||||
fn suppress_io_error(err: &std::io::Error) -> bool {
|
||||
use std::io::ErrorKind::*;
|
||||
matches!(err.kind(), ConnectionReset | ConnectionAborted | BrokenPipe)
|
||||
}
|
||||
fn suppress_hyper_error(err: &hyper0::Error) -> bool {
|
||||
if err.is_incomplete_message() || err.is_closed() || err.is_timeout() {
|
||||
return true;
|
||||
}
|
||||
if let Some(inner) = err.source() {
|
||||
if let Some(io) = inner.downcast_ref::<std::io::Error>() {
|
||||
return suppress_io_error(io);
|
||||
}
|
||||
}
|
||||
false
|
||||
}
|
||||
|
||||
let mut connections = FuturesUnordered::new();
|
||||
loop {
|
||||
tokio::select! {
|
||||
stream = self.listener.accept() => {
|
||||
let (tcp_stream, remote_addr) = match stream {
|
||||
Ok(stream) => stream,
|
||||
Err(err) => {
|
||||
if !suppress_io_error(&err) {
|
||||
info!("Failed to accept TCP connection: {err:#}");
|
||||
}
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
let service = self.request_service.build(remote_addr);
|
||||
let tls_acceptor = self.tls_acceptor.clone();
|
||||
let cancel = cancel.clone();
|
||||
|
||||
connections.push(tokio::spawn(
|
||||
async move {
|
||||
match tls_acceptor {
|
||||
Some(tls_acceptor) => {
|
||||
// Handle HTTPS connection.
|
||||
let tls_stream = tokio::select! {
|
||||
tls_stream = tls_acceptor.accept(tcp_stream) => tls_stream,
|
||||
_ = cancel.cancelled() => return,
|
||||
};
|
||||
let tls_stream = match tls_stream {
|
||||
Ok(tls_stream) => tls_stream,
|
||||
Err(err) => {
|
||||
if !suppress_io_error(&err) {
|
||||
info!("Failed to accept TLS connection: {err:#}");
|
||||
}
|
||||
return;
|
||||
}
|
||||
};
|
||||
if let Err(err) = Self::serve_connection(tls_stream, service, cancel).await {
|
||||
if !suppress_hyper_error(&err) {
|
||||
info!("Failed to serve HTTPS connection: {err:#}");
|
||||
}
|
||||
}
|
||||
}
|
||||
None => {
|
||||
// Handle HTTP connection.
|
||||
if let Err(err) = Self::serve_connection(tcp_stream, service, cancel).await {
|
||||
if !suppress_hyper_error(&err) {
|
||||
info!("Failed to serve HTTP connection: {err:#}");
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
}));
|
||||
}
|
||||
Some(conn) = connections.next() => {
|
||||
if let Err(err) = conn {
|
||||
error!("Connection panicked: {err:#}");
|
||||
}
|
||||
}
|
||||
_ = cancel.cancelled() => {
|
||||
// Wait for graceful shutdown of all connections.
|
||||
while let Some(conn) = connections.next().await {
|
||||
if let Err(err) = conn {
|
||||
error!("Connection panicked: {err:#}");
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Serves HTTP connection with graceful shutdown.
|
||||
async fn serve_connection<I>(
|
||||
io: I,
|
||||
service: RequestService<Body, ApiError>,
|
||||
cancel: CancellationToken,
|
||||
) -> Result<(), hyper0::Error>
|
||||
where
|
||||
I: AsyncRead + AsyncWrite + Unpin + Send + 'static,
|
||||
{
|
||||
let mut conn = Http::new().serve_connection(io, service).with_upgrades();
|
||||
|
||||
tokio::select! {
|
||||
res = &mut conn => res,
|
||||
_ = cancel.cancelled() => {
|
||||
Pin::new(&mut conn).graceful_shutdown();
|
||||
// Note: connection should still be awaited for graceful shutdown to complete.
|
||||
conn.await
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -34,6 +34,7 @@ postgres_backend.workspace = true
|
||||
nix = {workspace = true, optional = true}
|
||||
reqwest.workspace = true
|
||||
rand.workspace = true
|
||||
tracing-utils.workspace = true
|
||||
|
||||
[dev-dependencies]
|
||||
bincode.workspace = true
|
||||
|
||||
@@ -35,7 +35,6 @@ pub struct NodeMetadata {
|
||||
pub postgres_port: u16,
|
||||
pub http_host: String,
|
||||
pub http_port: u16,
|
||||
pub https_port: Option<u16>,
|
||||
|
||||
// Deployment tools may write fields to the metadata file beyond what we
|
||||
// use in this type: this type intentionally only names fields that require.
|
||||
@@ -58,9 +57,6 @@ pub struct ConfigToml {
|
||||
// types mapped 1:1 into the runtime PageServerConfig type
|
||||
pub listen_pg_addr: String,
|
||||
pub listen_http_addr: String,
|
||||
pub listen_https_addr: Option<String>,
|
||||
pub ssl_key_file: Utf8PathBuf,
|
||||
pub ssl_cert_file: Utf8PathBuf,
|
||||
pub availability_zone: Option<String>,
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub wait_lsn_timeout: Duration,
|
||||
@@ -131,6 +127,7 @@ pub struct ConfigToml {
|
||||
pub load_previous_heatmap: Option<bool>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub generate_unarchival_heatmap: Option<bool>,
|
||||
pub tracing: Option<Tracing>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
||||
@@ -188,6 +185,58 @@ pub enum GetVectoredConcurrentIo {
|
||||
SidecarTask,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
||||
#[serde(rename_all = "kebab-case")]
|
||||
pub struct Ratio {
|
||||
pub numerator: usize,
|
||||
pub denominator: usize,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
||||
#[serde(rename_all = "kebab-case")]
|
||||
pub struct OtelExporterConfig {
|
||||
pub endpoint: String,
|
||||
pub protocol: OtelExporterProtocol,
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub timeout: Duration,
|
||||
}
|
||||
|
||||
#[derive(Debug, Copy, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
||||
#[serde(rename_all = "kebab-case")]
|
||||
pub enum OtelExporterProtocol {
|
||||
Grpc,
|
||||
HttpBinary,
|
||||
HttpJson,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
||||
#[serde(deny_unknown_fields)]
|
||||
#[serde(rename_all = "kebab-case")]
|
||||
pub struct Tracing {
|
||||
pub sampling_ratio: Ratio,
|
||||
pub export_config: OtelExporterConfig,
|
||||
}
|
||||
|
||||
impl From<&OtelExporterConfig> for tracing_utils::ExportConfig {
|
||||
fn from(val: &OtelExporterConfig) -> Self {
|
||||
tracing_utils::ExportConfig {
|
||||
endpoint: Some(val.endpoint.clone()),
|
||||
protocol: val.protocol.into(),
|
||||
timeout: val.timeout,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<OtelExporterProtocol> for tracing_utils::Protocol {
|
||||
fn from(val: OtelExporterProtocol) -> Self {
|
||||
match val {
|
||||
OtelExporterProtocol::Grpc => tracing_utils::Protocol::Grpc,
|
||||
OtelExporterProtocol::HttpJson => tracing_utils::Protocol::HttpJson,
|
||||
OtelExporterProtocol::HttpBinary => tracing_utils::Protocol::HttpBinary,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub mod statvfs {
|
||||
pub mod mock {
|
||||
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
||||
@@ -425,9 +474,6 @@ pub mod defaults {
|
||||
|
||||
pub const DEFAULT_WAL_RECEIVER_PROTOCOL: utils::postgres_client::PostgresClientProtocol =
|
||||
utils::postgres_client::PostgresClientProtocol::Vanilla;
|
||||
|
||||
pub const DEFAULT_SSL_KEY_FILE: &str = "server.key";
|
||||
pub const DEFAULT_SSL_CERT_FILE: &str = "server.crt";
|
||||
}
|
||||
|
||||
impl Default for ConfigToml {
|
||||
@@ -437,9 +483,6 @@ impl Default for ConfigToml {
|
||||
Self {
|
||||
listen_pg_addr: (DEFAULT_PG_LISTEN_ADDR.to_string()),
|
||||
listen_http_addr: (DEFAULT_HTTP_LISTEN_ADDR.to_string()),
|
||||
listen_https_addr: (None),
|
||||
ssl_key_file: Utf8PathBuf::from(DEFAULT_SSL_KEY_FILE),
|
||||
ssl_cert_file: Utf8PathBuf::from(DEFAULT_SSL_CERT_FILE),
|
||||
availability_zone: (None),
|
||||
wait_lsn_timeout: (humantime::parse_duration(DEFAULT_WAIT_LSN_TIMEOUT)
|
||||
.expect("cannot parse default wait lsn timeout")),
|
||||
@@ -539,6 +582,7 @@ impl Default for ConfigToml {
|
||||
validate_wal_contiguity: None,
|
||||
load_previous_heatmap: None,
|
||||
generate_unarchival_heatmap: None,
|
||||
tracing: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -16,30 +16,6 @@ fn test_node_metadata_v1_backward_compatibilty() {
|
||||
postgres_port: 23,
|
||||
http_host: "localhost".to_string(),
|
||||
http_port: 42,
|
||||
https_port: None,
|
||||
other: HashMap::new(),
|
||||
}
|
||||
)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_node_metadata_v2_backward_compatibilty() {
|
||||
let v2 = serde_json::to_vec(&serde_json::json!({
|
||||
"host": "localhost",
|
||||
"port": 23,
|
||||
"http_host": "localhost",
|
||||
"http_port": 42,
|
||||
"https_port": 123,
|
||||
}));
|
||||
|
||||
assert_eq!(
|
||||
serde_json::from_slice::<NodeMetadata>(&v2.unwrap()).unwrap(),
|
||||
NodeMetadata {
|
||||
postgres_host: "localhost".to_string(),
|
||||
postgres_port: 23,
|
||||
http_host: "localhost".to_string(),
|
||||
http_port: 42,
|
||||
https_port: Some(123),
|
||||
other: HashMap::new(),
|
||||
}
|
||||
)
|
||||
|
||||
@@ -182,66 +182,20 @@ pub struct TenantDescribeResponseShard {
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
pub struct TenantShardMigrateRequest {
|
||||
pub node_id: NodeId,
|
||||
|
||||
/// Optionally, callers may specify the node they are migrating _from_, and the server will
|
||||
/// reject the request if the shard is no longer attached there: this enables writing safer
|
||||
/// clients that don't risk fighting with some other movement of the shard.
|
||||
#[serde(default)]
|
||||
pub origin_node_id: Option<NodeId>,
|
||||
|
||||
#[serde(default)]
|
||||
pub migration_config: MigrationConfig,
|
||||
pub migration_config: Option<MigrationConfig>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug, PartialEq, Eq)]
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
pub struct MigrationConfig {
|
||||
/// If true, the migration will be executed even if it is to a location with a sub-optimal scheduling
|
||||
/// score: this is usually not what you want, and if you use this then you'll also need to set the
|
||||
/// tenant's scheduling policy to Essential or Pause to avoid the optimiser reverting your migration.
|
||||
///
|
||||
/// Default: false
|
||||
#[serde(default)]
|
||||
pub override_scheduler: bool,
|
||||
|
||||
/// If true, the migration will be done gracefully by creating a secondary location first and
|
||||
/// waiting for it to warm up before cutting over. If false, if there is no existing secondary
|
||||
/// location at the destination, the tenant will be migrated immediately. If the tenant's data
|
||||
/// can't be downloaded within [`Self::secondary_warmup_timeout`], then the migration will go
|
||||
/// ahead but run with a cold cache that can severely reduce performance until it warms up.
|
||||
///
|
||||
/// When doing a graceful migration, the migration API returns as soon as it is started.
|
||||
///
|
||||
/// Default: true
|
||||
#[serde(default = "default_prewarm")]
|
||||
pub prewarm: bool,
|
||||
|
||||
/// For non-prewarm migrations which will immediately enter a cutover to the new node: how long to wait
|
||||
/// overall for secondary warmup before cutting over
|
||||
#[serde(default)]
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub secondary_warmup_timeout: Option<Duration>,
|
||||
/// For non-prewarm migrations which will immediately enter a cutover to the new node: how long to wait
|
||||
/// within each secondary download poll call to pageserver.
|
||||
#[serde(default)]
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub secondary_download_request_timeout: Option<Duration>,
|
||||
}
|
||||
|
||||
fn default_prewarm() -> bool {
|
||||
true
|
||||
}
|
||||
|
||||
impl Default for MigrationConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
override_scheduler: false,
|
||||
prewarm: default_prewarm(),
|
||||
secondary_warmup_timeout: None,
|
||||
secondary_download_request_timeout: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Clone, Debug)]
|
||||
#[serde(into = "NodeAvailabilityWrapper")]
|
||||
pub enum NodeAvailability {
|
||||
@@ -489,7 +443,6 @@ pub struct SafekeeperDescribeResponse {
|
||||
pub host: String,
|
||||
pub port: i32,
|
||||
pub http_port: i32,
|
||||
pub https_port: Option<i32>,
|
||||
pub availability_zone_id: String,
|
||||
pub scheduling_policy: SkSchedulingPolicy,
|
||||
}
|
||||
@@ -534,43 +487,4 @@ mod test {
|
||||
err
|
||||
);
|
||||
}
|
||||
|
||||
/// Check that a minimal migrate request with no config results in the expected default settings
|
||||
#[test]
|
||||
fn test_migrate_request_decode_defaults() {
|
||||
let json = r#"{
|
||||
"node_id": 123
|
||||
}"#;
|
||||
|
||||
let request: TenantShardMigrateRequest = serde_json::from_str(json).unwrap();
|
||||
assert_eq!(request.node_id, NodeId(123));
|
||||
assert_eq!(request.origin_node_id, None);
|
||||
assert!(!request.migration_config.override_scheduler);
|
||||
assert!(request.migration_config.prewarm);
|
||||
assert_eq!(request.migration_config.secondary_warmup_timeout, None);
|
||||
assert_eq!(
|
||||
request.migration_config.secondary_download_request_timeout,
|
||||
None
|
||||
);
|
||||
}
|
||||
|
||||
/// Check that a partially specified migration config results in the expected default settings
|
||||
#[test]
|
||||
fn test_migration_config_decode_defaults() {
|
||||
// Specify just one field of the config
|
||||
let json = r#"{
|
||||
}"#;
|
||||
|
||||
let config: MigrationConfig = serde_json::from_str(json).unwrap();
|
||||
|
||||
// Check each field's expected default value
|
||||
assert!(!config.override_scheduler);
|
||||
assert!(config.prewarm);
|
||||
assert_eq!(config.secondary_warmup_timeout, None);
|
||||
assert_eq!(config.secondary_download_request_timeout, None);
|
||||
assert_eq!(config.secondary_warmup_timeout, None);
|
||||
|
||||
// Consistency check that the Default impl agrees with our serde defaults
|
||||
assert_eq!(MigrationConfig::default(), config);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1146,15 +1146,6 @@ pub struct TimelineArchivalConfigRequest {
|
||||
pub state: TimelineArchivalState,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone)]
|
||||
pub struct TimelinePatchIndexPartRequest {
|
||||
pub rel_size_migration: Option<RelSizeMigration>,
|
||||
pub gc_compaction_last_completed_lsn: Option<Lsn>,
|
||||
pub applied_gc_cutoff_lsn: Option<Lsn>,
|
||||
#[serde(default)]
|
||||
pub force_index_update: bool,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize, Clone)]
|
||||
pub struct TimelinesInfoAndOffloaded {
|
||||
pub timelines: Vec<TimelineInfo>,
|
||||
|
||||
@@ -396,14 +396,6 @@ pub mod waldecoder {
|
||||
self.lsn + self.inputbuf.remaining() as u64
|
||||
}
|
||||
|
||||
/// Returns the LSN up to which the WAL decoder has processed.
|
||||
///
|
||||
/// If [`Self::poll_decode`] returned a record, then this will return
|
||||
/// the end LSN of said record.
|
||||
pub fn lsn(&self) -> Lsn {
|
||||
self.lsn
|
||||
}
|
||||
|
||||
pub fn feed_bytes(&mut self, buf: &[u8]) {
|
||||
self.inputbuf.extend_from_slice(buf);
|
||||
}
|
||||
|
||||
@@ -208,6 +208,7 @@ pub(crate) fn ensure_logging_ready() {
|
||||
utils::logging::init(
|
||||
utils::logging::LogFormat::Test,
|
||||
utils::logging::TracingErrorLayerEnablement::Disabled,
|
||||
utils::logging::OtelEnablement::Disabled,
|
||||
utils::logging::Output::Stdout,
|
||||
)
|
||||
.expect("logging init failed");
|
||||
|
||||
@@ -14,6 +14,7 @@ tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
|
||||
tracing.workspace = true
|
||||
tracing-opentelemetry.workspace = true
|
||||
tracing-subscriber.workspace = true
|
||||
pin-project-lite.workspace = true
|
||||
|
||||
[dev-dependencies]
|
||||
tracing-subscriber.workspace = true # For examples in docs
|
||||
|
||||
@@ -21,7 +21,7 @@
|
||||
//! .with_writer(std::io::stderr);
|
||||
//!
|
||||
//! // Initialize OpenTelemetry. Exports tracing spans as OpenTelemetry traces
|
||||
//! let otlp_layer = tracing_utils::init_tracing("my_application").await;
|
||||
//! let otlp_layer = tracing_utils::init_tracing("my_application", tracing_utils::ExportConfig::default()).await;
|
||||
//!
|
||||
//! // Put it all together
|
||||
//! tracing_subscriber::registry()
|
||||
@@ -31,13 +31,15 @@
|
||||
//! .init();
|
||||
//! }
|
||||
//! ```
|
||||
#![deny(unsafe_code)]
|
||||
#![deny(clippy::undocumented_unsafe_blocks)]
|
||||
|
||||
pub mod http;
|
||||
pub mod perf_span;
|
||||
|
||||
use opentelemetry::KeyValue;
|
||||
use opentelemetry::trace::TracerProvider;
|
||||
use opentelemetry_otlp::WithExportConfig;
|
||||
pub use opentelemetry_otlp::{ExportConfig, Protocol};
|
||||
use tracing::Subscriber;
|
||||
use tracing_subscriber::Layer;
|
||||
use tracing_subscriber::registry::LookupSpan;
|
||||
@@ -69,19 +71,28 @@ use tracing_subscriber::registry::LookupSpan;
|
||||
///
|
||||
/// This doesn't block, but is marked as 'async' to hint that this must be called in
|
||||
/// asynchronous execution context.
|
||||
pub async fn init_tracing<S>(service_name: &str) -> Option<impl Layer<S>>
|
||||
pub async fn init_tracing<S>(
|
||||
service_name: &str,
|
||||
export_config: ExportConfig,
|
||||
) -> Option<impl Layer<S>>
|
||||
where
|
||||
S: Subscriber + for<'span> LookupSpan<'span>,
|
||||
{
|
||||
if std::env::var("OTEL_SDK_DISABLED") == Ok("true".to_string()) {
|
||||
return None;
|
||||
};
|
||||
Some(init_tracing_internal(service_name.to_string()))
|
||||
Some(init_tracing_internal(
|
||||
service_name.to_string(),
|
||||
export_config,
|
||||
))
|
||||
}
|
||||
|
||||
/// Like `init_tracing`, but creates a separate tokio Runtime for the tracing
|
||||
/// tasks.
|
||||
pub fn init_tracing_without_runtime<S>(service_name: &str) -> Option<impl Layer<S>>
|
||||
pub fn init_tracing_without_runtime<S>(
|
||||
service_name: &str,
|
||||
export_config: ExportConfig,
|
||||
) -> Option<impl Layer<S>>
|
||||
where
|
||||
S: Subscriber + for<'span> LookupSpan<'span>,
|
||||
{
|
||||
@@ -112,16 +123,20 @@ where
|
||||
));
|
||||
let _guard = runtime.enter();
|
||||
|
||||
Some(init_tracing_internal(service_name.to_string()))
|
||||
Some(init_tracing_internal(
|
||||
service_name.to_string(),
|
||||
export_config,
|
||||
))
|
||||
}
|
||||
|
||||
fn init_tracing_internal<S>(service_name: String) -> impl Layer<S>
|
||||
fn init_tracing_internal<S>(service_name: String, export_config: ExportConfig) -> impl Layer<S>
|
||||
where
|
||||
S: Subscriber + for<'span> LookupSpan<'span>,
|
||||
{
|
||||
// Sets up exporter from the OTEL_EXPORTER_* environment variables.
|
||||
let exporter = opentelemetry_otlp::SpanExporter::builder()
|
||||
.with_http()
|
||||
.with_export_config(export_config)
|
||||
.build()
|
||||
.expect("could not initialize opentelemetry exporter");
|
||||
|
||||
|
||||
149
libs/tracing-utils/src/perf_span.rs
Normal file
149
libs/tracing-utils/src/perf_span.rs
Normal file
@@ -0,0 +1,149 @@
|
||||
//! Crutch module to work around tracing infrastructure deficiencies
|
||||
//!
|
||||
//! We wish to collect granular request spans without impacting performance
|
||||
//! by much. Ideally, we should have zero overhead for a sampling rate of 0.
|
||||
//!
|
||||
//! The approach taken by the pageserver crate is to use a completely different
|
||||
//! span hierarchy for the performance spans. Spans are explicitly stored in
|
||||
//! the request context and use a different [`tracing::Subscriber`] in order
|
||||
//! to avoid expensive filtering.
|
||||
//!
|
||||
//! [`tracing::Span`] instances record their [`tracing::Dispatch`] and, implcitly,
|
||||
//! their [`tracing::Subscriber`] at creation time. However, upon exiting the span,
|
||||
//! the global default [`tracing::Dispatch`] is used. This is problematic if one
|
||||
//! wishes to juggle different subscribers.
|
||||
//!
|
||||
//! In order to work around this, this module provides a [`PerfSpan`] type which
|
||||
//! wraps a [`Span`] and sets the default subscriber when exiting the span. This
|
||||
//! achieves the correct routing.
|
||||
//!
|
||||
//! There's also a modified version of [`tracing::Instrument`] which works with
|
||||
//! [`PerfSpan`].
|
||||
|
||||
use core::{
|
||||
future::Future,
|
||||
marker::Sized,
|
||||
mem::ManuallyDrop,
|
||||
pin::Pin,
|
||||
task::{Context, Poll},
|
||||
};
|
||||
use pin_project_lite::pin_project;
|
||||
use tracing::{Dispatch, field, span::Span};
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct PerfSpan {
|
||||
inner: ManuallyDrop<Span>,
|
||||
dispatch: Dispatch,
|
||||
}
|
||||
|
||||
#[must_use = "once a span has been entered, it should be exited"]
|
||||
pub struct PerfSpanEntered<'a> {
|
||||
span: &'a PerfSpan,
|
||||
}
|
||||
|
||||
impl PerfSpan {
|
||||
pub fn new(span: Span, dispatch: Dispatch) -> Self {
|
||||
Self {
|
||||
inner: ManuallyDrop::new(span),
|
||||
dispatch,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn record<Q: field::AsField + ?Sized, V: field::Value>(
|
||||
&self,
|
||||
field: &Q,
|
||||
value: V,
|
||||
) -> &Self {
|
||||
self.inner.record(field, value);
|
||||
self
|
||||
}
|
||||
|
||||
pub fn enter(&self) -> PerfSpanEntered {
|
||||
PerfSpanEntered { span: self }
|
||||
}
|
||||
|
||||
pub fn inner(&self) -> &Span {
|
||||
&self.inner
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for PerfSpan {
|
||||
fn drop(&mut self) {
|
||||
// Bring the desired dispatch into scope before explicitly calling
|
||||
// the span destructor. This routes the span exit to the correct
|
||||
// [`tracing::Subscriber`].
|
||||
let _dispatch_guard = tracing::dispatcher::set_default(&self.dispatch);
|
||||
// SAFETY: ManuallyDrop in Drop implementation
|
||||
unsafe { ManuallyDrop::drop(&mut self.inner) }
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for PerfSpanEntered<'_> {
|
||||
fn drop(&mut self) {
|
||||
assert!(self.span.inner.id().is_some());
|
||||
|
||||
let _dispatch_guard = tracing::dispatcher::set_default(&self.span.dispatch);
|
||||
self.span.dispatch.exit(&self.span.inner.id().unwrap());
|
||||
}
|
||||
}
|
||||
|
||||
pub trait PerfInstrument: Sized {
|
||||
fn instrument(self, span: PerfSpan) -> PerfInstrumented<Self> {
|
||||
PerfInstrumented {
|
||||
inner: ManuallyDrop::new(self),
|
||||
span,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pin_project! {
|
||||
#[project = PerfInstrumentedProj]
|
||||
#[derive(Debug, Clone)]
|
||||
#[must_use = "futures do nothing unless you `.await` or poll them"]
|
||||
pub struct PerfInstrumented<T> {
|
||||
// `ManuallyDrop` is used here to to enter instrument `Drop` by entering
|
||||
// `Span` and executing `ManuallyDrop::drop`.
|
||||
#[pin]
|
||||
inner: ManuallyDrop<T>,
|
||||
span: PerfSpan,
|
||||
}
|
||||
|
||||
impl<T> PinnedDrop for PerfInstrumented<T> {
|
||||
fn drop(this: Pin<&mut Self>) {
|
||||
let this = this.project();
|
||||
let _enter = this.span.enter();
|
||||
// SAFETY: 1. `Pin::get_unchecked_mut()` is safe, because this isn't
|
||||
// different from wrapping `T` in `Option` and calling
|
||||
// `Pin::set(&mut this.inner, None)`, except avoiding
|
||||
// additional memory overhead.
|
||||
// 2. `ManuallyDrop::drop()` is safe, because
|
||||
// `PinnedDrop::drop()` is guaranteed to be called only
|
||||
// once.
|
||||
unsafe { ManuallyDrop::drop(this.inner.get_unchecked_mut()) }
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, T> PerfInstrumentedProj<'a, T> {
|
||||
/// Get a mutable reference to the [`Span`] a pinned mutable reference to
|
||||
/// the wrapped type.
|
||||
fn span_and_inner_pin_mut(self) -> (&'a mut PerfSpan, Pin<&'a mut T>) {
|
||||
// SAFETY: As long as `ManuallyDrop<T>` does not move, `T` won't move
|
||||
// and `inner` is valid, because `ManuallyDrop::drop` is called
|
||||
// only inside `Drop` of the `Instrumented`.
|
||||
let inner = unsafe { self.inner.map_unchecked_mut(|v| &mut **v) };
|
||||
(self.span, inner)
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: Future> Future for PerfInstrumented<T> {
|
||||
type Output = T::Output;
|
||||
|
||||
fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Self::Output> {
|
||||
let (span, inner) = self.project().span_and_inner_pin_mut();
|
||||
let _enter = span.enter();
|
||||
inner.poll(cx)
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: Sized> PerfInstrument for T {}
|
||||
@@ -42,6 +42,7 @@ toml_edit = { workspace = true, features = ["serde"] }
|
||||
tracing.workspace = true
|
||||
tracing-error.workspace = true
|
||||
tracing-subscriber = { workspace = true, features = ["json", "registry"] }
|
||||
tracing-utils.workspace = true
|
||||
rand.workspace = true
|
||||
scopeguard.workspace = true
|
||||
strum.workspace = true
|
||||
|
||||
@@ -7,7 +7,9 @@ use metrics::{IntCounter, IntCounterVec};
|
||||
use once_cell::sync::Lazy;
|
||||
use strum_macros::{EnumString, VariantNames};
|
||||
use tokio::time::Instant;
|
||||
use tracing::Dispatch;
|
||||
use tracing::info;
|
||||
use tracing::level_filters::LevelFilter;
|
||||
|
||||
/// Logs a critical error, similarly to `tracing::error!`. This will:
|
||||
///
|
||||
@@ -125,6 +127,15 @@ pub enum TracingErrorLayerEnablement {
|
||||
EnableWithRustLogFilter,
|
||||
}
|
||||
|
||||
pub enum OtelEnablement {
|
||||
Disabled,
|
||||
Enabled {
|
||||
service_name: String,
|
||||
export_config: tracing_utils::ExportConfig,
|
||||
runtime: &'static tokio::runtime::Runtime,
|
||||
},
|
||||
}
|
||||
|
||||
/// Where the logging should output to.
|
||||
#[derive(Clone, Copy)]
|
||||
pub enum Output {
|
||||
@@ -132,11 +143,24 @@ pub enum Output {
|
||||
Stderr,
|
||||
}
|
||||
|
||||
pub struct OtelGuard {
|
||||
pub dispatch: Dispatch,
|
||||
}
|
||||
|
||||
impl Drop for OtelGuard {
|
||||
fn drop(&mut self) {
|
||||
tracing_utils::shutdown_tracing();
|
||||
}
|
||||
}
|
||||
|
||||
pub const PERF_TRACE_TARGET: &str = "P";
|
||||
|
||||
pub fn init(
|
||||
log_format: LogFormat,
|
||||
tracing_error_layer_enablement: TracingErrorLayerEnablement,
|
||||
otel_enablement: OtelEnablement,
|
||||
output: Output,
|
||||
) -> anyhow::Result<()> {
|
||||
) -> anyhow::Result<Option<OtelGuard>> {
|
||||
// We fall back to printing all spans at info-level or above if
|
||||
// the RUST_LOG environment variable is not set.
|
||||
let rust_log_env_filter = || {
|
||||
@@ -165,6 +189,7 @@ pub fn init(
|
||||
};
|
||||
log_layer.with_filter(rust_log_env_filter())
|
||||
});
|
||||
|
||||
let r = r.with(
|
||||
TracingEventCountLayer(&TRACING_EVENT_COUNT_METRIC).with_filter(rust_log_env_filter()),
|
||||
);
|
||||
@@ -175,7 +200,26 @@ pub fn init(
|
||||
TracingErrorLayerEnablement::Disabled => r.init(),
|
||||
}
|
||||
|
||||
Ok(())
|
||||
let otel_subscriber = match otel_enablement {
|
||||
OtelEnablement::Disabled => None,
|
||||
OtelEnablement::Enabled {
|
||||
service_name,
|
||||
export_config,
|
||||
runtime,
|
||||
} => {
|
||||
let otel_layer = runtime
|
||||
.block_on(tracing_utils::init_tracing(&service_name, export_config))
|
||||
.with_filter(LevelFilter::INFO);
|
||||
let otel_subscriber = tracing_subscriber::registry().with(otel_layer);
|
||||
let otel_dispatch = Dispatch::new(otel_subscriber);
|
||||
|
||||
Some(otel_dispatch)
|
||||
}
|
||||
};
|
||||
|
||||
let otel_guard = otel_subscriber.map(|dispatch| OtelGuard { dispatch });
|
||||
|
||||
Ok(otel_guard)
|
||||
}
|
||||
|
||||
/// Disable the default rust panic hook by using `set_hook`.
|
||||
|
||||
@@ -48,9 +48,6 @@ pprof.workspace = true
|
||||
rand.workspace = true
|
||||
range-set-blaze = { version = "0.1.16", features = ["alloc"] }
|
||||
regex.workspace = true
|
||||
rustls-pemfile.workspace = true
|
||||
rustls-pki-types.workspace = true
|
||||
rustls.workspace = true
|
||||
scopeguard.workspace = true
|
||||
send-future.workspace = true
|
||||
serde.workspace = true
|
||||
@@ -65,11 +62,11 @@ tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util"
|
||||
tokio-epoll-uring.workspace = true
|
||||
tokio-io-timeout.workspace = true
|
||||
tokio-postgres.workspace = true
|
||||
tokio-rustls.workspace = true
|
||||
tokio-stream.workspace = true
|
||||
tokio-util.workspace = true
|
||||
toml_edit = { workspace = true, features = [ "serde" ] }
|
||||
tracing.workspace = true
|
||||
tracing-utils.workspace = true
|
||||
url.workspace = true
|
||||
walkdir.workspace = true
|
||||
metrics.workspace = true
|
||||
@@ -102,7 +99,6 @@ criterion.workspace = true
|
||||
hex-literal.workspace = true
|
||||
tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time", "test-util"] }
|
||||
indoc.workspace = true
|
||||
uuid.workspace = true
|
||||
|
||||
[[bench]]
|
||||
name = "bench_layer_map"
|
||||
|
||||
@@ -7,7 +7,7 @@ use http_utils::error::HttpErrorBody;
|
||||
use pageserver_api::models::*;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
pub use reqwest::Body as ReqwestBody;
|
||||
use reqwest::{Certificate, IntoUrl, Method, StatusCode};
|
||||
use reqwest::{IntoUrl, Method, StatusCode};
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
@@ -38,9 +38,6 @@ pub enum Error {
|
||||
|
||||
#[error("Cancelled")]
|
||||
Cancelled,
|
||||
|
||||
#[error("create client: {0}{}", .0.source().map(|e| format!(": {e}")).unwrap_or_default())]
|
||||
CreateClient(reqwest::Error),
|
||||
}
|
||||
|
||||
pub type Result<T> = std::result::Result<T, Error>;
|
||||
@@ -72,17 +69,8 @@ pub enum ForceAwaitLogicalSize {
|
||||
}
|
||||
|
||||
impl Client {
|
||||
pub fn new(
|
||||
mgmt_api_endpoint: String,
|
||||
jwt: Option<&str>,
|
||||
ssl_ca_cert: Option<Certificate>,
|
||||
) -> Result<Self> {
|
||||
let mut http_client = reqwest::Client::builder();
|
||||
if let Some(ssl_ca_cert) = ssl_ca_cert {
|
||||
http_client = http_client.add_root_certificate(ssl_ca_cert);
|
||||
}
|
||||
let http_client = http_client.build().map_err(Error::CreateClient)?;
|
||||
Ok(Self::from_client(http_client, mgmt_api_endpoint, jwt))
|
||||
pub fn new(mgmt_api_endpoint: String, jwt: Option<&str>) -> Self {
|
||||
Self::from_client(reqwest::Client::new(), mgmt_api_endpoint, jwt)
|
||||
}
|
||||
|
||||
pub fn from_client(
|
||||
@@ -113,10 +101,12 @@ impl Client {
|
||||
debug_assert!(path.starts_with('/'));
|
||||
let uri = format!("{}{}", self.mgmt_api_endpoint, path);
|
||||
|
||||
let mut req = self.client.request(Method::GET, uri);
|
||||
if let Some(value) = &self.authorization_header {
|
||||
req = req.header(reqwest::header::AUTHORIZATION, value);
|
||||
}
|
||||
let req = self.client.request(Method::GET, uri);
|
||||
let req = if let Some(value) = &self.authorization_header {
|
||||
req.header(reqwest::header::AUTHORIZATION, value)
|
||||
} else {
|
||||
req
|
||||
};
|
||||
req.send().await.map_err(Error::ReceiveBody)
|
||||
}
|
||||
|
||||
@@ -471,23 +461,6 @@ impl Client {
|
||||
.map_err(Error::ReceiveBody)
|
||||
}
|
||||
|
||||
pub async fn timeline_detach_ancestor_v2(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
) -> Result<AncestorDetached> {
|
||||
let uri = format!(
|
||||
"{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/detach_ancestor_v2",
|
||||
self.mgmt_api_endpoint
|
||||
);
|
||||
|
||||
self.request(Method::PUT, &uri, ())
|
||||
.await?
|
||||
.json()
|
||||
.await
|
||||
.map_err(Error::ReceiveBody)
|
||||
}
|
||||
|
||||
pub async fn timeline_block_unblock_gc(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
|
||||
@@ -10,9 +10,10 @@ pub(crate) fn setup_logging() {
|
||||
logging::init(
|
||||
logging::LogFormat::Test,
|
||||
logging::TracingErrorLayerEnablement::EnableWithRustLogFilter,
|
||||
utils::logging::OtelEnablement::Disabled,
|
||||
logging::Output::Stdout,
|
||||
)
|
||||
.expect("Failed to init test logging")
|
||||
.expect("Failed to init test logging");
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
@@ -117,6 +117,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
logging::init(
|
||||
LogFormat::Plain,
|
||||
TracingErrorLayerEnablement::EnableWithRustLogFilter,
|
||||
utils::logging::OtelEnablement::Disabled,
|
||||
logging::Output::Stdout,
|
||||
)?;
|
||||
|
||||
|
||||
@@ -15,7 +15,6 @@ hdrhistogram.workspace = true
|
||||
humantime.workspace = true
|
||||
humantime-serde.workspace = true
|
||||
rand.workspace = true
|
||||
reqwest.workspace=true
|
||||
serde.workspace = true
|
||||
serde_json.workspace = true
|
||||
tracing.workspace = true
|
||||
|
||||
@@ -36,8 +36,7 @@ async fn main_impl(args: Args) -> anyhow::Result<()> {
|
||||
let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new(
|
||||
args.mgmt_api_endpoint.clone(),
|
||||
args.pageserver_jwt.as_deref(),
|
||||
None, // TODO: support ssl_ca_file for https APIs in pagebench.
|
||||
)?);
|
||||
));
|
||||
|
||||
// discover targets
|
||||
let timelines: Vec<TenantTimelineId> = crate::util::cli::targets::discover(
|
||||
|
||||
@@ -77,8 +77,7 @@ async fn main_impl(
|
||||
let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new(
|
||||
args.mgmt_api_endpoint.clone(),
|
||||
args.pageserver_jwt.as_deref(),
|
||||
None, // TODO: support ssl_ca_file for https APIs in pagebench.
|
||||
)?);
|
||||
));
|
||||
|
||||
// discover targets
|
||||
let timelines: Vec<TenantTimelineId> = crate::util::cli::targets::discover(
|
||||
|
||||
@@ -125,8 +125,7 @@ async fn main_impl(
|
||||
let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new(
|
||||
args.mgmt_api_endpoint.clone(),
|
||||
args.pageserver_jwt.as_deref(),
|
||||
None, // TODO: support ssl_ca_file for https APIs in pagebench.
|
||||
)?);
|
||||
));
|
||||
|
||||
if let Some(engine_str) = &args.set_io_engine {
|
||||
mgmt_api_client.put_io_engine(engine_str).await?;
|
||||
|
||||
@@ -83,8 +83,7 @@ async fn main_impl(args: Args) -> anyhow::Result<()> {
|
||||
let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new(
|
||||
args.mgmt_api_endpoint.clone(),
|
||||
args.pageserver_jwt.as_deref(),
|
||||
None, // TODO: support ssl_ca_file for https APIs in pagebench.
|
||||
)?);
|
||||
));
|
||||
|
||||
if let Some(engine_str) = &args.set_io_engine {
|
||||
mgmt_api_client.put_io_engine(engine_str).await?;
|
||||
|
||||
@@ -40,8 +40,7 @@ async fn main_impl(args: Args) -> anyhow::Result<()> {
|
||||
let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new(
|
||||
args.mgmt_api_endpoint.clone(),
|
||||
args.pageserver_jwt.as_deref(),
|
||||
None, // TODO: support ssl_ca_file for https APIs in pagebench.
|
||||
)?);
|
||||
));
|
||||
|
||||
// discover targets
|
||||
let timelines: Vec<TenantTimelineId> = crate::util::cli::targets::discover(
|
||||
|
||||
@@ -35,6 +35,7 @@ fn main() {
|
||||
logging::init(
|
||||
logging::LogFormat::Plain,
|
||||
logging::TracingErrorLayerEnablement::Disabled,
|
||||
utils::logging::OtelEnablement::Disabled,
|
||||
logging::Output::Stderr,
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
@@ -21,23 +21,23 @@ use pageserver::deletion_queue::DeletionQueue;
|
||||
use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task};
|
||||
use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING};
|
||||
use pageserver::task_mgr::{
|
||||
BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME, WALRECEIVER_RUNTIME,
|
||||
BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME, OTEL_RUNTIME,
|
||||
WALRECEIVER_RUNTIME,
|
||||
};
|
||||
use pageserver::tenant::{TenantSharedResources, mgr, secondary};
|
||||
use pageserver::{
|
||||
CancellableTask, ConsumptionMetricsTasks, HttpEndpointListener, HttpsEndpointListener, http,
|
||||
page_cache, page_service, task_mgr, virtual_file,
|
||||
CancellableTask, ConsumptionMetricsTasks, HttpEndpointListener, http, page_cache, page_service,
|
||||
task_mgr, virtual_file,
|
||||
};
|
||||
use postgres_backend::AuthType;
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use rustls_pki_types::{CertificateDer, PrivateKeyDer};
|
||||
use tokio::signal::unix::SignalKind;
|
||||
use tokio::time::Instant;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
use utils::auth::{JwtAuth, SwappableJwtAuth};
|
||||
use utils::crashsafe::syncfs;
|
||||
use utils::logging::TracingErrorLayerEnablement;
|
||||
use utils::logging::{OtelGuard, TracingErrorLayerEnablement};
|
||||
use utils::sentry_init::init_sentry;
|
||||
use utils::{failpoint_support, logging, project_build_tag, project_git_version, tcp_listener};
|
||||
|
||||
@@ -111,12 +111,27 @@ fn main() -> anyhow::Result<()> {
|
||||
} else {
|
||||
TracingErrorLayerEnablement::Disabled
|
||||
};
|
||||
logging::init(
|
||||
|
||||
let otel_enablement = match &conf.tracing {
|
||||
Some(cfg) => utils::logging::OtelEnablement::Enabled {
|
||||
service_name: "pageserver".to_string(),
|
||||
export_config: (&cfg.export_config).into(),
|
||||
runtime: *OTEL_RUNTIME,
|
||||
},
|
||||
None => utils::logging::OtelEnablement::Disabled,
|
||||
};
|
||||
|
||||
let otel_guard = logging::init(
|
||||
conf.log_format,
|
||||
tracing_error_layer_enablement,
|
||||
otel_enablement,
|
||||
logging::Output::Stdout,
|
||||
)?;
|
||||
|
||||
if otel_guard.is_some() {
|
||||
info!(?conf.tracing, "starting with OTEL tracing enabled");
|
||||
}
|
||||
|
||||
// mind the order required here: 1. logging, 2. panic_hook, 3. sentry.
|
||||
// disarming this hook on pageserver, because we never tear down tracing.
|
||||
logging::replace_panic_hook_with_tracing_panic_hook().forget();
|
||||
@@ -190,7 +205,7 @@ fn main() -> anyhow::Result<()> {
|
||||
tracing::info!("Initializing page_cache...");
|
||||
page_cache::init(conf.page_cache_size);
|
||||
|
||||
start_pageserver(launch_ts, conf).context("Failed to start pageserver")?;
|
||||
start_pageserver(launch_ts, conf, otel_guard).context("Failed to start pageserver")?;
|
||||
|
||||
scenario.teardown();
|
||||
Ok(())
|
||||
@@ -288,6 +303,7 @@ fn startup_checkpoint(started_at: Instant, phase: &str, human_phase: &str) {
|
||||
fn start_pageserver(
|
||||
launch_ts: &'static LaunchTimestamp,
|
||||
conf: &'static PageServerConf,
|
||||
otel_guard: Option<OtelGuard>,
|
||||
) -> anyhow::Result<()> {
|
||||
// Monotonic time for later calculating startup duration
|
||||
let started_startup_at = Instant::now();
|
||||
@@ -344,15 +360,8 @@ fn start_pageserver(
|
||||
info!("Starting pageserver http handler on {http_addr}");
|
||||
let http_listener = tcp_listener::bind(http_addr)?;
|
||||
|
||||
let https_listener = match conf.listen_https_addr.as_ref() {
|
||||
Some(https_addr) => {
|
||||
info!("Starting pageserver https handler on {https_addr}");
|
||||
Some(tcp_listener::bind(https_addr)?)
|
||||
}
|
||||
None => None,
|
||||
};
|
||||
|
||||
let pg_addr = &conf.listen_pg_addr;
|
||||
|
||||
info!("Starting pageserver pg protocol handler on {pg_addr}");
|
||||
let pageserver_listener = tcp_listener::bind(pg_addr)?;
|
||||
|
||||
@@ -583,8 +592,9 @@ fn start_pageserver(
|
||||
|
||||
// Start up the service to handle HTTP mgmt API request. We created the
|
||||
// listener earlier already.
|
||||
let (http_endpoint_listener, https_endpoint_listener) = {
|
||||
let http_endpoint_listener = {
|
||||
let _rt_guard = MGMT_REQUEST_RUNTIME.enter(); // for hyper
|
||||
let cancel = CancellationToken::new();
|
||||
|
||||
let router_state = Arc::new(
|
||||
http::routes::State::new(
|
||||
@@ -599,51 +609,22 @@ fn start_pageserver(
|
||||
)
|
||||
.context("Failed to initialize router state")?,
|
||||
);
|
||||
|
||||
let router = http::make_router(router_state, launch_ts, http_auth.clone())?
|
||||
.build()
|
||||
.map_err(|err| anyhow!(err))?;
|
||||
let service = http_utils::RouterService::new(router).unwrap();
|
||||
let server = hyper0::Server::from_tcp(http_listener)?
|
||||
.serve(service)
|
||||
.with_graceful_shutdown({
|
||||
let cancel = cancel.clone();
|
||||
async move { cancel.clone().cancelled().await }
|
||||
});
|
||||
|
||||
let service =
|
||||
Arc::new(http_utils::RequestServiceBuilder::new(router).map_err(|err| anyhow!(err))?);
|
||||
|
||||
let http_task = {
|
||||
let server =
|
||||
http_utils::server::Server::new(Arc::clone(&service), http_listener, None)?;
|
||||
let cancel = CancellationToken::new();
|
||||
|
||||
let task = MGMT_REQUEST_RUNTIME.spawn(task_mgr::exit_on_panic_or_error(
|
||||
"http endpoint listener",
|
||||
server.serve(cancel.clone()),
|
||||
));
|
||||
HttpEndpointListener(CancellableTask { task, cancel })
|
||||
};
|
||||
|
||||
let https_task = match https_listener {
|
||||
Some(https_listener) => {
|
||||
let certs = load_certs(&conf.ssl_cert_file)?;
|
||||
let key = load_private_key(&conf.ssl_key_file)?;
|
||||
|
||||
let server_config = rustls::ServerConfig::builder()
|
||||
.with_no_client_auth()
|
||||
.with_single_cert(certs, key)?;
|
||||
|
||||
let tls_acceptor = tokio_rustls::TlsAcceptor::from(Arc::new(server_config));
|
||||
|
||||
let server =
|
||||
http_utils::server::Server::new(service, https_listener, Some(tls_acceptor))?;
|
||||
let cancel = CancellationToken::new();
|
||||
|
||||
let task = MGMT_REQUEST_RUNTIME.spawn(task_mgr::exit_on_panic_or_error(
|
||||
"https endpoint listener",
|
||||
server.serve(cancel.clone()),
|
||||
));
|
||||
Some(HttpsEndpointListener(CancellableTask { task, cancel }))
|
||||
}
|
||||
None => None,
|
||||
};
|
||||
|
||||
(http_task, https_task)
|
||||
let task = MGMT_REQUEST_RUNTIME.spawn(task_mgr::exit_on_panic_or_error(
|
||||
"http endpoint listener",
|
||||
server,
|
||||
));
|
||||
HttpEndpointListener(CancellableTask { task, cancel })
|
||||
};
|
||||
|
||||
let consumption_metrics_tasks = {
|
||||
@@ -670,13 +651,21 @@ fn start_pageserver(
|
||||
|
||||
// Spawn a task to listen for libpq connections. It will spawn further tasks
|
||||
// for each connection. We created the listener earlier already.
|
||||
let page_service = page_service::spawn(conf, tenant_manager.clone(), pg_auth, {
|
||||
let _entered = COMPUTE_REQUEST_RUNTIME.enter(); // TcpListener::from_std requires it
|
||||
pageserver_listener
|
||||
.set_nonblocking(true)
|
||||
.context("set listener to nonblocking")?;
|
||||
tokio::net::TcpListener::from_std(pageserver_listener).context("create tokio listener")?
|
||||
});
|
||||
let perf_trace_dispatch = otel_guard.as_ref().map(|g| g.dispatch.clone());
|
||||
let page_service = page_service::spawn(
|
||||
conf,
|
||||
tenant_manager.clone(),
|
||||
pg_auth,
|
||||
perf_trace_dispatch,
|
||||
{
|
||||
let _entered = COMPUTE_REQUEST_RUNTIME.enter(); // TcpListener::from_std requires it
|
||||
pageserver_listener
|
||||
.set_nonblocking(true)
|
||||
.context("set listener to nonblocking")?;
|
||||
tokio::net::TcpListener::from_std(pageserver_listener)
|
||||
.context("create tokio listener")?
|
||||
},
|
||||
);
|
||||
|
||||
// All started up! Now just sit and wait for shutdown signal.
|
||||
BACKGROUND_RUNTIME.block_on(async move {
|
||||
@@ -719,7 +708,6 @@ fn start_pageserver(
|
||||
shutdown_pageserver.cancel();
|
||||
pageserver::shutdown_pageserver(
|
||||
http_endpoint_listener,
|
||||
https_endpoint_listener,
|
||||
page_service,
|
||||
consumption_metrics_tasks,
|
||||
disk_usage_eviction_task,
|
||||
@@ -734,25 +722,6 @@ fn start_pageserver(
|
||||
})
|
||||
}
|
||||
|
||||
fn load_certs(filename: &Utf8Path) -> std::io::Result<Vec<CertificateDer<'static>>> {
|
||||
let file = std::fs::File::open(filename)?;
|
||||
let mut reader = std::io::BufReader::new(file);
|
||||
|
||||
rustls_pemfile::certs(&mut reader).collect()
|
||||
}
|
||||
|
||||
fn load_private_key(filename: &Utf8Path) -> anyhow::Result<PrivateKeyDer<'static>> {
|
||||
let file = std::fs::File::open(filename)?;
|
||||
let mut reader = std::io::BufReader::new(file);
|
||||
|
||||
let key = rustls_pemfile::private_key(&mut reader)?;
|
||||
|
||||
key.ok_or(anyhow::anyhow!(
|
||||
"no private key found in {}",
|
||||
filename.as_str(),
|
||||
))
|
||||
}
|
||||
|
||||
async fn create_remote_storage_client(
|
||||
conf: &'static PageServerConf,
|
||||
) -> anyhow::Result<GenericRemoteStorage> {
|
||||
|
||||
@@ -53,11 +53,6 @@ pub struct PageServerConf {
|
||||
pub listen_pg_addr: String,
|
||||
/// Example (default): 127.0.0.1:9898
|
||||
pub listen_http_addr: String,
|
||||
/// Example: 127.0.0.1:9899
|
||||
pub listen_https_addr: Option<String>,
|
||||
|
||||
pub ssl_key_file: Utf8PathBuf,
|
||||
pub ssl_cert_file: Utf8PathBuf,
|
||||
|
||||
/// Current availability zone. Used for traffic metrics.
|
||||
pub availability_zone: Option<String>,
|
||||
@@ -206,6 +201,8 @@ pub struct PageServerConf {
|
||||
|
||||
/// When set, include visible layers in the next uploaded heatmaps of an unarchived timeline.
|
||||
pub generate_unarchival_heatmap: bool,
|
||||
|
||||
pub tracing: Option<pageserver_api::config::Tracing>,
|
||||
}
|
||||
|
||||
/// Token for authentication to safekeepers
|
||||
@@ -322,9 +319,6 @@ impl PageServerConf {
|
||||
let pageserver_api::config::ConfigToml {
|
||||
listen_pg_addr,
|
||||
listen_http_addr,
|
||||
listen_https_addr,
|
||||
ssl_key_file,
|
||||
ssl_cert_file,
|
||||
availability_zone,
|
||||
wait_lsn_timeout,
|
||||
wal_redo_timeout,
|
||||
@@ -375,6 +369,7 @@ impl PageServerConf {
|
||||
validate_wal_contiguity,
|
||||
load_previous_heatmap,
|
||||
generate_unarchival_heatmap,
|
||||
tracing,
|
||||
} = config_toml;
|
||||
|
||||
let mut conf = PageServerConf {
|
||||
@@ -383,9 +378,6 @@ impl PageServerConf {
|
||||
// ------------------------------------------------------------
|
||||
listen_pg_addr,
|
||||
listen_http_addr,
|
||||
listen_https_addr,
|
||||
ssl_key_file,
|
||||
ssl_cert_file,
|
||||
availability_zone,
|
||||
wait_lsn_timeout,
|
||||
wal_redo_timeout,
|
||||
@@ -423,6 +415,7 @@ impl PageServerConf {
|
||||
wal_receiver_protocol,
|
||||
page_service_pipelining,
|
||||
get_vectored_concurrent_io,
|
||||
tracing,
|
||||
|
||||
// ------------------------------------------------------------
|
||||
// fields that require additional validation or custom handling
|
||||
@@ -467,8 +460,8 @@ impl PageServerConf {
|
||||
no_sync: no_sync.unwrap_or(false),
|
||||
enable_read_path_debugging: enable_read_path_debugging.unwrap_or(false),
|
||||
validate_wal_contiguity: validate_wal_contiguity.unwrap_or(false),
|
||||
load_previous_heatmap: load_previous_heatmap.unwrap_or(true),
|
||||
generate_unarchival_heatmap: generate_unarchival_heatmap.unwrap_or(true),
|
||||
load_previous_heatmap: load_previous_heatmap.unwrap_or(false),
|
||||
generate_unarchival_heatmap: generate_unarchival_heatmap.unwrap_or(false),
|
||||
};
|
||||
|
||||
// ------------------------------------------------------------
|
||||
@@ -487,6 +480,17 @@ impl PageServerConf {
|
||||
);
|
||||
}
|
||||
|
||||
if let Some(tracing_config) = conf.tracing.as_ref() {
|
||||
let ratio = &tracing_config.sampling_ratio;
|
||||
ensure!(
|
||||
ratio.denominator != 0 && ratio.denominator >= ratio.numerator,
|
||||
format!(
|
||||
"Invalid sampling ratio: {}/{}",
|
||||
ratio.numerator, ratio.denominator
|
||||
)
|
||||
);
|
||||
}
|
||||
|
||||
IndexEntry::validate_checkpoint_distance(conf.default_tenant_conf.checkpoint_distance)
|
||||
.map_err(anyhow::Error::msg)
|
||||
.with_context(|| {
|
||||
@@ -502,9 +506,7 @@ impl PageServerConf {
|
||||
#[cfg(test)]
|
||||
pub fn test_repo_dir(test_name: &str) -> Utf8PathBuf {
|
||||
let test_output_dir = std::env::var("TEST_OUTPUT").unwrap_or("../tmp_check".into());
|
||||
|
||||
let test_id = uuid::Uuid::new_v4();
|
||||
Utf8PathBuf::from(format!("{test_output_dir}/test_{test_name}_{test_id}"))
|
||||
Utf8PathBuf::from(format!("{test_output_dir}/test_{test_name}"))
|
||||
}
|
||||
|
||||
pub fn dummy_conf(repo_dir: Utf8PathBuf) -> Self {
|
||||
|
||||
@@ -89,16 +89,38 @@
|
||||
//! [`RequestContext`] argument. Functions in the middle of the call chain
|
||||
//! only need to pass it on.
|
||||
|
||||
use futures::FutureExt;
|
||||
use futures::future::BoxFuture;
|
||||
use std::future::Future;
|
||||
use tracing_utils::perf_span::{PerfInstrument, PerfSpan};
|
||||
|
||||
use tracing::{Dispatch, Span};
|
||||
|
||||
use crate::task_mgr::TaskKind;
|
||||
|
||||
// The main structure of this module, see module-level comment.
|
||||
#[derive(Debug)]
|
||||
#[derive(Clone)]
|
||||
pub struct RequestContext {
|
||||
task_kind: TaskKind,
|
||||
download_behavior: DownloadBehavior,
|
||||
access_stats_behavior: AccessStatsBehavior,
|
||||
page_content_kind: PageContentKind,
|
||||
read_path_debug: bool,
|
||||
perf_span: Option<PerfSpan>,
|
||||
perf_span_dispatch: Option<Dispatch>,
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for RequestContext {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.debug_struct("RequestContext")
|
||||
.field("task_kind", &self.task_kind)
|
||||
.field("download_behavior", &self.download_behavior)
|
||||
.field("access_stats_behavior", &self.access_stats_behavior)
|
||||
.field("page_content_kind", &self.page_content_kind)
|
||||
.field("read_path_debug", &self.read_path_debug)
|
||||
// perf_span and perf_span_dispatch are omitted on purpose
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
/// The kind of access to the page cache.
|
||||
@@ -157,24 +179,23 @@ impl RequestContextBuilder {
|
||||
access_stats_behavior: AccessStatsBehavior::Update,
|
||||
page_content_kind: PageContentKind::Unknown,
|
||||
read_path_debug: false,
|
||||
perf_span: None,
|
||||
perf_span_dispatch: None,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
pub fn extend(original: &RequestContext) -> Self {
|
||||
pub fn from(original: &RequestContext) -> Self {
|
||||
Self {
|
||||
// This is like a Copy, but avoid implementing Copy because ordinary users of
|
||||
// RequestContext should always move or ref it.
|
||||
inner: RequestContext {
|
||||
task_kind: original.task_kind,
|
||||
download_behavior: original.download_behavior,
|
||||
access_stats_behavior: original.access_stats_behavior,
|
||||
page_content_kind: original.page_content_kind,
|
||||
read_path_debug: original.read_path_debug,
|
||||
},
|
||||
inner: original.clone(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn task_kind(mut self, b: TaskKind) -> Self {
|
||||
self.inner.task_kind = b;
|
||||
self
|
||||
}
|
||||
|
||||
/// Configure the DownloadBehavior of the context: whether to
|
||||
/// download missing layers, and/or warn on the download.
|
||||
pub fn download_behavior(mut self, b: DownloadBehavior) -> Self {
|
||||
@@ -199,7 +220,52 @@ impl RequestContextBuilder {
|
||||
self
|
||||
}
|
||||
|
||||
pub fn build(self) -> RequestContext {
|
||||
pub(crate) fn perf_span_dispatch(mut self, dispatch: Option<Dispatch>) -> Self {
|
||||
self.inner.perf_span_dispatch = dispatch;
|
||||
self
|
||||
}
|
||||
|
||||
pub fn root_perf_span<Fn>(mut self, make_span: Fn) -> Self
|
||||
where
|
||||
Fn: FnOnce() -> Span,
|
||||
{
|
||||
assert!(self.inner.perf_span.is_none());
|
||||
assert!(self.inner.perf_span_dispatch.is_some());
|
||||
|
||||
let dispatcher = self.inner.perf_span_dispatch.as_ref().unwrap();
|
||||
let new_span = tracing::dispatcher::with_default(dispatcher, make_span);
|
||||
|
||||
self.inner.perf_span = Some(PerfSpan::new(new_span, dispatcher.clone()));
|
||||
|
||||
self
|
||||
}
|
||||
|
||||
pub fn perf_span<Fn>(mut self, make_span: Fn) -> Self
|
||||
where
|
||||
Fn: FnOnce(&Span) -> Span,
|
||||
{
|
||||
if let Some(ref perf_span) = self.inner.perf_span {
|
||||
assert!(self.inner.perf_span_dispatch.is_some());
|
||||
let dispatcher = self.inner.perf_span_dispatch.as_ref().unwrap();
|
||||
|
||||
let new_span =
|
||||
tracing::dispatcher::with_default(dispatcher, || make_span(perf_span.inner()));
|
||||
|
||||
self.inner.perf_span = Some(PerfSpan::new(new_span, dispatcher.clone()));
|
||||
}
|
||||
|
||||
self
|
||||
}
|
||||
|
||||
pub fn root(self) -> RequestContext {
|
||||
self.inner
|
||||
}
|
||||
|
||||
pub fn attached_child(self) -> RequestContext {
|
||||
self.inner
|
||||
}
|
||||
|
||||
pub fn detached_child(self) -> RequestContext {
|
||||
self.inner
|
||||
}
|
||||
}
|
||||
@@ -220,7 +286,7 @@ impl RequestContext {
|
||||
pub fn new(task_kind: TaskKind, download_behavior: DownloadBehavior) -> Self {
|
||||
RequestContextBuilder::new(task_kind)
|
||||
.download_behavior(download_behavior)
|
||||
.build()
|
||||
.root()
|
||||
}
|
||||
|
||||
/// Create a detached child context for a task that may outlive `self`.
|
||||
@@ -241,7 +307,10 @@ impl RequestContext {
|
||||
///
|
||||
/// We could make new calls to this function fail if `self` is already canceled.
|
||||
pub fn detached_child(&self, task_kind: TaskKind, download_behavior: DownloadBehavior) -> Self {
|
||||
self.child_impl(task_kind, download_behavior)
|
||||
RequestContextBuilder::from(self)
|
||||
.task_kind(task_kind)
|
||||
.download_behavior(download_behavior)
|
||||
.detached_child()
|
||||
}
|
||||
|
||||
/// Create a child of context `self` for a task that shall not outlive `self`.
|
||||
@@ -265,7 +334,7 @@ impl RequestContext {
|
||||
/// The method to wait for child tasks would return an error, indicating
|
||||
/// that the child task was not started because the context was canceled.
|
||||
pub fn attached_child(&self) -> Self {
|
||||
self.child_impl(self.task_kind(), self.download_behavior())
|
||||
RequestContextBuilder::from(self).attached_child()
|
||||
}
|
||||
|
||||
/// Use this function when you should be creating a child context using
|
||||
@@ -280,10 +349,6 @@ impl RequestContext {
|
||||
Self::new(task_kind, download_behavior)
|
||||
}
|
||||
|
||||
fn child_impl(&self, task_kind: TaskKind, download_behavior: DownloadBehavior) -> Self {
|
||||
Self::new(task_kind, download_behavior)
|
||||
}
|
||||
|
||||
pub fn task_kind(&self) -> TaskKind {
|
||||
self.task_kind
|
||||
}
|
||||
@@ -303,4 +368,51 @@ impl RequestContext {
|
||||
pub(crate) fn read_path_debug(&self) -> bool {
|
||||
self.read_path_debug
|
||||
}
|
||||
|
||||
pub(crate) fn perf_follows_from(&self, from: &RequestContext) {
|
||||
if let (Some(span), Some(from_span)) = (&self.perf_span, &from.perf_span) {
|
||||
span.inner().follows_from(from_span.inner());
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn maybe_instrument<'a, Fut, Fn>(
|
||||
&self,
|
||||
future: Fut,
|
||||
make_span: Fn,
|
||||
) -> BoxFuture<'a, Fut::Output>
|
||||
where
|
||||
Fut: Future + Send + 'a,
|
||||
Fn: FnOnce(&Span) -> Span,
|
||||
{
|
||||
match &self.perf_span {
|
||||
Some(perf_span) => {
|
||||
assert!(self.perf_span_dispatch.is_some());
|
||||
let dispatcher = self.perf_span_dispatch.as_ref().unwrap();
|
||||
|
||||
let new_span =
|
||||
tracing::dispatcher::with_default(dispatcher, || make_span(perf_span.inner()));
|
||||
|
||||
let new_perf_span = PerfSpan::new(new_span, dispatcher.clone());
|
||||
future.instrument(new_perf_span).boxed()
|
||||
}
|
||||
None => future.boxed(),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn perf_span_record<
|
||||
Q: tracing::field::AsField + ?Sized,
|
||||
V: tracing::field::Value,
|
||||
>(
|
||||
&self,
|
||||
field: &Q,
|
||||
value: V,
|
||||
) {
|
||||
if let Some(span) = &self.perf_span {
|
||||
span.record(field, value);
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn has_perf_span(&self) -> bool {
|
||||
self.perf_span.is_some()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -181,7 +181,7 @@ impl ControlPlaneGenerationsApi for ControllerUpcallClient {
|
||||
listen_pg_port: m.postgres_port,
|
||||
listen_http_addr: m.http_host,
|
||||
listen_http_port: m.http_port,
|
||||
listen_https_port: m.https_port,
|
||||
listen_https_port: None, // TODO: Support https.
|
||||
availability_zone_id: az_id.expect("Checked above"),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -37,8 +37,7 @@ use pageserver_api::models::{
|
||||
TenantShardSplitResponse, TenantSorting, TenantState, TenantWaitLsnRequest,
|
||||
TimelineArchivalConfigRequest, TimelineCreateRequest, TimelineCreateRequestMode,
|
||||
TimelineCreateRequestModeImportPgdata, TimelineGcRequest, TimelineInfo,
|
||||
TimelinePatchIndexPartRequest, TimelinesInfoAndOffloaded, TopTenantShardItem,
|
||||
TopTenantShardsRequest, TopTenantShardsResponse,
|
||||
TimelinesInfoAndOffloaded, TopTenantShardItem, TopTenantShardsRequest, TopTenantShardsResponse,
|
||||
};
|
||||
use pageserver_api::shard::{ShardCount, TenantShardId};
|
||||
use remote_storage::{DownloadError, GenericRemoteStorage, TimeTravelError};
|
||||
@@ -64,14 +63,12 @@ use crate::tenant::mgr::{
|
||||
GetActiveTenantError, GetTenantError, TenantManager, TenantMapError, TenantMapInsertError,
|
||||
TenantSlot, TenantSlotError, TenantSlotUpsertError, TenantStateError, UpsertLocationError,
|
||||
};
|
||||
use crate::tenant::remote_timeline_client::index::GcCompactionState;
|
||||
use crate::tenant::remote_timeline_client::{
|
||||
download_index_part, list_remote_tenant_shards, list_remote_timelines,
|
||||
};
|
||||
use crate::tenant::secondary::SecondaryController;
|
||||
use crate::tenant::size::ModelInputs;
|
||||
use crate::tenant::storage_layer::{IoConcurrency, LayerAccessStatsReset, LayerName};
|
||||
use crate::tenant::timeline::detach_ancestor::DetachBehavior;
|
||||
use crate::tenant::timeline::offload::{OffloadError, offload_timeline};
|
||||
use crate::tenant::timeline::{
|
||||
CompactFlags, CompactOptions, CompactRequest, CompactionError, Timeline, WaitLsnTimeout,
|
||||
@@ -861,75 +858,6 @@ async fn timeline_archival_config_handler(
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
/// This API is used to patch the index part of a timeline. You must ensure such patches are safe to apply. Use this API as an emergency
|
||||
/// measure only.
|
||||
///
|
||||
/// Some examples of safe patches:
|
||||
/// - Increase the gc_cutoff and gc_compaction_cutoff to a larger value in case of a bug that didn't bump the cutoff and cause read errors.
|
||||
/// - Force set the index part to use reldir v2 (migrating/migrated).
|
||||
///
|
||||
/// Some examples of unsafe patches:
|
||||
/// - Force set the index part from v2 to v1 (legacy). This will cause the code path to ignore anything written to the new keyspace and cause
|
||||
/// errors.
|
||||
/// - Decrease the gc_cutoff without validating the data really exists. It will cause read errors in the background.
|
||||
async fn timeline_patch_index_part_handler(
|
||||
mut request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
|
||||
let request_data: TimelinePatchIndexPartRequest = json_request(&mut request).await?;
|
||||
check_permission(&request, None)?; // require global permission for this request
|
||||
let state = get_state(&request);
|
||||
|
||||
async {
|
||||
let timeline =
|
||||
active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
|
||||
.await?;
|
||||
|
||||
if let Some(rel_size_migration) = request_data.rel_size_migration {
|
||||
timeline
|
||||
.update_rel_size_v2_status(rel_size_migration)
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
}
|
||||
|
||||
if let Some(gc_compaction_last_completed_lsn) =
|
||||
request_data.gc_compaction_last_completed_lsn
|
||||
{
|
||||
timeline
|
||||
.update_gc_compaction_state(GcCompactionState {
|
||||
last_completed_lsn: gc_compaction_last_completed_lsn,
|
||||
})
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
}
|
||||
|
||||
if let Some(applied_gc_cutoff_lsn) = request_data.applied_gc_cutoff_lsn {
|
||||
{
|
||||
let guard = timeline.applied_gc_cutoff_lsn.lock_for_write();
|
||||
guard.store_and_unlock(applied_gc_cutoff_lsn);
|
||||
}
|
||||
}
|
||||
|
||||
if request_data.force_index_update {
|
||||
timeline
|
||||
.remote_client
|
||||
.force_schedule_index_upload()
|
||||
.context("force schedule index upload")
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
}
|
||||
|
||||
Ok::<_, ApiError>(())
|
||||
}
|
||||
.instrument(info_span!("timeline_patch_index_part",
|
||||
tenant_id = %tenant_shard_id.tenant_id,
|
||||
shard_id = %tenant_shard_id.shard_slug(),
|
||||
%timeline_id))
|
||||
.await?;
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
async fn timeline_detail_handler(
|
||||
request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
@@ -2490,29 +2418,8 @@ async fn timeline_download_remote_layers_handler_get(
|
||||
json_response(StatusCode::OK, info)
|
||||
}
|
||||
|
||||
async fn timeline_detach_ancestor_handler_v1(
|
||||
async fn timeline_detach_ancestor_handler(
|
||||
request: Request<Body>,
|
||||
cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
timeline_detach_ancestor_handler_common(request, DetachBehavior::NoAncestorAndReparent, cancel)
|
||||
.await
|
||||
}
|
||||
|
||||
async fn timeline_detach_ancestor_handler_v2(
|
||||
request: Request<Body>,
|
||||
cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
timeline_detach_ancestor_handler_common(
|
||||
request,
|
||||
DetachBehavior::MultiLevelAndNoReparent,
|
||||
cancel,
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
async fn timeline_detach_ancestor_handler_common(
|
||||
request: Request<Body>,
|
||||
behavior: DetachBehavior,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
use pageserver_api::models::detach_ancestor::AncestorDetached;
|
||||
@@ -2570,7 +2477,7 @@ async fn timeline_detach_ancestor_handler_common(
|
||||
let timeline = tenant.get_timeline(timeline_id, true)?;
|
||||
|
||||
let progress = timeline
|
||||
.prepare_to_detach_from_ancestor(&tenant, options, behavior, ctx)
|
||||
.prepare_to_detach_from_ancestor(&tenant, options, ctx)
|
||||
.await?;
|
||||
|
||||
// uncomment to allow early as possible Tenant::drop
|
||||
@@ -2585,7 +2492,6 @@ async fn timeline_detach_ancestor_handler_common(
|
||||
tenant_shard_id,
|
||||
timeline_id,
|
||||
prepared,
|
||||
behavior,
|
||||
attempt,
|
||||
ctx,
|
||||
)
|
||||
@@ -2673,9 +2579,10 @@ async fn getpage_at_lsn_handler_inner(
|
||||
let lsn: Option<Lsn> = parse_query_param(&request, "lsn")?;
|
||||
|
||||
async {
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||
// Enable read path debugging
|
||||
let ctx = RequestContextBuilder::extend(&ctx).read_path_debug(true).build();
|
||||
let ctx = RequestContextBuilder::new(TaskKind::MgmtRequest)
|
||||
.download_behavior(DownloadBehavior::Download)
|
||||
.read_path_debug(true)
|
||||
.root();
|
||||
let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?;
|
||||
|
||||
// Use last_record_lsn if no lsn is provided
|
||||
@@ -3723,10 +3630,6 @@ pub fn make_router(
|
||||
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/get_timestamp_of_lsn",
|
||||
|r| api_handler(r, get_timestamp_of_lsn_handler),
|
||||
)
|
||||
.post(
|
||||
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/patch_index_part",
|
||||
|r| api_handler(r, timeline_patch_index_part_handler),
|
||||
)
|
||||
.post(
|
||||
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/lsn_lease",
|
||||
|r| api_handler(r, lsn_lease_handler),
|
||||
@@ -3765,11 +3668,7 @@ pub fn make_router(
|
||||
)
|
||||
.put(
|
||||
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/detach_ancestor",
|
||||
|r| api_handler(r, timeline_detach_ancestor_handler_v1),
|
||||
)
|
||||
.put(
|
||||
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/detach_ancestor_v2",
|
||||
|r| api_handler(r, timeline_detach_ancestor_handler_v2),
|
||||
|r| api_handler(r, timeline_detach_ancestor_handler),
|
||||
)
|
||||
.delete("/v1/tenant/:tenant_shard_id/timeline/:timeline_id", |r| {
|
||||
api_handler(r, timeline_delete_handler)
|
||||
|
||||
@@ -64,7 +64,6 @@ pub struct CancellableTask {
|
||||
pub cancel: CancellationToken,
|
||||
}
|
||||
pub struct HttpEndpointListener(pub CancellableTask);
|
||||
pub struct HttpsEndpointListener(pub CancellableTask);
|
||||
pub struct ConsumptionMetricsTasks(pub CancellableTask);
|
||||
pub struct DiskUsageEvictionTask(pub CancellableTask);
|
||||
impl CancellableTask {
|
||||
@@ -78,7 +77,6 @@ impl CancellableTask {
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub async fn shutdown_pageserver(
|
||||
http_listener: HttpEndpointListener,
|
||||
https_listener: Option<HttpsEndpointListener>,
|
||||
page_service: page_service::Listener,
|
||||
consumption_metrics_worker: ConsumptionMetricsTasks,
|
||||
disk_usage_eviction_task: Option<DiskUsageEvictionTask>,
|
||||
@@ -215,15 +213,6 @@ pub async fn shutdown_pageserver(
|
||||
)
|
||||
.await;
|
||||
|
||||
if let Some(https_listener) = https_listener {
|
||||
timed(
|
||||
https_listener.0.shutdown(),
|
||||
"shutdown https",
|
||||
Duration::from_secs(1),
|
||||
)
|
||||
.await;
|
||||
}
|
||||
|
||||
// Shut down the HTTP endpoint last, so that you can still check the server's
|
||||
// status while it's shutting down.
|
||||
// FIXME: We should probably stop accepting commands like attach/detach earlier.
|
||||
|
||||
@@ -17,7 +17,7 @@ use itertools::Itertools;
|
||||
use once_cell::sync::OnceCell;
|
||||
use pageserver_api::config::{
|
||||
PageServicePipeliningConfig, PageServicePipeliningConfigPipelined,
|
||||
PageServiceProtocolPipelinedExecutionStrategy,
|
||||
PageServiceProtocolPipelinedExecutionStrategy, Tracing,
|
||||
};
|
||||
use pageserver_api::key::rel_block_to_key;
|
||||
use pageserver_api::models::{
|
||||
@@ -36,6 +36,7 @@ use postgres_ffi::BLCKSZ;
|
||||
use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
|
||||
use pq_proto::framed::ConnectionError;
|
||||
use pq_proto::{BeMessage, FeMessage, FeStartupPacket, RowDescriptor};
|
||||
use rand::Rng;
|
||||
use strum_macros::IntoStaticStr;
|
||||
use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt, BufWriter};
|
||||
use tokio::task::JoinHandle;
|
||||
@@ -44,6 +45,7 @@ use tracing::*;
|
||||
use utils::auth::{Claims, Scope, SwappableJwtAuth};
|
||||
use utils::failpoint_support;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
use utils::logging::PERF_TRACE_TARGET;
|
||||
use utils::logging::log_slow;
|
||||
use utils::lsn::Lsn;
|
||||
use utils::simple_rcu::RcuReadGuard;
|
||||
@@ -53,7 +55,7 @@ use utils::sync::spsc_fold;
|
||||
use crate::auth::check_permission;
|
||||
use crate::basebackup::BasebackupError;
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::{DownloadBehavior, RequestContext};
|
||||
use crate::context::{DownloadBehavior, RequestContext, RequestContextBuilder};
|
||||
use crate::metrics::{
|
||||
self, COMPUTE_COMMANDS_COUNTERS, ComputeCommandKind, LIVE_CONNECTIONS, SmgrOpTimer,
|
||||
};
|
||||
@@ -99,6 +101,7 @@ pub fn spawn(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_manager: Arc<TenantManager>,
|
||||
pg_auth: Option<Arc<SwappableJwtAuth>>,
|
||||
perf_trace_dispatch: Option<Dispatch>,
|
||||
tcp_listener: tokio::net::TcpListener,
|
||||
) -> Listener {
|
||||
let cancel = CancellationToken::new();
|
||||
@@ -116,6 +119,7 @@ pub fn spawn(
|
||||
conf,
|
||||
tenant_manager,
|
||||
pg_auth,
|
||||
perf_trace_dispatch,
|
||||
tcp_listener,
|
||||
conf.pg_auth_type,
|
||||
conf.page_service_pipelining.clone(),
|
||||
@@ -172,6 +176,7 @@ pub async fn libpq_listener_main(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_manager: Arc<TenantManager>,
|
||||
auth: Option<Arc<SwappableJwtAuth>>,
|
||||
perf_trace_dispatch: Option<Dispatch>,
|
||||
listener: tokio::net::TcpListener,
|
||||
auth_type: AuthType,
|
||||
pipelining_config: PageServicePipeliningConfig,
|
||||
@@ -204,8 +209,12 @@ pub async fn libpq_listener_main(
|
||||
// Connection established. Spawn a new task to handle it.
|
||||
debug!("accepted connection from {}", peer_addr);
|
||||
let local_auth = auth.clone();
|
||||
let connection_ctx = listener_ctx
|
||||
.detached_child(TaskKind::PageRequestHandler, DownloadBehavior::Download);
|
||||
let connection_ctx = RequestContextBuilder::from(&listener_ctx)
|
||||
.task_kind(TaskKind::PageRequestHandler)
|
||||
.download_behavior(DownloadBehavior::Download)
|
||||
.perf_span_dispatch(perf_trace_dispatch.clone())
|
||||
.detached_child();
|
||||
|
||||
connection_handler_tasks.spawn(page_service_conn_main(
|
||||
conf,
|
||||
tenant_manager.clone(),
|
||||
@@ -600,6 +609,7 @@ impl std::fmt::Display for BatchedPageStreamError {
|
||||
struct BatchedGetPageRequest {
|
||||
req: PagestreamGetPageRequest,
|
||||
timer: SmgrOpTimer,
|
||||
ctx: RequestContext,
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
@@ -736,6 +746,7 @@ impl PageServerHandler {
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
timeline_handles: &mut TimelineHandles,
|
||||
tracing_config: Option<&Tracing>,
|
||||
cancel: &CancellationToken,
|
||||
ctx: &RequestContext,
|
||||
protocol_version: PagestreamProtocolVersion,
|
||||
@@ -895,10 +906,55 @@ impl PageServerHandler {
|
||||
}
|
||||
|
||||
let key = rel_block_to_key(req.rel, req.blkno);
|
||||
let shard = match timeline_handles
|
||||
.get(tenant_id, timeline_id, ShardSelector::Page(key))
|
||||
.await
|
||||
{
|
||||
|
||||
let sampled = match tracing_config {
|
||||
Some(conf) => {
|
||||
let ratio = &conf.sampling_ratio;
|
||||
|
||||
if ratio.numerator == 0 {
|
||||
false
|
||||
} else {
|
||||
rand::thread_rng().gen_range(0..ratio.denominator) < ratio.numerator
|
||||
}
|
||||
}
|
||||
None => false,
|
||||
};
|
||||
|
||||
let get_page_context = if sampled {
|
||||
RequestContextBuilder::from(ctx)
|
||||
.root_perf_span(|| {
|
||||
info_span!(
|
||||
target: PERF_TRACE_TARGET,
|
||||
"GET_PAGE",
|
||||
tenant_id = %tenant_id,
|
||||
timeline_id = %timeline_id,
|
||||
lsn = %req.hdr.request_lsn,
|
||||
request_id = %req.hdr.reqid,
|
||||
key = %key)
|
||||
})
|
||||
.attached_child()
|
||||
} else {
|
||||
ctx.attached_child()
|
||||
};
|
||||
|
||||
let res = get_page_context
|
||||
.maybe_instrument(
|
||||
timeline_handles.get(tenant_id, timeline_id, ShardSelector::Page(key)),
|
||||
|current_perf_span| {
|
||||
info_span!(
|
||||
target: PERF_TRACE_TARGET,
|
||||
parent: current_perf_span,
|
||||
"SHARD_SELECTION",
|
||||
tenant_id = %tenant_id,
|
||||
timeline_id = %timeline_id,
|
||||
lsn = %req.hdr.request_lsn,
|
||||
request_id = %req.hdr.reqid
|
||||
)
|
||||
},
|
||||
)
|
||||
.await;
|
||||
|
||||
let shard = match res {
|
||||
Ok(tl) => tl,
|
||||
Err(e) => {
|
||||
let span = mkspan!(before shard routing);
|
||||
@@ -927,24 +983,59 @@ impl PageServerHandler {
|
||||
};
|
||||
let span = mkspan!(shard.tenant_shard_id.shard_slug());
|
||||
|
||||
let timer = record_op_start_and_throttle(
|
||||
&shard,
|
||||
metrics::SmgrQueryType::GetPageAtLsn,
|
||||
received_at,
|
||||
)
|
||||
.await?;
|
||||
// TODO(vlad): why does this not show up?
|
||||
get_page_context.perf_span_record(
|
||||
"shard",
|
||||
tracing::field::display(shard.get_shard_identity().shard_slug()),
|
||||
);
|
||||
|
||||
let timer = get_page_context
|
||||
.maybe_instrument(
|
||||
record_op_start_and_throttle(
|
||||
&shard,
|
||||
metrics::SmgrQueryType::GetPageAtLsn,
|
||||
received_at,
|
||||
),
|
||||
|current_perf_span| {
|
||||
info_span!(
|
||||
target: PERF_TRACE_TARGET,
|
||||
parent: current_perf_span,
|
||||
"THROTTLE",
|
||||
tenant_id = %tenant_id,
|
||||
timeline_id = %timeline_id,
|
||||
lsn = %req.hdr.request_lsn,
|
||||
request_id = %req.hdr.reqid
|
||||
)
|
||||
},
|
||||
)
|
||||
.await?;
|
||||
|
||||
// We're holding the Handle
|
||||
let effective_request_lsn = match Self::wait_or_get_last_lsn(
|
||||
&shard,
|
||||
req.hdr.request_lsn,
|
||||
req.hdr.not_modified_since,
|
||||
&shard.get_applied_gc_cutoff_lsn(),
|
||||
ctx,
|
||||
)
|
||||
// TODO: if we actually need to wait for lsn here, it delays the entire batch which doesn't need to wait
|
||||
.await
|
||||
{
|
||||
let res = get_page_context
|
||||
.maybe_instrument(
|
||||
Self::wait_or_get_last_lsn(
|
||||
&shard,
|
||||
req.hdr.request_lsn,
|
||||
req.hdr.not_modified_since,
|
||||
&shard.get_applied_gc_cutoff_lsn(),
|
||||
ctx,
|
||||
),
|
||||
|current_perf_span| {
|
||||
info_span!(
|
||||
target: PERF_TRACE_TARGET,
|
||||
parent: current_perf_span,
|
||||
"WAIT_LSN",
|
||||
tenant_id = %tenant_id,
|
||||
timeline_id = %timeline_id,
|
||||
lsn = %req.hdr.request_lsn,
|
||||
request_id = %req.hdr.reqid
|
||||
)
|
||||
},
|
||||
)
|
||||
.await;
|
||||
|
||||
let effective_request_lsn = match res {
|
||||
Ok(lsn) => lsn,
|
||||
Err(e) => {
|
||||
return respond_error!(span, e);
|
||||
@@ -954,7 +1045,11 @@ impl PageServerHandler {
|
||||
span,
|
||||
shard: shard.downgrade(),
|
||||
effective_request_lsn,
|
||||
pages: smallvec::smallvec![BatchedGetPageRequest { req, timer }],
|
||||
pages: smallvec::smallvec![BatchedGetPageRequest {
|
||||
req,
|
||||
timer,
|
||||
ctx: get_page_context
|
||||
}],
|
||||
}
|
||||
}
|
||||
#[cfg(feature = "testing")]
|
||||
@@ -1486,12 +1581,15 @@ impl PageServerHandler {
|
||||
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin + 'static,
|
||||
{
|
||||
let cancel = self.cancel.clone();
|
||||
let tracing_config = self.conf.tracing.clone();
|
||||
|
||||
let err = loop {
|
||||
let msg = Self::pagestream_read_message(
|
||||
&mut pgb_reader,
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
&mut timeline_handles,
|
||||
tracing_config.as_ref(),
|
||||
&cancel,
|
||||
ctx,
|
||||
protocol_version,
|
||||
@@ -1625,6 +1723,8 @@ impl PageServerHandler {
|
||||
// Batcher
|
||||
//
|
||||
|
||||
let tracing_config = self.conf.tracing.clone();
|
||||
|
||||
let cancel_batcher = self.cancel.child_token();
|
||||
let (mut batch_tx, mut batch_rx) = spsc_fold::channel();
|
||||
let batcher = pipeline_stage!("batcher", cancel_batcher.clone(), move |cancel_batcher| {
|
||||
@@ -1638,6 +1738,7 @@ impl PageServerHandler {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
&mut timeline_handles,
|
||||
tracing_config.as_ref(),
|
||||
&cancel_batcher,
|
||||
&ctx,
|
||||
protocol_version,
|
||||
@@ -1976,7 +2077,9 @@ impl PageServerHandler {
|
||||
|
||||
let results = timeline
|
||||
.get_rel_page_at_lsn_batched(
|
||||
requests.iter().map(|p| (&p.req.rel, &p.req.blkno)),
|
||||
requests
|
||||
.iter()
|
||||
.map(|p| (&p.req.rel, &p.req.blkno, p.ctx.attached_child())),
|
||||
effective_lsn,
|
||||
io_concurrency,
|
||||
ctx,
|
||||
|
||||
@@ -31,15 +31,16 @@ use postgres_ffi::{BLCKSZ, Oid, RepOriginId, TimestampTz, TransactionId};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use strum::IntoEnumIterator;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{debug, info, trace, warn};
|
||||
use tracing::{debug, info, info_span, trace, warn};
|
||||
use utils::bin_ser::{BeSer, DeserializeError};
|
||||
use utils::logging::PERF_TRACE_TARGET;
|
||||
use utils::lsn::Lsn;
|
||||
use utils::pausable_failpoint;
|
||||
use wal_decoder::serialized_batch::{SerializedValueBatch, ValueMeta};
|
||||
|
||||
use super::tenant::{PageReconstructError, Timeline};
|
||||
use crate::aux_file;
|
||||
use crate::context::RequestContext;
|
||||
use crate::context::{RequestContext, RequestContextBuilder};
|
||||
use crate::keyspace::{KeySpace, KeySpaceAccum};
|
||||
use crate::metrics::{
|
||||
RELSIZE_CACHE_ENTRIES, RELSIZE_CACHE_HITS, RELSIZE_CACHE_MISSES, RELSIZE_CACHE_MISSES_OLD,
|
||||
@@ -209,7 +210,9 @@ impl Timeline {
|
||||
let pages: smallvec::SmallVec<[_; 1]> = smallvec::smallvec![(tag, blknum)];
|
||||
let res = self
|
||||
.get_rel_page_at_lsn_batched(
|
||||
pages.iter().map(|(tag, blknum)| (tag, blknum)),
|
||||
pages
|
||||
.iter()
|
||||
.map(|(tag, blknum)| (tag, blknum, ctx.attached_child())),
|
||||
effective_lsn,
|
||||
io_concurrency.clone(),
|
||||
ctx,
|
||||
@@ -248,7 +251,7 @@ impl Timeline {
|
||||
/// The ordering of the returned vec corresponds to the ordering of `pages`.
|
||||
pub(crate) async fn get_rel_page_at_lsn_batched(
|
||||
&self,
|
||||
pages: impl ExactSizeIterator<Item = (&RelTag, &BlockNumber)>,
|
||||
pages: impl ExactSizeIterator<Item = (&RelTag, &BlockNumber, RequestContext)>,
|
||||
effective_lsn: Lsn,
|
||||
io_concurrency: IoConcurrency,
|
||||
ctx: &RequestContext,
|
||||
@@ -262,8 +265,11 @@ impl Timeline {
|
||||
let mut result = Vec::with_capacity(pages.len());
|
||||
let result_slots = result.spare_capacity_mut();
|
||||
|
||||
let mut keys_slots: BTreeMap<Key, smallvec::SmallVec<[usize; 1]>> = BTreeMap::default();
|
||||
for (response_slot_idx, (tag, blknum)) in pages.enumerate() {
|
||||
let mut keys_slots: BTreeMap<Key, smallvec::SmallVec<[(usize, RequestContext); 1]>> =
|
||||
BTreeMap::default();
|
||||
|
||||
let mut perf_instrument = false;
|
||||
for (response_slot_idx, (tag, blknum, req_ctx)) in pages.enumerate() {
|
||||
if tag.relnode == 0 {
|
||||
result_slots[response_slot_idx].write(Err(PageReconstructError::Other(
|
||||
RelationError::InvalidRelnode.into(),
|
||||
@@ -273,6 +279,7 @@ impl Timeline {
|
||||
continue;
|
||||
}
|
||||
|
||||
// TODO: perf span
|
||||
let nblocks = match self
|
||||
.get_rel_size(*tag, Version::Lsn(effective_lsn), ctx)
|
||||
.await
|
||||
@@ -297,8 +304,12 @@ impl Timeline {
|
||||
|
||||
let key = rel_block_to_key(*tag, *blknum);
|
||||
|
||||
if req_ctx.has_perf_span() {
|
||||
perf_instrument = true;
|
||||
}
|
||||
|
||||
let key_slots = keys_slots.entry(key).or_default();
|
||||
key_slots.push(response_slot_idx);
|
||||
key_slots.push((response_slot_idx, req_ctx));
|
||||
}
|
||||
|
||||
let keyspace = {
|
||||
@@ -314,16 +325,36 @@ impl Timeline {
|
||||
acc.to_keyspace()
|
||||
};
|
||||
|
||||
match self
|
||||
.get_vectored(keyspace, effective_lsn, io_concurrency, ctx)
|
||||
.await
|
||||
{
|
||||
let get_vectored_ctx = match perf_instrument {
|
||||
true => RequestContextBuilder::from(ctx)
|
||||
.root_perf_span(|| {
|
||||
info_span!(
|
||||
target: PERF_TRACE_TARGET,
|
||||
"GET_VECTORED",
|
||||
tenant_id = %self.tenant_shard_id.tenant_id,
|
||||
timeline_id = %self.timeline_id,
|
||||
lsn = %effective_lsn,
|
||||
shard = %self.tenant_shard_id.shard_slug(),
|
||||
)
|
||||
})
|
||||
.attached_child(),
|
||||
false => ctx.attached_child(),
|
||||
};
|
||||
|
||||
let res = get_vectored_ctx
|
||||
.maybe_instrument(
|
||||
self.get_vectored(keyspace, effective_lsn, io_concurrency, &get_vectored_ctx),
|
||||
|current_perf_span| current_perf_span.clone(),
|
||||
)
|
||||
.await;
|
||||
|
||||
match res {
|
||||
Ok(results) => {
|
||||
for (key, res) in results {
|
||||
let mut key_slots = keys_slots.remove(&key).unwrap().into_iter();
|
||||
let first_slot = key_slots.next().unwrap();
|
||||
let (first_slot, first_req_ctx) = key_slots.next().unwrap();
|
||||
|
||||
for slot in key_slots {
|
||||
for (slot, req_ctx) in key_slots {
|
||||
let clone = match &res {
|
||||
Ok(buf) => Ok(buf.clone()),
|
||||
Err(err) => Err(match err {
|
||||
@@ -341,17 +372,19 @@ impl Timeline {
|
||||
};
|
||||
|
||||
result_slots[slot].write(clone);
|
||||
req_ctx.perf_follows_from(&get_vectored_ctx);
|
||||
slots_filled += 1;
|
||||
}
|
||||
|
||||
result_slots[first_slot].write(res);
|
||||
first_req_ctx.perf_follows_from(&get_vectored_ctx);
|
||||
slots_filled += 1;
|
||||
}
|
||||
}
|
||||
Err(err) => {
|
||||
// this cannot really happen because get_vectored only errors globally on invalid LSN or too large batch size
|
||||
// (We enforce the max batch size outside of this function, in the code that constructs the batch request.)
|
||||
for slot in keys_slots.values().flatten() {
|
||||
for (slot, req_ctx) in keys_slots.values().flatten() {
|
||||
// this whole `match` is a lot like `From<GetVectoredError> for PageReconstructError`
|
||||
// but without taking ownership of the GetVectoredError
|
||||
let err = match &err {
|
||||
@@ -383,6 +416,7 @@ impl Timeline {
|
||||
}
|
||||
};
|
||||
|
||||
req_ctx.perf_follows_from(&get_vectored_ctx);
|
||||
result_slots[*slot].write(err);
|
||||
}
|
||||
|
||||
|
||||
@@ -217,9 +217,10 @@ pageserver_runtime!(COMPUTE_REQUEST_RUNTIME, "compute request worker");
|
||||
pageserver_runtime!(MGMT_REQUEST_RUNTIME, "mgmt request worker");
|
||||
pageserver_runtime!(WALRECEIVER_RUNTIME, "walreceiver worker");
|
||||
pageserver_runtime!(BACKGROUND_RUNTIME, "background op worker");
|
||||
pageserver_runtime!(OTEL_RUNTIME, "open telemetry worker");
|
||||
// Bump this number when adding a new pageserver_runtime!
|
||||
// SAFETY: it's obviously correct
|
||||
const NUM_MULTIPLE_RUNTIMES: NonZeroUsize = unsafe { NonZeroUsize::new_unchecked(4) };
|
||||
const NUM_MULTIPLE_RUNTIMES: NonZeroUsize = unsafe { NonZeroUsize::new_unchecked(5) };
|
||||
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub struct PageserverTaskId(u64);
|
||||
|
||||
@@ -5718,9 +5718,10 @@ pub(crate) mod harness {
|
||||
// enable it in case the tests exercise code paths that use
|
||||
// debug_assert_current_span_has_tenant_and_timeline_id
|
||||
logging::TracingErrorLayerEnablement::EnableWithRustLogFilter,
|
||||
utils::logging::OtelEnablement::Disabled,
|
||||
logging::Output::Stdout,
|
||||
)
|
||||
.expect("Failed to init test logging")
|
||||
.expect("Failed to init test logging");
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
@@ -300,8 +300,9 @@ impl TimelineMetadata {
|
||||
|
||||
/// Returns true if anything was changed
|
||||
pub fn detach_from_ancestor(&mut self, branchpoint: &(TimelineId, Lsn)) {
|
||||
// Detaching from ancestor now doesn't always detach directly to the direct ancestor, but we
|
||||
// ensure the LSN is the same. So we don't check the timeline ID.
|
||||
if let Some(ancestor) = self.body.ancestor_timeline {
|
||||
assert_eq!(ancestor, branchpoint.0);
|
||||
}
|
||||
if self.body.ancestor_lsn != Lsn(0) {
|
||||
assert_eq!(self.body.ancestor_lsn, branchpoint.1);
|
||||
}
|
||||
|
||||
@@ -1914,7 +1914,6 @@ impl TenantManager {
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
prepared: PreparedTimelineDetach,
|
||||
behavior: detach_ancestor::DetachBehavior,
|
||||
mut attempt: detach_ancestor::Attempt,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<HashSet<TimelineId>, detach_ancestor::Error> {
|
||||
@@ -1958,14 +1957,7 @@ impl TenantManager {
|
||||
.map_err(Error::NotFound)?;
|
||||
|
||||
let resp = timeline
|
||||
.detach_from_ancestor_and_reparent(
|
||||
&tenant,
|
||||
prepared,
|
||||
attempt.ancestor_timeline_id,
|
||||
attempt.ancestor_lsn,
|
||||
behavior,
|
||||
ctx,
|
||||
)
|
||||
.detach_from_ancestor_and_reparent(&tenant, prepared, ctx)
|
||||
.await?;
|
||||
|
||||
let mut slot_guard = slot_guard;
|
||||
|
||||
@@ -954,14 +954,6 @@ impl RemoteTimelineClient {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Only used in the `patch_index_part` HTTP API to force trigger an index upload.
|
||||
pub fn force_schedule_index_upload(self: &Arc<Self>) -> Result<(), NotInitialized> {
|
||||
let mut guard = self.upload_queue.lock().unwrap();
|
||||
let upload_queue = guard.initialized_mut()?;
|
||||
self.schedule_index_upload(upload_queue);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Launch an index-file upload operation in the background (internal function)
|
||||
fn schedule_index_upload(self: &Arc<Self>, upload_queue: &mut UploadQueueInitialized) {
|
||||
let disk_consistent_lsn = upload_queue.dirty.metadata.disk_consistent_lsn();
|
||||
|
||||
@@ -869,7 +869,8 @@ impl<'a> TenantDownloader<'a> {
|
||||
let heatmap_timeline = heatmap.timelines.get(heatmap_timeline_index).unwrap();
|
||||
|
||||
let layers_in_heatmap = heatmap_timeline
|
||||
.hot_layers()
|
||||
.layers
|
||||
.iter()
|
||||
.map(|l| (&l.name, l.metadata.generation))
|
||||
.collect::<HashSet<_>>();
|
||||
let layers_on_disk = timeline_state
|
||||
@@ -1014,8 +1015,7 @@ impl<'a> TenantDownloader<'a> {
|
||||
// Accumulate updates to the state
|
||||
let mut touched = Vec::new();
|
||||
|
||||
let timeline_id = timeline.timeline_id;
|
||||
for layer in timeline.into_hot_layers() {
|
||||
for layer in timeline.layers {
|
||||
if self.secondary_state.cancel.is_cancelled() {
|
||||
tracing::debug!("Cancelled -- dropping out of layer loop");
|
||||
return (Err(UpdateError::Cancelled), touched);
|
||||
@@ -1040,7 +1040,7 @@ impl<'a> TenantDownloader<'a> {
|
||||
}
|
||||
|
||||
match self
|
||||
.download_layer(tenant_shard_id, &timeline_id, layer, ctx)
|
||||
.download_layer(tenant_shard_id, &timeline.timeline_id, layer, ctx)
|
||||
.await
|
||||
{
|
||||
Ok(Some(layer)) => touched.push(layer),
|
||||
@@ -1148,7 +1148,7 @@ impl<'a> TenantDownloader<'a> {
|
||||
let tenant_shard_id = self.secondary_state.get_tenant_shard_id();
|
||||
let timeline_id = timeline.timeline_id;
|
||||
|
||||
tracing::debug!(timeline_id=%timeline_id, "Downloading layers, {} in heatmap", timeline.hot_layers().count());
|
||||
tracing::debug!(timeline_id=%timeline_id, "Downloading layers, {} in heatmap", timeline.layers.len());
|
||||
|
||||
let (result, touched) = self
|
||||
.download_timeline_layers(tenant_shard_id, timeline, timeline_state, deadline, ctx)
|
||||
@@ -1316,11 +1316,11 @@ async fn init_timeline_state(
|
||||
// As we iterate through layers found on disk, we will look up their metadata from this map.
|
||||
// Layers not present in metadata will be discarded.
|
||||
let heatmap_metadata: HashMap<&LayerName, &HeatMapLayer> =
|
||||
heatmap.hot_layers().map(|l| (&l.name, l)).collect();
|
||||
heatmap.layers.iter().map(|l| (&l.name, l)).collect();
|
||||
|
||||
let last_heatmap_metadata: HashMap<&LayerName, &HeatMapLayer> =
|
||||
if let Some(last_heatmap) = last_heatmap {
|
||||
last_heatmap.hot_layers().map(|l| (&l.name, l)).collect()
|
||||
last_heatmap.layers.iter().map(|l| (&l.name, l)).collect()
|
||||
} else {
|
||||
HashMap::new()
|
||||
};
|
||||
|
||||
@@ -42,7 +42,7 @@ pub(crate) struct HeatMapTimeline {
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub(crate) timeline_id: TimelineId,
|
||||
|
||||
layers: Vec<HeatMapLayer>,
|
||||
pub(crate) layers: Vec<HeatMapLayer>,
|
||||
}
|
||||
|
||||
#[serde_as]
|
||||
@@ -53,10 +53,8 @@ pub(crate) struct HeatMapLayer {
|
||||
|
||||
#[serde_as(as = "TimestampSeconds<i64>")]
|
||||
pub(crate) access_time: SystemTime,
|
||||
|
||||
#[serde(default)]
|
||||
pub(crate) cold: bool, // TODO: an actual 'heat' score that would let secondary locations prioritize downloading
|
||||
// the hottest layers, rather than trying to simply mirror whatever layers are on-disk on the primary.
|
||||
// TODO: an actual 'heat' score that would let secondary locations prioritize downloading
|
||||
// the hottest layers, rather than trying to simply mirror whatever layers are on-disk on the primary.
|
||||
}
|
||||
|
||||
impl HeatMapLayer {
|
||||
@@ -64,13 +62,11 @@ impl HeatMapLayer {
|
||||
name: LayerName,
|
||||
metadata: LayerFileMetadata,
|
||||
access_time: SystemTime,
|
||||
cold: bool,
|
||||
) -> Self {
|
||||
Self {
|
||||
name,
|
||||
metadata,
|
||||
access_time,
|
||||
cold,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -82,18 +78,6 @@ impl HeatMapTimeline {
|
||||
layers,
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn into_hot_layers(self) -> impl Iterator<Item = HeatMapLayer> {
|
||||
self.layers.into_iter().filter(|l| !l.cold)
|
||||
}
|
||||
|
||||
pub(crate) fn hot_layers(&self) -> impl Iterator<Item = &HeatMapLayer> {
|
||||
self.layers.iter().filter(|l| !l.cold)
|
||||
}
|
||||
|
||||
pub(crate) fn all_layers(&self) -> impl Iterator<Item = &HeatMapLayer> {
|
||||
self.layers.iter()
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) struct HeatMapStats {
|
||||
@@ -108,7 +92,7 @@ impl HeatMapTenant {
|
||||
layers: 0,
|
||||
};
|
||||
for timeline in &self.timelines {
|
||||
for layer in timeline.hot_layers() {
|
||||
for layer in &timeline.layers {
|
||||
stats.layers += 1;
|
||||
stats.bytes += layer.metadata.file_size;
|
||||
}
|
||||
|
||||
@@ -13,7 +13,6 @@ pub mod merge_iterator;
|
||||
use std::cmp::Ordering;
|
||||
use std::collections::hash_map::Entry;
|
||||
use std::collections::{BinaryHeap, HashMap};
|
||||
use std::future::Future;
|
||||
use std::ops::Range;
|
||||
use std::pin::Pin;
|
||||
use std::sync::Arc;
|
||||
@@ -34,7 +33,8 @@ use pageserver_api::key::Key;
|
||||
use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum};
|
||||
use pageserver_api::record::NeonWalRecord;
|
||||
use pageserver_api::value::Value;
|
||||
use tracing::{Instrument, trace};
|
||||
use tracing::{Instrument, info_span, trace};
|
||||
use utils::logging::PERF_TRACE_TARGET;
|
||||
use utils::lsn::Lsn;
|
||||
use utils::sync::gate::GateGuard;
|
||||
|
||||
@@ -43,7 +43,7 @@ use super::PageReconstructError;
|
||||
use super::layer_map::InMemoryLayerDesc;
|
||||
use super::timeline::{GetVectoredError, ReadPath};
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::{AccessStatsBehavior, RequestContext};
|
||||
use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder};
|
||||
|
||||
pub fn range_overlaps<T>(a: &Range<T>, b: &Range<T>) -> bool
|
||||
where
|
||||
@@ -874,13 +874,51 @@ impl ReadableLayer {
|
||||
) -> Result<(), GetVectoredError> {
|
||||
match self {
|
||||
ReadableLayer::PersistentLayer(layer) => {
|
||||
layer
|
||||
.get_values_reconstruct_data(keyspace, lsn_range, reconstruct_state, ctx)
|
||||
let persistent_context = RequestContextBuilder::from(ctx)
|
||||
.perf_span(|crnt_perf_span| {
|
||||
info_span!(
|
||||
target: PERF_TRACE_TARGET,
|
||||
parent: crnt_perf_span,
|
||||
"PLAN_LAYER",
|
||||
layer = %layer
|
||||
)
|
||||
})
|
||||
.attached_child();
|
||||
|
||||
persistent_context
|
||||
.maybe_instrument(
|
||||
layer.get_values_reconstruct_data(
|
||||
keyspace,
|
||||
lsn_range,
|
||||
reconstruct_state,
|
||||
&persistent_context,
|
||||
),
|
||||
|crnt_perf_span| crnt_perf_span.clone(),
|
||||
)
|
||||
.await
|
||||
}
|
||||
ReadableLayer::InMemoryLayer(layer) => {
|
||||
layer
|
||||
.get_values_reconstruct_data(keyspace, lsn_range, reconstruct_state, ctx)
|
||||
let in_mem_context = RequestContextBuilder::from(ctx)
|
||||
.perf_span(|crnt_perf_span| {
|
||||
info_span!(
|
||||
target: PERF_TRACE_TARGET,
|
||||
parent: crnt_perf_span,
|
||||
"PLAN_LAYER",
|
||||
layer = %layer
|
||||
)
|
||||
})
|
||||
.attached_child();
|
||||
|
||||
in_mem_context
|
||||
.maybe_instrument(
|
||||
layer.get_values_reconstruct_data(
|
||||
keyspace,
|
||||
lsn_range,
|
||||
reconstruct_state,
|
||||
&in_mem_context,
|
||||
),
|
||||
|crnt_perf_span| crnt_perf_span.clone(),
|
||||
)
|
||||
.await
|
||||
}
|
||||
}
|
||||
|
||||
@@ -896,9 +896,9 @@ impl DeltaLayerInner {
|
||||
where
|
||||
Reader: BlockReader + Clone,
|
||||
{
|
||||
let ctx = RequestContextBuilder::extend(ctx)
|
||||
let ctx = RequestContextBuilder::from(ctx)
|
||||
.page_content_kind(PageContentKind::DeltaLayerBtreeNode)
|
||||
.build();
|
||||
.attached_child();
|
||||
|
||||
for range in keyspace.ranges.iter() {
|
||||
let mut range_end_handled = false;
|
||||
@@ -1105,9 +1105,9 @@ impl DeltaLayerInner {
|
||||
all_keys.push(entry);
|
||||
true
|
||||
},
|
||||
&RequestContextBuilder::extend(ctx)
|
||||
&RequestContextBuilder::from(ctx)
|
||||
.page_content_kind(PageContentKind::DeltaLayerBtreeNode)
|
||||
.build(),
|
||||
.attached_child(),
|
||||
)
|
||||
.await?;
|
||||
if let Some(last) = all_keys.last_mut() {
|
||||
|
||||
@@ -481,9 +481,9 @@ impl ImageLayerInner {
|
||||
let tree_reader =
|
||||
DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, block_reader);
|
||||
|
||||
let ctx = RequestContextBuilder::extend(ctx)
|
||||
let ctx = RequestContextBuilder::from(ctx)
|
||||
.page_content_kind(PageContentKind::ImageLayerBtreeNode)
|
||||
.build();
|
||||
.attached_child();
|
||||
|
||||
for range in keyspace.ranges.iter() {
|
||||
let mut range_end_handled = false;
|
||||
|
||||
@@ -420,9 +420,9 @@ impl InMemoryLayer {
|
||||
reconstruct_state: &mut ValuesReconstructState,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), GetVectoredError> {
|
||||
let ctx = RequestContextBuilder::extend(ctx)
|
||||
let ctx = RequestContextBuilder::from(ctx)
|
||||
.page_content_kind(PageContentKind::InMemoryLayer)
|
||||
.build();
|
||||
.attached_child();
|
||||
|
||||
let inner = self.inner.read().await;
|
||||
|
||||
|
||||
@@ -8,9 +8,10 @@ use camino::{Utf8Path, Utf8PathBuf};
|
||||
use pageserver_api::keyspace::KeySpace;
|
||||
use pageserver_api::models::HistoricLayerInfo;
|
||||
use pageserver_api::shard::{ShardIdentity, ShardIndex, TenantShardId};
|
||||
use tracing::Instrument;
|
||||
use tracing::{Instrument, info_span};
|
||||
use utils::generation::Generation;
|
||||
use utils::id::TimelineId;
|
||||
use utils::logging::PERF_TRACE_TARGET;
|
||||
use utils::lsn::Lsn;
|
||||
use utils::sync::{gate, heavier_once_cell};
|
||||
|
||||
@@ -324,16 +325,29 @@ impl Layer {
|
||||
reconstruct_data: &mut ValuesReconstructState,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), GetVectoredError> {
|
||||
let downloaded =
|
||||
self.0
|
||||
.get_or_maybe_download(true, ctx)
|
||||
.await
|
||||
.map_err(|err| match err {
|
||||
DownloadError::TimelineShutdown | DownloadError::DownloadCancelled => {
|
||||
GetVectoredError::Cancelled
|
||||
}
|
||||
other => GetVectoredError::Other(anyhow::anyhow!(other)),
|
||||
})?;
|
||||
let get_layer_context = RequestContextBuilder::from(ctx)
|
||||
.perf_span(|crnt_perf_span| {
|
||||
info_span!(
|
||||
target: PERF_TRACE_TARGET,
|
||||
parent: crnt_perf_span,
|
||||
"GET_LAYER",
|
||||
)
|
||||
})
|
||||
.attached_child();
|
||||
|
||||
let downloaded = get_layer_context
|
||||
.maybe_instrument(
|
||||
self.0.get_or_maybe_download(true, &get_layer_context),
|
||||
|crnt_perf_context| crnt_perf_context.clone(),
|
||||
)
|
||||
.await
|
||||
.map_err(|err| match err {
|
||||
DownloadError::TimelineShutdown | DownloadError::DownloadCancelled => {
|
||||
GetVectoredError::Cancelled
|
||||
}
|
||||
other => GetVectoredError::Other(anyhow::anyhow!(other)),
|
||||
})?;
|
||||
|
||||
let this = ResidentLayer {
|
||||
downloaded: downloaded.clone(),
|
||||
owner: self.clone(),
|
||||
@@ -341,9 +355,29 @@ impl Layer {
|
||||
|
||||
self.record_access(ctx);
|
||||
|
||||
downloaded
|
||||
.get_values_reconstruct_data(this, keyspace, lsn_range, reconstruct_data, ctx)
|
||||
.instrument(tracing::debug_span!("get_values_reconstruct_data", layer=%self))
|
||||
let visit_layer_context = RequestContextBuilder::from(ctx)
|
||||
.perf_span(|crnt_perf_span| {
|
||||
info_span!(
|
||||
target: PERF_TRACE_TARGET,
|
||||
parent: crnt_perf_span,
|
||||
"VISIT_LAYER",
|
||||
)
|
||||
})
|
||||
.attached_child();
|
||||
|
||||
visit_layer_context
|
||||
.maybe_instrument(
|
||||
downloaded
|
||||
.get_values_reconstruct_data(
|
||||
this,
|
||||
keyspace,
|
||||
lsn_range,
|
||||
reconstruct_data,
|
||||
&visit_layer_context,
|
||||
)
|
||||
.instrument(tracing::debug_span!("get_values_reconstruct_data", layer=%self)),
|
||||
|crnt_perf_span| crnt_perf_span.clone(),
|
||||
)
|
||||
.await
|
||||
.map_err(|err| match err {
|
||||
GetVectoredError::Other(err) => GetVectoredError::Other(
|
||||
@@ -1045,15 +1079,36 @@ impl LayerInner {
|
||||
return Err(DownloadError::DownloadRequired);
|
||||
}
|
||||
|
||||
let download_ctx = ctx.detached_child(TaskKind::LayerDownload, DownloadBehavior::Download);
|
||||
let download_ctx = if ctx.has_perf_span() {
|
||||
let dl_ctx = RequestContextBuilder::from(ctx)
|
||||
.task_kind(TaskKind::LayerDownload)
|
||||
.download_behavior(DownloadBehavior::Download)
|
||||
.root_perf_span(|| {
|
||||
info_span!(
|
||||
target: PERF_TRACE_TARGET,
|
||||
"DOWNLOAD_LAYER",
|
||||
layer = %self,
|
||||
reason = %reason
|
||||
)
|
||||
})
|
||||
.detached_child();
|
||||
ctx.perf_follows_from(&dl_ctx);
|
||||
dl_ctx
|
||||
} else {
|
||||
ctx.attached_child()
|
||||
};
|
||||
|
||||
async move {
|
||||
tracing::info!(%reason, "downloading on-demand");
|
||||
|
||||
let init_cancelled = scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled());
|
||||
let res = self
|
||||
.download_init_and_wait(timeline, permit, download_ctx)
|
||||
let res = download_ctx
|
||||
.maybe_instrument(
|
||||
self.download_init_and_wait(timeline, permit, download_ctx.attached_child()),
|
||||
|crnt_perf_span| crnt_perf_span.clone(),
|
||||
)
|
||||
.await?;
|
||||
|
||||
scopeguard::ScopeGuard::into_inner(init_cancelled);
|
||||
Ok(res)
|
||||
}
|
||||
@@ -1563,10 +1618,10 @@ impl LayerInner {
|
||||
|
||||
self.access_stats.record_residence_event();
|
||||
|
||||
*self.last_evicted_at.lock().unwrap() = Some(std::time::Instant::now());
|
||||
|
||||
self.status.as_ref().unwrap().send_replace(Status::Evicted);
|
||||
|
||||
*self.last_evicted_at.lock().unwrap() = Some(std::time::Instant::now());
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -1720,9 +1775,9 @@ impl DownloadedLayer {
|
||||
);
|
||||
|
||||
let res = if owner.desc.is_delta {
|
||||
let ctx = RequestContextBuilder::extend(ctx)
|
||||
let ctx = RequestContextBuilder::from(ctx)
|
||||
.page_content_kind(crate::context::PageContentKind::DeltaLayerSummary)
|
||||
.build();
|
||||
.attached_child();
|
||||
let summary = Some(delta_layer::Summary::expected(
|
||||
owner.desc.tenant_shard_id.tenant_id,
|
||||
owner.desc.timeline_id,
|
||||
@@ -1738,9 +1793,9 @@ impl DownloadedLayer {
|
||||
.await
|
||||
.map(LayerKind::Delta)
|
||||
} else {
|
||||
let ctx = RequestContextBuilder::extend(ctx)
|
||||
let ctx = RequestContextBuilder::from(ctx)
|
||||
.page_content_kind(crate::context::PageContentKind::ImageLayerSummary)
|
||||
.build();
|
||||
.attached_child();
|
||||
let lsn = owner.desc.image_layer_lsn();
|
||||
let summary = Some(image_layer::Summary::expected(
|
||||
owner.desc.tenant_shard_id.tenant_id,
|
||||
|
||||
@@ -644,9 +644,9 @@ async fn cancelled_get_or_maybe_download_does_not_cancel_eviction() {
|
||||
.unwrap();
|
||||
|
||||
// This test does downloads
|
||||
let ctx = RequestContextBuilder::extend(&ctx)
|
||||
let ctx = RequestContextBuilder::from(&ctx)
|
||||
.download_behavior(DownloadBehavior::Download)
|
||||
.build();
|
||||
.attached_child();
|
||||
|
||||
let layer = {
|
||||
let mut layers = {
|
||||
@@ -729,9 +729,9 @@ async fn evict_and_wait_does_not_wait_for_download() {
|
||||
.unwrap();
|
||||
|
||||
// This test does downloads
|
||||
let ctx = RequestContextBuilder::extend(&ctx)
|
||||
let ctx = RequestContextBuilder::from(&ctx)
|
||||
.download_behavior(DownloadBehavior::Download)
|
||||
.build();
|
||||
.attached_child();
|
||||
|
||||
let layer = {
|
||||
let mut layers = {
|
||||
|
||||
@@ -67,6 +67,7 @@ use tracing::*;
|
||||
use utils::generation::Generation;
|
||||
use utils::guard_arc_swap::GuardArcSwap;
|
||||
use utils::id::TimelineId;
|
||||
use utils::logging::PERF_TRACE_TARGET;
|
||||
use utils::lsn::{AtomicLsn, Lsn, RecordLsn};
|
||||
use utils::postgres_client::PostgresClientProtocol;
|
||||
use utils::rate_limit::RateLimit;
|
||||
@@ -94,7 +95,7 @@ use super::{
|
||||
};
|
||||
use crate::aux_file::AuxFileSizeEstimator;
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::{DownloadBehavior, RequestContext};
|
||||
use crate::context::{DownloadBehavior, RequestContext, RequestContextBuilder};
|
||||
use crate::disk_usage_eviction_task::{DiskUsageEvictionInfo, EvictionCandidate, finite_f32};
|
||||
use crate::keyspace::{KeyPartitioning, KeySpace};
|
||||
use crate::l0_flush::{self, L0FlushGlobalState};
|
||||
@@ -1274,9 +1275,28 @@ impl Timeline {
|
||||
};
|
||||
reconstruct_state.read_path = read_path;
|
||||
|
||||
let traversal_res: Result<(), _> = self
|
||||
.get_vectored_reconstruct_data(keyspace.clone(), lsn, reconstruct_state, ctx)
|
||||
let plan_context = RequestContextBuilder::from(ctx)
|
||||
.perf_span(|crnt_perf_span| {
|
||||
info_span!(
|
||||
target: PERF_TRACE_TARGET,
|
||||
parent: crnt_perf_span,
|
||||
"PLAN_IO",
|
||||
)
|
||||
})
|
||||
.attached_child();
|
||||
|
||||
let traversal_res: Result<(), _> = plan_context
|
||||
.maybe_instrument(
|
||||
self.get_vectored_reconstruct_data(
|
||||
keyspace.clone(),
|
||||
lsn,
|
||||
reconstruct_state,
|
||||
&plan_context,
|
||||
),
|
||||
|crnt_perf_span| crnt_perf_span.clone(),
|
||||
)
|
||||
.await;
|
||||
|
||||
if let Err(err) = traversal_res {
|
||||
// Wait for all the spawned IOs to complete.
|
||||
// See comments on `spawn_io` inside `storage_layer` for more details.
|
||||
@@ -1290,14 +1310,45 @@ impl Timeline {
|
||||
|
||||
let layers_visited = reconstruct_state.get_layers_visited();
|
||||
|
||||
let execute_context = RequestContextBuilder::from(ctx)
|
||||
.perf_span(|crnt_perf_span| {
|
||||
info_span!(
|
||||
target: PERF_TRACE_TARGET,
|
||||
parent: crnt_perf_span,
|
||||
"RECONSTRUCT",
|
||||
)
|
||||
})
|
||||
.attached_child();
|
||||
|
||||
let futs = FuturesUnordered::new();
|
||||
for (key, state) in std::mem::take(&mut reconstruct_state.keys) {
|
||||
futs.push({
|
||||
let walredo_self = self.myself.upgrade().expect("&self method holds the arc");
|
||||
let execute_key_context = RequestContextBuilder::from(&execute_context)
|
||||
.perf_span(|crnt_perf_span| {
|
||||
info_span!(
|
||||
target: PERF_TRACE_TARGET,
|
||||
parent: crnt_perf_span,
|
||||
"RECONSTRUCT_KEY",
|
||||
key = %key,
|
||||
)
|
||||
})
|
||||
.attached_child();
|
||||
|
||||
async move {
|
||||
assert_eq!(state.situation, ValueReconstructSituation::Complete);
|
||||
|
||||
let converted = match state.collect_pending_ios().await {
|
||||
let res = execute_key_context
|
||||
.maybe_instrument(state.collect_pending_ios(), |crnt_perf_span| {
|
||||
info_span!(
|
||||
target: PERF_TRACE_TARGET,
|
||||
parent: crnt_perf_span,
|
||||
"EXECUTE_IO",
|
||||
)
|
||||
})
|
||||
.await;
|
||||
|
||||
let converted = match res {
|
||||
Ok(ok) => ok,
|
||||
Err(err) => {
|
||||
return (key, Err(err));
|
||||
@@ -1314,16 +1365,31 @@ impl Timeline {
|
||||
"{converted:?}"
|
||||
);
|
||||
|
||||
(
|
||||
key,
|
||||
walredo_self.reconstruct_value(key, lsn, converted).await,
|
||||
)
|
||||
let walredo_deltas = converted.num_deltas();
|
||||
let walredo_res = execute_key_context
|
||||
.maybe_instrument(
|
||||
walredo_self.reconstruct_value(key, lsn, converted),
|
||||
|crnt_perf_span| {
|
||||
info_span!(
|
||||
target: PERF_TRACE_TARGET,
|
||||
parent: crnt_perf_span,
|
||||
"WALREDO",
|
||||
deltas = %walredo_deltas,
|
||||
)
|
||||
},
|
||||
)
|
||||
.await;
|
||||
|
||||
(key, walredo_res)
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
let results = futs
|
||||
.collect::<BTreeMap<Key, Result<Bytes, PageReconstructError>>>()
|
||||
let results = execute_context
|
||||
.maybe_instrument(
|
||||
futs.collect::<BTreeMap<Key, Result<Bytes, PageReconstructError>>>(),
|
||||
|crnt_perf_span| crnt_perf_span.clone(),
|
||||
)
|
||||
.await;
|
||||
|
||||
// For aux file keys (v1 or v2) the vectored read path does not return an error
|
||||
@@ -3648,7 +3714,7 @@ impl Timeline {
|
||||
let visible_non_resident = match previous_heatmap.as_deref() {
|
||||
Some(PreviousHeatmap::Active {
|
||||
heatmap, read_at, ..
|
||||
}) => Some(heatmap.all_layers().filter_map(|hl| {
|
||||
}) => Some(heatmap.layers.iter().filter_map(|hl| {
|
||||
let desc: PersistentLayerDesc = hl.name.clone().into();
|
||||
let layer = guard.try_get_from_key(&desc.key())?;
|
||||
|
||||
@@ -3664,7 +3730,7 @@ impl Timeline {
|
||||
return None;
|
||||
}
|
||||
|
||||
Some((desc, hl.metadata.clone(), hl.access_time, hl.cold))
|
||||
Some((desc, hl.metadata.clone(), hl.access_time))
|
||||
})),
|
||||
Some(PreviousHeatmap::Obsolete) => None,
|
||||
None => None,
|
||||
@@ -3680,7 +3746,6 @@ impl Timeline {
|
||||
layer.layer_desc().clone(),
|
||||
layer.metadata(),
|
||||
last_activity_ts,
|
||||
false, // these layers are not cold
|
||||
))
|
||||
}
|
||||
LayerVisibilityHint::Covered => {
|
||||
@@ -3707,14 +3772,12 @@ impl Timeline {
|
||||
// Sort layers in order of which to download first. For a large set of layers to download, we
|
||||
// want to prioritize those layers which are most likely to still be in the resident many minutes
|
||||
// or hours later:
|
||||
// - Cold layers go last for convenience when a human inspects the heatmap.
|
||||
// - Download L0s last, because they churn the fastest: L0s on a fast-writing tenant might
|
||||
// only exist for a few minutes before being compacted into L1s.
|
||||
// - For L1 & image layers, download most recent LSNs first: the older the LSN, the sooner
|
||||
// the layer is likely to be covered by an image layer during compaction.
|
||||
layers.sort_by_key(|(desc, _meta, _atime, cold)| {
|
||||
layers.sort_by_key(|(desc, _meta, _atime)| {
|
||||
std::cmp::Reverse((
|
||||
*cold,
|
||||
!LayerMap::is_l0(&desc.key_range, desc.is_delta),
|
||||
desc.lsn_range.end,
|
||||
))
|
||||
@@ -3722,9 +3785,7 @@ impl Timeline {
|
||||
|
||||
let layers = layers
|
||||
.into_iter()
|
||||
.map(|(desc, meta, atime, cold)| {
|
||||
HeatMapLayer::new(desc.layer_name(), meta, atime, cold)
|
||||
})
|
||||
.map(|(desc, meta, atime)| HeatMapLayer::new(desc.layer_name(), meta, atime))
|
||||
.collect();
|
||||
|
||||
Some(HeatMapTimeline::new(self.timeline_id, layers))
|
||||
@@ -3744,7 +3805,6 @@ impl Timeline {
|
||||
name: vl.layer_desc().layer_name(),
|
||||
metadata: vl.metadata(),
|
||||
access_time: now,
|
||||
cold: true,
|
||||
};
|
||||
heatmap_layers.push(hl);
|
||||
}
|
||||
@@ -3801,18 +3861,34 @@ impl Timeline {
|
||||
return Err(GetVectoredError::Cancelled);
|
||||
}
|
||||
|
||||
let plan_context = RequestContextBuilder::from(ctx)
|
||||
.perf_span(|crnt_perf_span| {
|
||||
info_span!(
|
||||
target: PERF_TRACE_TARGET,
|
||||
parent: crnt_perf_span,
|
||||
"PLAN_IO_TIMELINE",
|
||||
timeline = %timeline.timeline_id,
|
||||
lsn = %cont_lsn,
|
||||
)
|
||||
})
|
||||
.attached_child();
|
||||
|
||||
let TimelineVisitOutcome {
|
||||
completed_keyspace: completed,
|
||||
image_covered_keyspace,
|
||||
} = Self::get_vectored_reconstruct_data_timeline(
|
||||
timeline,
|
||||
keyspace.clone(),
|
||||
cont_lsn,
|
||||
reconstruct_state,
|
||||
&self.cancel,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
} = plan_context
|
||||
.maybe_instrument(
|
||||
Self::get_vectored_reconstruct_data_timeline(
|
||||
timeline,
|
||||
keyspace.clone(),
|
||||
cont_lsn,
|
||||
reconstruct_state,
|
||||
&self.cancel,
|
||||
&plan_context,
|
||||
),
|
||||
|crnt_perf_span| crnt_perf_span.clone(),
|
||||
)
|
||||
.await?;
|
||||
|
||||
keyspace.remove_overlapping_with(&completed);
|
||||
|
||||
@@ -3856,8 +3932,26 @@ impl Timeline {
|
||||
|
||||
// Take the min to avoid reconstructing a page with data newer than request Lsn.
|
||||
cont_lsn = std::cmp::min(Lsn(request_lsn.0 + 1), Lsn(timeline.ancestor_lsn.0 + 1));
|
||||
timeline_owned = timeline
|
||||
.get_ready_ancestor_timeline(ancestor_timeline, ctx)
|
||||
|
||||
let get_ancestor_context = RequestContextBuilder::from(ctx)
|
||||
.perf_span(|crnt_perf_span| {
|
||||
info_span!(
|
||||
target: PERF_TRACE_TARGET,
|
||||
parent: crnt_perf_span,
|
||||
"GET_ANCESTOR",
|
||||
timeline = %timeline.timeline_id,
|
||||
lsn = %cont_lsn,
|
||||
ancestor = %ancestor_timeline.timeline_id,
|
||||
ancestor_lsn = %timeline.ancestor_lsn
|
||||
)
|
||||
})
|
||||
.attached_child();
|
||||
|
||||
timeline_owned = get_ancestor_context
|
||||
.maybe_instrument(
|
||||
timeline.get_ready_ancestor_timeline(ancestor_timeline, &get_ancestor_context),
|
||||
|crnt_perf_span| crnt_perf_span.clone(),
|
||||
)
|
||||
.await?;
|
||||
timeline = &*timeline_owned;
|
||||
};
|
||||
@@ -5388,10 +5482,9 @@ impl Timeline {
|
||||
self: &Arc<Timeline>,
|
||||
tenant: &crate::tenant::Tenant,
|
||||
options: detach_ancestor::Options,
|
||||
behavior: detach_ancestor::DetachBehavior,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<detach_ancestor::Progress, detach_ancestor::Error> {
|
||||
detach_ancestor::prepare(self, tenant, behavior, options, ctx).await
|
||||
detach_ancestor::prepare(self, tenant, options, ctx).await
|
||||
}
|
||||
|
||||
/// Second step of detach from ancestor; detaches the `self` from it's current ancestor and
|
||||
@@ -5407,21 +5500,9 @@ impl Timeline {
|
||||
self: &Arc<Timeline>,
|
||||
tenant: &crate::tenant::Tenant,
|
||||
prepared: detach_ancestor::PreparedTimelineDetach,
|
||||
ancestor_timeline_id: TimelineId,
|
||||
ancestor_lsn: Lsn,
|
||||
behavior: detach_ancestor::DetachBehavior,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<detach_ancestor::DetachingAndReparenting, detach_ancestor::Error> {
|
||||
detach_ancestor::detach_and_reparent(
|
||||
self,
|
||||
tenant,
|
||||
prepared,
|
||||
ancestor_timeline_id,
|
||||
ancestor_lsn,
|
||||
behavior,
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
detach_ancestor::detach_and_reparent(self, tenant, prepared, ctx).await
|
||||
}
|
||||
|
||||
/// Final step which unblocks the GC.
|
||||
@@ -7059,7 +7140,6 @@ mod tests {
|
||||
|
||||
use pageserver_api::key::Key;
|
||||
use pageserver_api::value::Value;
|
||||
use std::iter::Iterator;
|
||||
use tracing::Instrument;
|
||||
use utils::id::TimelineId;
|
||||
use utils::lsn::Lsn;
|
||||
@@ -7073,8 +7153,8 @@ mod tests {
|
||||
use crate::tenant::{PreviousHeatmap, Timeline};
|
||||
|
||||
fn assert_heatmaps_have_same_layers(lhs: &HeatMapTimeline, rhs: &HeatMapTimeline) {
|
||||
assert_eq!(lhs.all_layers().count(), rhs.all_layers().count());
|
||||
let lhs_rhs = lhs.all_layers().zip(rhs.all_layers());
|
||||
assert_eq!(lhs.layers.len(), rhs.layers.len());
|
||||
let lhs_rhs = lhs.layers.iter().zip(rhs.layers.iter());
|
||||
for (l, r) in lhs_rhs {
|
||||
assert_eq!(l.name, r.name);
|
||||
assert_eq!(l.metadata, r.metadata);
|
||||
@@ -7152,11 +7232,10 @@ mod tests {
|
||||
assert_eq!(heatmap.timeline_id, timeline.timeline_id);
|
||||
|
||||
// L0 should come last
|
||||
let heatmap_layers = heatmap.all_layers().collect::<Vec<_>>();
|
||||
assert_eq!(heatmap_layers.last().unwrap().name, l0_delta.layer_name());
|
||||
assert_eq!(heatmap.layers.last().unwrap().name, l0_delta.layer_name());
|
||||
|
||||
let mut last_lsn = Lsn::MAX;
|
||||
for layer in heatmap_layers {
|
||||
for layer in &heatmap.layers {
|
||||
// Covered layer should be omitted
|
||||
assert!(layer.name != covered_delta.layer_name());
|
||||
|
||||
@@ -7205,9 +7284,9 @@ mod tests {
|
||||
|
||||
eprintln!("Downloading {layer} and re-generating heatmap");
|
||||
|
||||
let ctx = &RequestContextBuilder::extend(&ctx)
|
||||
let ctx = &RequestContextBuilder::from(&ctx)
|
||||
.download_behavior(crate::context::DownloadBehavior::Download)
|
||||
.build();
|
||||
.attached_child();
|
||||
|
||||
let _resident = layer
|
||||
.download_and_keep_resident(ctx)
|
||||
@@ -7285,7 +7364,7 @@ mod tests {
|
||||
.expect("Infallible while timeline is not shut down");
|
||||
|
||||
// Both layers should be in the heatmap
|
||||
assert!(heatmap.all_layers().count() > 0);
|
||||
assert!(!heatmap.layers.is_empty());
|
||||
|
||||
// Now simulate a migration.
|
||||
timeline
|
||||
@@ -7311,7 +7390,7 @@ mod tests {
|
||||
.await
|
||||
.expect("Infallible while timeline is not shut down");
|
||||
|
||||
assert_eq!(post_eviction_heatmap.all_layers().count(), 0);
|
||||
assert!(post_eviction_heatmap.layers.is_empty());
|
||||
assert!(matches!(
|
||||
timeline.previous_heatmap.load().as_deref(),
|
||||
Some(PreviousHeatmap::Obsolete)
|
||||
|
||||
@@ -7,7 +7,6 @@
|
||||
use std::collections::{BinaryHeap, HashMap, HashSet, VecDeque};
|
||||
use std::ops::{Deref, Range};
|
||||
use std::sync::Arc;
|
||||
use std::time::Instant;
|
||||
|
||||
use super::layer_manager::LayerManager;
|
||||
use super::{
|
||||
@@ -20,7 +19,6 @@ use anyhow::{Context, anyhow};
|
||||
use bytes::Bytes;
|
||||
use enumset::EnumSet;
|
||||
use fail::fail_point;
|
||||
use futures::FutureExt;
|
||||
use itertools::Itertools;
|
||||
use once_cell::sync::Lazy;
|
||||
use pageserver_api::config::tenant_conf_defaults::DEFAULT_CHECKPOINT_DISTANCE;
|
||||
@@ -444,7 +442,6 @@ impl GcCompactionQueue {
|
||||
));
|
||||
};
|
||||
let has_pending_tasks;
|
||||
let mut yield_for_l0 = false;
|
||||
let Some((id, item)) = ({
|
||||
let mut guard = self.inner.lock().unwrap();
|
||||
if let Some((id, item)) = guard.queued.pop_front() {
|
||||
@@ -494,23 +491,13 @@ impl GcCompactionQueue {
|
||||
let mut guard = self.inner.lock().unwrap();
|
||||
guard.guards.entry(id).or_default().gc_guard = Some(gc_guard);
|
||||
}
|
||||
let compaction_result =
|
||||
timeline.compact_with_options(cancel, options, ctx).await?;
|
||||
let _ = timeline.compact_with_options(cancel, options, ctx).await?;
|
||||
self.notify_and_unblock(id);
|
||||
if compaction_result == CompactionOutcome::YieldForL0 {
|
||||
yield_for_l0 = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
GcCompactionQueueItem::SubCompactionJob(options) => {
|
||||
// TODO: error handling, clear the queue if any task fails?
|
||||
let compaction_result = timeline.compact_with_options(cancel, options, ctx).await?;
|
||||
if compaction_result == CompactionOutcome::YieldForL0 {
|
||||
// We will permenantly give up a task if we yield for L0 compaction: the preempted subcompaction job won't be running
|
||||
// again. This ensures that we don't keep doing duplicated work within gc-compaction. Not directly returning here because
|
||||
// we need to clean things up before returning from the function.
|
||||
yield_for_l0 = true;
|
||||
}
|
||||
let _ = timeline.compact_with_options(cancel, options, ctx).await?;
|
||||
}
|
||||
GcCompactionQueueItem::Notify(id, l2_lsn) => {
|
||||
self.notify_and_unblock(id);
|
||||
@@ -539,10 +526,7 @@ impl GcCompactionQueue {
|
||||
let mut guard = self.inner.lock().unwrap();
|
||||
guard.running = None;
|
||||
}
|
||||
Ok(if yield_for_l0 {
|
||||
tracing::info!("give up gc-compaction: yield for L0 compaction");
|
||||
CompactionOutcome::YieldForL0
|
||||
} else if has_pending_tasks {
|
||||
Ok(if has_pending_tasks {
|
||||
CompactionOutcome::Pending
|
||||
} else {
|
||||
CompactionOutcome::Done
|
||||
@@ -740,41 +724,17 @@ struct CompactionStatisticsNumSize {
|
||||
|
||||
#[derive(Debug, Serialize, Default)]
|
||||
pub struct CompactionStatistics {
|
||||
/// Delta layer visited (maybe compressed, physical size)
|
||||
delta_layer_visited: CompactionStatisticsNumSize,
|
||||
/// Image layer visited (maybe compressed, physical size)
|
||||
image_layer_visited: CompactionStatisticsNumSize,
|
||||
/// Delta layer produced (maybe compressed, physical size)
|
||||
delta_layer_produced: CompactionStatisticsNumSize,
|
||||
/// Image layer produced (maybe compressed, physical size)
|
||||
image_layer_produced: CompactionStatisticsNumSize,
|
||||
/// Delta layer discarded (maybe compressed, physical size of the layer being discarded instead of the original layer)
|
||||
delta_layer_discarded: CompactionStatisticsNumSize,
|
||||
/// Image layer discarded (maybe compressed, physical size of the layer being discarded instead of the original layer)
|
||||
image_layer_discarded: CompactionStatisticsNumSize,
|
||||
num_delta_layer_discarded: usize,
|
||||
num_image_layer_discarded: usize,
|
||||
num_unique_keys_visited: usize,
|
||||
/// Delta visited (uncompressed, original size)
|
||||
wal_keys_visited: CompactionStatisticsNumSize,
|
||||
/// Image visited (uncompressed, original size)
|
||||
image_keys_visited: CompactionStatisticsNumSize,
|
||||
/// Delta produced (uncompressed, original size)
|
||||
wal_produced: CompactionStatisticsNumSize,
|
||||
/// Image produced (uncompressed, original size)
|
||||
image_produced: CompactionStatisticsNumSize,
|
||||
|
||||
// Time spent in each phase
|
||||
time_acquire_lock_secs: f64,
|
||||
time_analyze_secs: f64,
|
||||
time_download_layer_secs: f64,
|
||||
time_main_loop_secs: f64,
|
||||
time_final_phase_secs: f64,
|
||||
time_total_secs: f64,
|
||||
|
||||
// Summary
|
||||
/// Ratio of the key-value size before/after gc-compaction.
|
||||
uncompressed_size_ratio: f64,
|
||||
/// Ratio of the physical size before/after gc-compaction.
|
||||
physical_size_ratio: f64,
|
||||
}
|
||||
|
||||
impl CompactionStatistics {
|
||||
@@ -824,13 +784,11 @@ impl CompactionStatistics {
|
||||
self.image_produced.num += 1;
|
||||
self.image_produced.size += val.len() as u64 + Self::estimated_size_of_key() as u64;
|
||||
}
|
||||
fn discard_delta_layer(&mut self, original_size: u64) {
|
||||
self.delta_layer_discarded.num += 1;
|
||||
self.delta_layer_discarded.size += original_size;
|
||||
fn discard_delta_layer(&mut self) {
|
||||
self.num_delta_layer_discarded += 1;
|
||||
}
|
||||
fn discard_image_layer(&mut self, original_size: u64) {
|
||||
self.image_layer_discarded.num += 1;
|
||||
self.image_layer_discarded.size += original_size;
|
||||
fn discard_image_layer(&mut self) {
|
||||
self.num_image_layer_discarded += 1;
|
||||
}
|
||||
fn produce_delta_layer(&mut self, size: u64) {
|
||||
self.delta_layer_produced.num += 1;
|
||||
@@ -840,19 +798,6 @@ impl CompactionStatistics {
|
||||
self.image_layer_produced.num += 1;
|
||||
self.image_layer_produced.size += size;
|
||||
}
|
||||
fn finalize(&mut self) {
|
||||
let original_key_value_size = self.image_keys_visited.size + self.wal_keys_visited.size;
|
||||
let produced_key_value_size = self.image_produced.size + self.wal_produced.size;
|
||||
self.uncompressed_size_ratio =
|
||||
original_key_value_size as f64 / (produced_key_value_size as f64 + 1.0); // avoid div by 0
|
||||
let original_physical_size = self.image_layer_visited.size + self.delta_layer_visited.size;
|
||||
let produced_physical_size = self.image_layer_produced.size
|
||||
+ self.delta_layer_produced.size
|
||||
+ self.image_layer_discarded.size
|
||||
+ self.delta_layer_discarded.size; // Also include the discarded layers to make the ratio accurate
|
||||
self.physical_size_ratio =
|
||||
original_physical_size as f64 / (produced_physical_size as f64 + 1.0); // avoid div by 0
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Default, Debug, Clone, Copy, PartialEq, Eq)]
|
||||
@@ -983,9 +928,9 @@ impl Timeline {
|
||||
{
|
||||
Ok(((dense_partitioning, sparse_partitioning), lsn)) => {
|
||||
// Disables access_stats updates, so that the files we read remain candidates for eviction after we're done with them
|
||||
let image_ctx = RequestContextBuilder::extend(ctx)
|
||||
let image_ctx = RequestContextBuilder::from(ctx)
|
||||
.access_stats_behavior(AccessStatsBehavior::Skip)
|
||||
.build();
|
||||
.attached_child();
|
||||
|
||||
let mut partitioning = dense_partitioning;
|
||||
partitioning
|
||||
@@ -2613,7 +2558,7 @@ impl Timeline {
|
||||
cancel: &CancellationToken,
|
||||
options: CompactOptions,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<CompactionOutcome, CompactionError> {
|
||||
) -> Result<(), CompactionError> {
|
||||
let sub_compaction = options.sub_compaction;
|
||||
let job = GcCompactJob::from_compact_options(options.clone());
|
||||
if sub_compaction {
|
||||
@@ -2635,7 +2580,7 @@ impl Timeline {
|
||||
if jobs_len == 0 {
|
||||
info!("no jobs to run, skipping gc bottom-most compaction");
|
||||
}
|
||||
return Ok(CompactionOutcome::Done);
|
||||
return Ok(());
|
||||
}
|
||||
self.compact_with_gc_inner(cancel, job, ctx).await
|
||||
}
|
||||
@@ -2645,14 +2590,11 @@ impl Timeline {
|
||||
cancel: &CancellationToken,
|
||||
job: GcCompactJob,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<CompactionOutcome, CompactionError> {
|
||||
) -> Result<(), CompactionError> {
|
||||
// Block other compaction/GC tasks from running for now. GC-compaction could run along
|
||||
// with legacy compaction tasks in the future. Always ensure the lock order is compaction -> gc.
|
||||
// Note that we already acquired the compaction lock when the outer `compact` function gets called.
|
||||
|
||||
let timer = Instant::now();
|
||||
let begin_timer = timer;
|
||||
|
||||
let gc_lock = async {
|
||||
tokio::select! {
|
||||
guard = self.gc_lock.lock() => Ok(guard),
|
||||
@@ -2660,9 +2602,6 @@ impl Timeline {
|
||||
}
|
||||
};
|
||||
|
||||
let time_acquire_lock = timer.elapsed();
|
||||
let timer = Instant::now();
|
||||
|
||||
let gc_lock = crate::timed(
|
||||
gc_lock,
|
||||
"acquires gc lock",
|
||||
@@ -2714,7 +2653,7 @@ impl Timeline {
|
||||
tracing::warn!(
|
||||
"no layers to compact with gc: gc_cutoff not generated yet, skipping gc bottom-most compaction"
|
||||
);
|
||||
return Ok(CompactionOutcome::Skipped);
|
||||
return Ok(());
|
||||
}
|
||||
real_gc_cutoff
|
||||
} else {
|
||||
@@ -2752,7 +2691,7 @@ impl Timeline {
|
||||
"no layers to compact with gc: no historic layers below gc_cutoff, gc_cutoff={}",
|
||||
gc_cutoff
|
||||
);
|
||||
return Ok(CompactionOutcome::Done);
|
||||
return Ok(());
|
||||
};
|
||||
// Next, if the user specifies compact_lsn_range.start, we need to filter some layers out. All the layers (strictly) below
|
||||
// the min_layer_lsn computed as below will be filtered out and the data will be accessed using the normal read path, as if
|
||||
@@ -2773,7 +2712,7 @@ impl Timeline {
|
||||
"no layers to compact with gc: no historic layers above compact_above_lsn, compact_above_lsn={}",
|
||||
compact_lsn_range.end
|
||||
);
|
||||
return Ok(CompactionOutcome::Done);
|
||||
return Ok(());
|
||||
};
|
||||
// Then, pick all the layers that are below the max_layer_lsn. This is to ensure we can pick all single-key
|
||||
// layers to compact.
|
||||
@@ -2799,7 +2738,7 @@ impl Timeline {
|
||||
"no layers to compact with gc: no layers within the key range, gc_cutoff={}, key_range={}..{}",
|
||||
gc_cutoff, compact_key_range.start, compact_key_range.end
|
||||
);
|
||||
return Ok(CompactionOutcome::Done);
|
||||
return Ok(());
|
||||
}
|
||||
retain_lsns_below_horizon.sort();
|
||||
GcCompactionJobDescription {
|
||||
@@ -2852,9 +2791,6 @@ impl Timeline {
|
||||
has_data_below,
|
||||
);
|
||||
|
||||
let time_analyze = timer.elapsed();
|
||||
let timer = Instant::now();
|
||||
|
||||
for layer in &job_desc.selected_layers {
|
||||
debug!("read layer: {}", layer.layer_desc().key());
|
||||
}
|
||||
@@ -2914,15 +2850,6 @@ impl Timeline {
|
||||
if cancel.is_cancelled() {
|
||||
return Err(CompactionError::ShuttingDown);
|
||||
}
|
||||
let should_yield = self
|
||||
.l0_compaction_trigger
|
||||
.notified()
|
||||
.now_or_never()
|
||||
.is_some();
|
||||
if should_yield {
|
||||
tracing::info!("preempt gc-compaction when downloading layers: too many L0 layers");
|
||||
return Ok(CompactionOutcome::YieldForL0);
|
||||
}
|
||||
let resident_layer = layer
|
||||
.download_and_keep_resident(ctx)
|
||||
.await
|
||||
@@ -2966,9 +2893,6 @@ impl Timeline {
|
||||
.context("failed to create filter iterator")
|
||||
.map_err(CompactionError::Other)?;
|
||||
|
||||
let time_download_layer = timer.elapsed();
|
||||
let timer = Instant::now();
|
||||
|
||||
// Step 2: Produce images+deltas.
|
||||
let mut accumulated_values = Vec::new();
|
||||
let mut last_key: Option<Key> = None;
|
||||
@@ -3043,8 +2967,6 @@ impl Timeline {
|
||||
// the key and LSN range are determined. However, to keep things simple here, we still
|
||||
// create this writer, and discard the writer in the end.
|
||||
|
||||
let mut keys_processed = 0;
|
||||
|
||||
while let Some(((key, lsn, val), desc)) = merge_iter
|
||||
.next_with_trace()
|
||||
.await
|
||||
@@ -3054,18 +2976,6 @@ impl Timeline {
|
||||
if cancel.is_cancelled() {
|
||||
return Err(CompactionError::ShuttingDown);
|
||||
}
|
||||
keys_processed += 1;
|
||||
if keys_processed % 1000 == 0 {
|
||||
let should_yield = self
|
||||
.l0_compaction_trigger
|
||||
.notified()
|
||||
.now_or_never()
|
||||
.is_some();
|
||||
if should_yield {
|
||||
tracing::info!("preempt gc-compaction in the main loop: too many L0 layers");
|
||||
return Ok(CompactionOutcome::YieldForL0);
|
||||
}
|
||||
}
|
||||
if self.shard_identity.is_key_disposable(&key) {
|
||||
// If this shard does not need to store this key, simply skip it.
|
||||
//
|
||||
@@ -3204,9 +3114,6 @@ impl Timeline {
|
||||
.map_err(CompactionError::Other)?;
|
||||
// end: move the above part to the loop body
|
||||
|
||||
let time_main_loop = timer.elapsed();
|
||||
let timer = Instant::now();
|
||||
|
||||
let mut rewrote_delta_layers = Vec::new();
|
||||
for (key, writers) in delta_layer_rewriters {
|
||||
if let Some(delta_writer_before) = writers.before {
|
||||
@@ -3271,13 +3178,6 @@ impl Timeline {
|
||||
let mut keep_layers = HashSet::new();
|
||||
let produced_delta_layers_len = produced_delta_layers.len();
|
||||
let produced_image_layers_len = produced_image_layers.len();
|
||||
|
||||
let layer_selection_by_key = job_desc
|
||||
.selected_layers
|
||||
.iter()
|
||||
.map(|l| (l.layer_desc().key(), l.layer_desc().clone()))
|
||||
.collect::<HashMap<_, _>>();
|
||||
|
||||
for action in produced_delta_layers {
|
||||
match action {
|
||||
BatchWriterResult::Produced(layer) => {
|
||||
@@ -3291,16 +3191,8 @@ impl Timeline {
|
||||
if cfg!(debug_assertions) {
|
||||
info!("discarded delta layer: {}", l);
|
||||
}
|
||||
if let Some(layer_desc) = layer_selection_by_key.get(&l) {
|
||||
stat.discard_delta_layer(layer_desc.file_size());
|
||||
} else {
|
||||
tracing::warn!(
|
||||
"discarded delta layer not in layer_selection: {}, produced a layer outside of the compaction key range?",
|
||||
l
|
||||
);
|
||||
stat.discard_delta_layer(0);
|
||||
}
|
||||
keep_layers.insert(l);
|
||||
stat.discard_delta_layer();
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -3309,9 +3201,6 @@ impl Timeline {
|
||||
"produced rewritten delta layer: {}",
|
||||
layer.layer_desc().key()
|
||||
);
|
||||
// For now, we include rewritten delta layer size in the "produce_delta_layer". We could
|
||||
// make it a separate statistics in the future.
|
||||
stat.produce_delta_layer(layer.layer_desc().file_size());
|
||||
}
|
||||
compact_to.extend(rewrote_delta_layers);
|
||||
for action in produced_image_layers {
|
||||
@@ -3323,16 +3212,8 @@ impl Timeline {
|
||||
}
|
||||
BatchWriterResult::Discarded(l) => {
|
||||
debug!("discarded image layer: {}", l);
|
||||
if let Some(layer_desc) = layer_selection_by_key.get(&l) {
|
||||
stat.discard_image_layer(layer_desc.file_size());
|
||||
} else {
|
||||
tracing::warn!(
|
||||
"discarded image layer not in layer_selection: {}, produced a layer outside of the compaction key range?",
|
||||
l
|
||||
);
|
||||
stat.discard_image_layer(0);
|
||||
}
|
||||
keep_layers.insert(l);
|
||||
stat.discard_image_layer();
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -3380,16 +3261,6 @@ impl Timeline {
|
||||
|
||||
layer_selection.retain(|x| !keep_layers.contains(&x.layer_desc().key()));
|
||||
|
||||
let time_final_phase = timer.elapsed();
|
||||
|
||||
stat.time_final_phase_secs = time_final_phase.as_secs_f64();
|
||||
stat.time_main_loop_secs = time_main_loop.as_secs_f64();
|
||||
stat.time_acquire_lock_secs = time_acquire_lock.as_secs_f64();
|
||||
stat.time_download_layer_secs = time_download_layer.as_secs_f64();
|
||||
stat.time_analyze_secs = time_analyze.as_secs_f64();
|
||||
stat.time_total_secs = begin_timer.elapsed().as_secs_f64();
|
||||
stat.finalize();
|
||||
|
||||
info!(
|
||||
"gc-compaction statistics: {}",
|
||||
serde_json::to_string(&stat)
|
||||
@@ -3398,7 +3269,7 @@ impl Timeline {
|
||||
);
|
||||
|
||||
if dry_run {
|
||||
return Ok(CompactionOutcome::Done);
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
info!(
|
||||
@@ -3517,7 +3388,7 @@ impl Timeline {
|
||||
|
||||
drop(gc_lock);
|
||||
|
||||
Ok(CompactionOutcome::Done)
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -32,9 +32,6 @@ pub(crate) enum Error {
|
||||
#[error("too many ancestors")]
|
||||
TooManyAncestors,
|
||||
|
||||
#[error("ancestor is not empty")]
|
||||
AncestorNotEmpty,
|
||||
|
||||
#[error("shutting down, please retry later")]
|
||||
ShuttingDown,
|
||||
|
||||
@@ -92,9 +89,7 @@ impl From<Error> for ApiError {
|
||||
fn from(value: Error) -> Self {
|
||||
match value {
|
||||
Error::NoAncestor => ApiError::Conflict(value.to_string()),
|
||||
Error::TooManyAncestors | Error::AncestorNotEmpty => {
|
||||
ApiError::BadRequest(anyhow::anyhow!("{value}"))
|
||||
}
|
||||
Error::TooManyAncestors => ApiError::BadRequest(anyhow::anyhow!("{value}")),
|
||||
Error::ShuttingDown => ApiError::ShuttingDown,
|
||||
Error::Archived(_) => ApiError::BadRequest(anyhow::anyhow!("{value}")),
|
||||
Error::OtherTimelineDetachOngoing(_) | Error::FailedToReparentAll => {
|
||||
@@ -132,22 +127,13 @@ pub(crate) struct PreparedTimelineDetach {
|
||||
layers: Vec<Layer>,
|
||||
}
|
||||
|
||||
// TODO: this should be part of PageserverConf because we cannot easily modify cplane arguments.
|
||||
/// TODO: this should be part of PageserverConf because we cannot easily modify cplane arguments.
|
||||
#[derive(Debug)]
|
||||
pub(crate) struct Options {
|
||||
pub(crate) rewrite_concurrency: std::num::NonZeroUsize,
|
||||
pub(crate) copy_concurrency: std::num::NonZeroUsize,
|
||||
}
|
||||
|
||||
/// Controls the detach ancestor behavior.
|
||||
/// - When set to `NoAncestorAndReparent`, we will only detach a branch if its ancestor is a root branch. It will automatically reparent any children of the ancestor before and at the branch point.
|
||||
/// - When set to `MultiLevelAndNoReparent`, we will detach a branch from multiple levels of ancestors, and no reparenting will happen at all.
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub enum DetachBehavior {
|
||||
NoAncestorAndReparent,
|
||||
MultiLevelAndNoReparent,
|
||||
}
|
||||
|
||||
impl Default for Options {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
@@ -161,8 +147,7 @@ impl Default for Options {
|
||||
#[derive(Debug)]
|
||||
pub(crate) struct Attempt {
|
||||
pub(crate) timeline_id: TimelineId,
|
||||
pub(crate) ancestor_timeline_id: TimelineId,
|
||||
pub(crate) ancestor_lsn: Lsn,
|
||||
|
||||
_guard: completion::Completion,
|
||||
gate_entered: Option<utils::sync::gate::GateGuard>,
|
||||
}
|
||||
@@ -182,30 +167,25 @@ impl Attempt {
|
||||
pub(super) async fn prepare(
|
||||
detached: &Arc<Timeline>,
|
||||
tenant: &Tenant,
|
||||
behavior: DetachBehavior,
|
||||
options: Options,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<Progress, Error> {
|
||||
use Error::*;
|
||||
|
||||
let Some((mut ancestor, mut ancestor_lsn)) = detached
|
||||
let Some((ancestor, ancestor_lsn)) = detached
|
||||
.ancestor_timeline
|
||||
.as_ref()
|
||||
.map(|tl| (tl.clone(), detached.ancestor_lsn))
|
||||
else {
|
||||
let ancestor_id;
|
||||
let ancestor_lsn;
|
||||
let still_in_progress = {
|
||||
let accessor = detached.remote_client.initialized_upload_queue()?;
|
||||
|
||||
// we are safe to inspect the latest uploaded, because we can only witness this after
|
||||
// restart is complete and ancestor is no more.
|
||||
let latest = accessor.latest_uploaded_index_part();
|
||||
let Some((id, lsn)) = latest.lineage.detached_previous_ancestor() else {
|
||||
if latest.lineage.detached_previous_ancestor().is_none() {
|
||||
return Err(NoAncestor);
|
||||
};
|
||||
ancestor_id = id;
|
||||
ancestor_lsn = lsn;
|
||||
|
||||
latest
|
||||
.gc_blocking
|
||||
@@ -216,8 +196,7 @@ pub(super) async fn prepare(
|
||||
if still_in_progress {
|
||||
// gc is still blocked, we can still reparent and complete.
|
||||
// we are safe to reparent remaining, because they were locked in in the beginning.
|
||||
let attempt =
|
||||
continue_with_blocked_gc(detached, tenant, ancestor_id, ancestor_lsn).await?;
|
||||
let attempt = continue_with_blocked_gc(detached, tenant).await?;
|
||||
|
||||
// because the ancestor of detached is already set to none, we have published all
|
||||
// of the layers, so we are still "prepared."
|
||||
@@ -245,34 +224,13 @@ pub(super) async fn prepare(
|
||||
|
||||
check_no_archived_children_of_ancestor(tenant, detached, &ancestor, ancestor_lsn)?;
|
||||
|
||||
if let DetachBehavior::MultiLevelAndNoReparent = behavior {
|
||||
// If the ancestor has an ancestor, we might be able to fast-path detach it if the current ancestor does not have any data written/used by the detaching timeline.
|
||||
while let Some(ancestor_of_ancestor) = ancestor.ancestor_timeline.clone() {
|
||||
if ancestor_lsn != ancestor.ancestor_lsn {
|
||||
// non-technical requirement; we could flatten still if ancestor LSN does not match but that needs
|
||||
// us to copy and cut more layers.
|
||||
return Err(AncestorNotEmpty);
|
||||
}
|
||||
// Use the ancestor of the ancestor as the new ancestor (only when the ancestor LSNs are the same)
|
||||
ancestor_lsn = ancestor.ancestor_lsn; // Get the LSN first before resetting the `ancestor` variable
|
||||
ancestor = ancestor_of_ancestor;
|
||||
// TODO: do we still need to check if we don't want to reparent?
|
||||
check_no_archived_children_of_ancestor(tenant, detached, &ancestor, ancestor_lsn)?;
|
||||
}
|
||||
} else if ancestor.ancestor_timeline.is_some() {
|
||||
if ancestor.ancestor_timeline.is_some() {
|
||||
// non-technical requirement; we could flatten N ancestors just as easily but we chose
|
||||
// not to, at least initially
|
||||
return Err(TooManyAncestors);
|
||||
}
|
||||
|
||||
tracing::info!(
|
||||
"attempt to detach the timeline from the ancestor: {}@{}, behavior={:?}",
|
||||
ancestor.timeline_id,
|
||||
ancestor_lsn,
|
||||
behavior
|
||||
);
|
||||
|
||||
let attempt = start_new_attempt(detached, tenant, ancestor.timeline_id, ancestor_lsn).await?;
|
||||
let attempt = start_new_attempt(detached, tenant).await?;
|
||||
|
||||
utils::pausable_failpoint!("timeline-detach-ancestor::before_starting_after_locking-pausable");
|
||||
|
||||
@@ -492,13 +450,8 @@ pub(super) async fn prepare(
|
||||
Ok(Progress::Prepared(attempt, prepared))
|
||||
}
|
||||
|
||||
async fn start_new_attempt(
|
||||
detached: &Timeline,
|
||||
tenant: &Tenant,
|
||||
ancestor_timeline_id: TimelineId,
|
||||
ancestor_lsn: Lsn,
|
||||
) -> Result<Attempt, Error> {
|
||||
let attempt = obtain_exclusive_attempt(detached, tenant, ancestor_timeline_id, ancestor_lsn)?;
|
||||
async fn start_new_attempt(detached: &Timeline, tenant: &Tenant) -> Result<Attempt, Error> {
|
||||
let attempt = obtain_exclusive_attempt(detached, tenant)?;
|
||||
|
||||
// insert the block in the index_part.json, if not already there.
|
||||
let _dont_care = tenant
|
||||
@@ -513,23 +466,13 @@ async fn start_new_attempt(
|
||||
Ok(attempt)
|
||||
}
|
||||
|
||||
async fn continue_with_blocked_gc(
|
||||
detached: &Timeline,
|
||||
tenant: &Tenant,
|
||||
ancestor_timeline_id: TimelineId,
|
||||
ancestor_lsn: Lsn,
|
||||
) -> Result<Attempt, Error> {
|
||||
async fn continue_with_blocked_gc(detached: &Timeline, tenant: &Tenant) -> Result<Attempt, Error> {
|
||||
// FIXME: it would be nice to confirm that there is an in-memory version, since we've just
|
||||
// verified there is a persistent one?
|
||||
obtain_exclusive_attempt(detached, tenant, ancestor_timeline_id, ancestor_lsn)
|
||||
obtain_exclusive_attempt(detached, tenant)
|
||||
}
|
||||
|
||||
fn obtain_exclusive_attempt(
|
||||
detached: &Timeline,
|
||||
tenant: &Tenant,
|
||||
ancestor_timeline_id: TimelineId,
|
||||
ancestor_lsn: Lsn,
|
||||
) -> Result<Attempt, Error> {
|
||||
fn obtain_exclusive_attempt(detached: &Timeline, tenant: &Tenant) -> Result<Attempt, Error> {
|
||||
use Error::{OtherTimelineDetachOngoing, ShuttingDown};
|
||||
|
||||
// ensure we are the only active attempt for this tenant
|
||||
@@ -550,8 +493,6 @@ fn obtain_exclusive_attempt(
|
||||
|
||||
Ok(Attempt {
|
||||
timeline_id: detached.timeline_id,
|
||||
ancestor_timeline_id,
|
||||
ancestor_lsn,
|
||||
_guard: guard,
|
||||
gate_entered: Some(_gate_entered),
|
||||
})
|
||||
@@ -854,9 +795,6 @@ pub(super) async fn detach_and_reparent(
|
||||
detached: &Arc<Timeline>,
|
||||
tenant: &Tenant,
|
||||
prepared: PreparedTimelineDetach,
|
||||
ancestor_timeline_id: TimelineId,
|
||||
ancestor_lsn: Lsn,
|
||||
behavior: DetachBehavior,
|
||||
_ctx: &RequestContext,
|
||||
) -> Result<DetachingAndReparenting, Error> {
|
||||
let PreparedTimelineDetach { layers } = prepared;
|
||||
@@ -884,30 +822,7 @@ pub(super) async fn detach_and_reparent(
|
||||
"cannot (detach? reparent)? complete if the operation is not still ongoing"
|
||||
);
|
||||
|
||||
let ancestor_to_detach = match detached.ancestor_timeline.as_ref() {
|
||||
Some(mut ancestor) => {
|
||||
while ancestor.timeline_id != ancestor_timeline_id {
|
||||
match ancestor.ancestor_timeline.as_ref() {
|
||||
Some(found) => {
|
||||
if ancestor_lsn != ancestor.ancestor_lsn {
|
||||
return Err(Error::DetachReparent(anyhow::anyhow!(
|
||||
"cannot find the ancestor timeline to detach from: wrong ancestor lsn"
|
||||
)));
|
||||
}
|
||||
ancestor = found;
|
||||
}
|
||||
None => {
|
||||
return Err(Error::DetachReparent(anyhow::anyhow!(
|
||||
"cannot find the ancestor timeline to detach from"
|
||||
)));
|
||||
}
|
||||
}
|
||||
}
|
||||
Some(ancestor)
|
||||
}
|
||||
None => None,
|
||||
};
|
||||
let ancestor = match (ancestor_to_detach, recorded_branchpoint) {
|
||||
let ancestor = match (detached.ancestor_timeline.as_ref(), recorded_branchpoint) {
|
||||
(Some(ancestor), None) => {
|
||||
assert!(
|
||||
!layers.is_empty(),
|
||||
@@ -980,11 +895,6 @@ pub(super) async fn detach_and_reparent(
|
||||
Ancestor::Detached(ancestor, ancestor_lsn) => (ancestor, ancestor_lsn, false),
|
||||
};
|
||||
|
||||
if let DetachBehavior::MultiLevelAndNoReparent = behavior {
|
||||
// Do not reparent if the user requests to behave so.
|
||||
return Ok(DetachingAndReparenting::Reparented(HashSet::new()));
|
||||
}
|
||||
|
||||
let mut tasks = tokio::task::JoinSet::new();
|
||||
|
||||
// Returns a single permit semaphore which will be used to make one reparenting succeed,
|
||||
@@ -1122,11 +1032,6 @@ pub(super) async fn complete(
|
||||
}
|
||||
|
||||
/// Query against a locked `Tenant::timelines`.
|
||||
///
|
||||
/// A timeline is reparentable if:
|
||||
///
|
||||
/// - It is not the timeline being detached.
|
||||
/// - It has the same ancestor as the timeline being detached. Note that the ancestor might not be the direct ancestor.
|
||||
fn reparentable_timelines<'a, I>(
|
||||
timelines: I,
|
||||
detached: &'a Arc<Timeline>,
|
||||
|
||||
@@ -61,11 +61,11 @@ impl HeatmapLayersDownloader {
|
||||
|
||||
tracing::info!(
|
||||
resident_size=%timeline.resident_physical_size(),
|
||||
heatmap_layers=%heatmap.all_layers().count(),
|
||||
heatmap_layers=%heatmap.layers.len(),
|
||||
"Starting heatmap layers download"
|
||||
);
|
||||
|
||||
let stream = futures::stream::iter(heatmap.all_layers().cloned().filter_map(
|
||||
let stream = futures::stream::iter(heatmap.layers.into_iter().filter_map(
|
||||
|layer| {
|
||||
let ctx = ctx.attached_child();
|
||||
let tl = timeline.clone();
|
||||
|
||||
@@ -1195,11 +1195,9 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
|
||||
state = GET_STATE(entry, chunk_offs + i);
|
||||
if (state == PENDING) {
|
||||
SET_STATE(entry, chunk_offs + i, REQUESTED);
|
||||
} else if (state == UNAVAILABLE) {
|
||||
} else if (state != REQUESTED) {
|
||||
SET_STATE(entry, chunk_offs + i, PENDING);
|
||||
break;
|
||||
} else if (state == AVAILABLE) {
|
||||
break;
|
||||
}
|
||||
if (!sleeping)
|
||||
{
|
||||
|
||||
@@ -16,8 +16,6 @@
|
||||
|
||||
#include <math.h>
|
||||
|
||||
#include "libpq-int.h"
|
||||
|
||||
#include "access/xlog.h"
|
||||
#include "common/hashfn.h"
|
||||
#include "fmgr.h"
|
||||
@@ -817,10 +815,9 @@ retry:
|
||||
get_socket_stats(PQsocket(pageserver_conn), &sndbuf, &recvbuf);
|
||||
|
||||
neon_shard_log(shard_no, LOG,
|
||||
"no response received from pageserver for %0.3f s, still waiting (sent " UINT64_FORMAT " requests, received " UINT64_FORMAT " responses) (socket sndbuf=%d recvbuf=%d) (conn start=%d end=%d)",
|
||||
"no response received from pageserver for %0.3f s, still waiting (sent " UINT64_FORMAT " requests, received " UINT64_FORMAT " responses) (socket sndbuf=%d recvbuf=%d)",
|
||||
INSTR_TIME_GET_DOUBLE(since_start),
|
||||
shard->nrequests_sent, shard->nresponses_received, sndbuf, recvbuf,
|
||||
pageserver_conn->inStart, pageserver_conn->inEnd);
|
||||
shard->nrequests_sent, shard->nresponses_received, sndbuf, recvbuf);
|
||||
shard->receive_last_log_time = now;
|
||||
shard->receive_logged = true;
|
||||
}
|
||||
|
||||
@@ -457,15 +457,6 @@ _PG_init(void)
|
||||
PGC_SIGHUP,
|
||||
0,
|
||||
NULL, NULL, NULL);
|
||||
DefineCustomBoolVariable(
|
||||
"neon.disable_wal_prevlink_checks",
|
||||
"Disable validation of prev link in WAL records",
|
||||
NULL,
|
||||
&disable_wal_prev_lsn_checks,
|
||||
false,
|
||||
PGC_SIGHUP,
|
||||
0,
|
||||
NULL, NULL, NULL);
|
||||
|
||||
DefineCustomBoolVariable(
|
||||
"neon.allow_replica_misconfig",
|
||||
|
||||
@@ -23,7 +23,6 @@ extern char *wal_acceptors_list;
|
||||
extern int wal_acceptor_reconnect_timeout;
|
||||
extern int wal_acceptor_connection_timeout;
|
||||
extern int readahead_getpage_pull_timeout_ms;
|
||||
extern bool disable_wal_prev_lsn_checks;
|
||||
|
||||
#if PG_MAJORVERSION_NUM >= 17
|
||||
extern uint32 WAIT_EVENT_NEON_LFC_MAINTENANCE;
|
||||
|
||||
@@ -32,8 +32,6 @@ extern XLogRecPtr WalSndWaitForWal(XLogRecPtr loc);
|
||||
extern bool GetDonorShmem(XLogRecPtr *donor_lsn);
|
||||
extern XLogRecPtr GetXLogReplayRecPtr(TimeLineID *replayTLI);
|
||||
|
||||
bool disable_wal_prev_lsn_checks = false;
|
||||
|
||||
static XLogRecPtr
|
||||
NeonWALReadWaitForWAL(XLogRecPtr loc)
|
||||
{
|
||||
@@ -84,8 +82,6 @@ NeonWALPageRead(
|
||||
if (flushptr < targetPagePtr + reqLen)
|
||||
return -1;
|
||||
|
||||
xlogreader->skip_lsn_checks = disable_wal_prev_lsn_checks;
|
||||
|
||||
/* Read at most XLOG_BLCKSZ bytes */
|
||||
if (targetPagePtr + XLOG_BLCKSZ <= flushptr)
|
||||
count = XLOG_BLCKSZ;
|
||||
|
||||
@@ -287,10 +287,9 @@ impl ConnCfg {
|
||||
// TODO: lots of useful info but maybe we can move it elsewhere (eg traces?)
|
||||
info!(
|
||||
cold_start_info = ctx.cold_start_info().as_str(),
|
||||
"connected to compute node at {host} ({socket_addr}) sslmode={:?}, latency={}, query_id={}",
|
||||
"connected to compute node at {host} ({socket_addr}) sslmode={:?}, latency={}",
|
||||
self.0.get_ssl_mode(),
|
||||
ctx.get_proxy_latency(),
|
||||
ctx.get_testodrome_id(),
|
||||
);
|
||||
|
||||
// NB: CancelToken is supposed to hold socket_addr, but we use connect_raw.
|
||||
|
||||
@@ -63,7 +63,6 @@ struct RequestContextInner {
|
||||
success: bool,
|
||||
pub(crate) cold_start_info: ColdStartInfo,
|
||||
pg_options: Option<StartupMessageParams>,
|
||||
testodrome_query_id: Option<String>,
|
||||
|
||||
// extra
|
||||
// This sender is here to keep the request monitoring channel open while requests are taking place.
|
||||
@@ -111,7 +110,6 @@ impl Clone for RequestContext {
|
||||
rejected: inner.rejected,
|
||||
cold_start_info: inner.cold_start_info,
|
||||
pg_options: inner.pg_options.clone(),
|
||||
testodrome_query_id: inner.testodrome_query_id.clone(),
|
||||
|
||||
sender: None,
|
||||
disconnect_sender: None,
|
||||
@@ -162,7 +160,6 @@ impl RequestContext {
|
||||
rejected: None,
|
||||
cold_start_info: ColdStartInfo::Unknown,
|
||||
pg_options: None,
|
||||
testodrome_query_id: None,
|
||||
|
||||
sender: LOG_CHAN.get().and_then(|tx| tx.upgrade()),
|
||||
disconnect_sender: LOG_CHAN_DISCONNECT.get().and_then(|tx| tx.upgrade()),
|
||||
@@ -213,19 +210,6 @@ impl RequestContext {
|
||||
this.set_dbname(dbname.into());
|
||||
}
|
||||
|
||||
// Try to get testodrome_query_id directly from parameters
|
||||
if let Some(options_str) = options.get("options") {
|
||||
// If not found directly, try to extract it from the options string
|
||||
for option in options_str.split_whitespace() {
|
||||
if option.starts_with("neon_query_id:") {
|
||||
if let Some(value) = option.strip_prefix("neon_query_id:") {
|
||||
this.set_testodrome_id(value.to_string());
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
this.pg_options = Some(options);
|
||||
}
|
||||
|
||||
@@ -371,15 +355,6 @@ impl RequestContext {
|
||||
.accumulated()
|
||||
}
|
||||
|
||||
pub(crate) fn get_testodrome_id(&self) -> String {
|
||||
self.0
|
||||
.try_lock()
|
||||
.expect("should not deadlock")
|
||||
.testodrome_query_id
|
||||
.clone()
|
||||
.unwrap_or_default()
|
||||
}
|
||||
|
||||
pub(crate) fn success(&self) {
|
||||
self.0
|
||||
.try_lock()
|
||||
@@ -441,10 +416,6 @@ impl RequestContextInner {
|
||||
self.user = Some(user);
|
||||
}
|
||||
|
||||
fn set_testodrome_id(&mut self, query_id: String) {
|
||||
self.testodrome_query_id = Some(query_id);
|
||||
}
|
||||
|
||||
fn has_private_peer_addr(&self) -> bool {
|
||||
match self.conn_info.addr.ip() {
|
||||
IpAddr::V4(ip) => ip.is_private(),
|
||||
|
||||
@@ -46,7 +46,8 @@ pub async fn init() -> anyhow::Result<LoggingGuard> {
|
||||
.expect("this should be a valid filter directive"),
|
||||
);
|
||||
|
||||
let otlp_layer = tracing_utils::init_tracing("proxy").await;
|
||||
let otlp_layer =
|
||||
tracing_utils::init_tracing("proxy", tracing_utils::ExportConfig::default()).await;
|
||||
|
||||
let json_log_layer = if logfmt == LogFormat::Json {
|
||||
Some(JsonLoggingLayer::new(
|
||||
|
||||
@@ -406,11 +406,8 @@ impl std::fmt::Display for LatencyAccumulated {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(
|
||||
f,
|
||||
"client: {}, cplane: {}, compute: {}, retry: {}",
|
||||
self.client.as_micros(),
|
||||
self.cplane.as_micros(),
|
||||
self.compute.as_micros(),
|
||||
self.retry.as_micros()
|
||||
"client: {:?}, cplane: {:?}, compute: {:?}, retry: {:?}",
|
||||
self.client, self.cplane, self.compute, self.retry
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -37,10 +37,6 @@ pub enum Error {
|
||||
|
||||
#[error("Cancelled")]
|
||||
Cancelled,
|
||||
|
||||
/// Failed to create client.
|
||||
#[error("create client: {0}{}", .0.source().map(|e| format!(": {e}")).unwrap_or_default())]
|
||||
CreateClient(reqwest::Error),
|
||||
}
|
||||
|
||||
pub type Result<T> = std::result::Result<T, Error>;
|
||||
@@ -68,7 +64,11 @@ impl ResponseErrorMessageExt for reqwest::Response {
|
||||
}
|
||||
|
||||
impl Client {
|
||||
pub fn new(
|
||||
pub fn new(mgmt_api_endpoint: String, jwt: Option<SecretString>) -> Self {
|
||||
Self::from_client(reqwest::Client::new(), mgmt_api_endpoint, jwt)
|
||||
}
|
||||
|
||||
pub fn from_client(
|
||||
client: reqwest::Client,
|
||||
mgmt_api_endpoint: String,
|
||||
jwt: Option<SecretString>,
|
||||
@@ -172,10 +172,12 @@ impl Client {
|
||||
uri: U,
|
||||
body: B,
|
||||
) -> Result<reqwest::Response> {
|
||||
let mut req = self.client.request(method, uri);
|
||||
if let Some(value) = &self.authorization_header {
|
||||
req = req.header(reqwest::header::AUTHORIZATION, value.get_contents())
|
||||
}
|
||||
let req = self.client.request(method, uri);
|
||||
let req = if let Some(value) = &self.authorization_header {
|
||||
req.header(reqwest::header::AUTHORIZATION, value.get_contents())
|
||||
} else {
|
||||
req
|
||||
};
|
||||
req.json(&body).send().await.map_err(Error::ReceiveBody)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -255,6 +255,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
logging::init(
|
||||
LogFormat::from_config(&args.log_format)?,
|
||||
logging::TracingErrorLayerEnablement::Disabled,
|
||||
utils::logging::OtelEnablement::Disabled,
|
||||
logging::Output::Stdout,
|
||||
)?;
|
||||
logging::replace_panic_hook_with_tracing_panic_hook().forget();
|
||||
|
||||
@@ -402,16 +402,12 @@ pub async fn handle_request(
|
||||
bail!("Timeline {} already exists", request.timeline_id);
|
||||
}
|
||||
|
||||
// TODO(DimasKovas): add ssl root CA certificate when implementing safekeeper's
|
||||
// part of https support (#24836).
|
||||
let http_client = reqwest::Client::new();
|
||||
|
||||
let http_hosts = request.http_hosts.clone();
|
||||
|
||||
// Figure out statuses of potential donors.
|
||||
let responses: Vec<Result<TimelineStatus, mgmt_api::Error>> =
|
||||
futures::future::join_all(http_hosts.iter().map(|url| async {
|
||||
let cclient = Client::new(http_client.clone(), url.clone(), sk_auth_token.clone());
|
||||
let cclient = Client::new(url.clone(), sk_auth_token.clone());
|
||||
let info = cclient
|
||||
.timeline_status(request.tenant_id, request.timeline_id)
|
||||
.await?;
|
||||
@@ -464,10 +460,8 @@ async fn pull_timeline(
|
||||
let conf = &global_timelines.get_global_config();
|
||||
|
||||
let (_tmp_dir, tli_dir_path) = create_temp_timeline_dir(conf, ttid).await?;
|
||||
// TODO(DimasKovas): add ssl root CA certificate when implementing safekeeper's
|
||||
// part of https support (#24836).
|
||||
let http_client = reqwest::Client::new();
|
||||
let client = Client::new(http_client, host.clone(), sk_auth_token.clone());
|
||||
|
||||
let client = Client::new(host.clone(), sk_auth_token.clone());
|
||||
// Request stream with basebackup archive.
|
||||
let bb_resp = client
|
||||
.snapshot(status.tenant_id, status.timeline_id, conf.my_id)
|
||||
|
||||
@@ -219,12 +219,12 @@ impl InterpretedWalReaderState {
|
||||
}
|
||||
}
|
||||
|
||||
fn replace_current_batch_wal_start(&mut self, with: Lsn) -> Lsn {
|
||||
fn take_current_batch_wal_start(&mut self) -> Lsn {
|
||||
match self {
|
||||
InterpretedWalReaderState::Running {
|
||||
current_batch_wal_start,
|
||||
..
|
||||
} => current_batch_wal_start.replace(with).unwrap(),
|
||||
} => current_batch_wal_start.take().unwrap(),
|
||||
InterpretedWalReaderState::Done => {
|
||||
panic!("take_current_batch_wal_start called on finished reader")
|
||||
}
|
||||
@@ -416,12 +416,10 @@ impl InterpretedWalReader {
|
||||
let shard_ids = self.shard_senders.keys().copied().collect::<Vec<_>>();
|
||||
let mut records_by_sender: HashMap<ShardSenderId, Vec<InterpretedWalRecord>> = HashMap::new();
|
||||
let mut max_next_record_lsn = None;
|
||||
let mut max_end_record_lsn = None;
|
||||
while let Some((next_record_lsn, recdata)) = wal_decoder.poll_decode()?
|
||||
{
|
||||
assert!(next_record_lsn.is_aligned());
|
||||
max_next_record_lsn = Some(next_record_lsn);
|
||||
max_end_record_lsn = Some(wal_decoder.lsn());
|
||||
|
||||
let interpreted = InterpretedWalRecord::from_bytes_filtered(
|
||||
recdata,
|
||||
@@ -432,10 +430,7 @@ impl InterpretedWalReader {
|
||||
.with_context(|| "Failed to interpret WAL")?;
|
||||
|
||||
for (shard, record) in interpreted {
|
||||
// Shard zero needs to track the start LSN of the latest record
|
||||
// in adition to the LSN of the next record to ingest. The former
|
||||
// is included in basebackup persisted by the compute in WAL.
|
||||
if !shard.is_shard_zero() && record.is_empty() {
|
||||
if record.is_empty() {
|
||||
continue;
|
||||
}
|
||||
|
||||
@@ -472,7 +467,7 @@ impl InterpretedWalReader {
|
||||
let batch_wal_start_lsn = {
|
||||
let mut guard = self.state.write().unwrap();
|
||||
guard.update_current_position(max_next_record_lsn);
|
||||
guard.replace_current_batch_wal_start(max_end_record_lsn.unwrap())
|
||||
guard.take_current_batch_wal_start()
|
||||
};
|
||||
|
||||
// Send interpreted records downstream. Anything that has already been seen
|
||||
@@ -745,7 +740,7 @@ mod tests {
|
||||
.unwrap();
|
||||
|
||||
let resident_tli = tli.wal_residence_guard().await.unwrap();
|
||||
let end_watch = Env::write_wal(tli, start_lsn, SIZE, MSG_COUNT, c"neon-file:", None)
|
||||
let end_watch = Env::write_wal(tli, start_lsn, SIZE, MSG_COUNT, None)
|
||||
.await
|
||||
.unwrap();
|
||||
let end_pos = end_watch.get();
|
||||
@@ -888,16 +883,10 @@ mod tests {
|
||||
|
||||
let resident_tli = tli.wal_residence_guard().await.unwrap();
|
||||
let mut next_record_lsns = Vec::default();
|
||||
let end_watch = Env::write_wal(
|
||||
tli,
|
||||
start_lsn,
|
||||
SIZE,
|
||||
MSG_COUNT,
|
||||
c"neon-file:",
|
||||
Some(&mut next_record_lsns),
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
let end_watch =
|
||||
Env::write_wal(tli, start_lsn, SIZE, MSG_COUNT, Some(&mut next_record_lsns))
|
||||
.await
|
||||
.unwrap();
|
||||
let end_pos = end_watch.get();
|
||||
|
||||
let streaming_wal_reader = StreamingWalReader::new(
|
||||
@@ -1038,16 +1027,10 @@ mod tests {
|
||||
.unwrap();
|
||||
|
||||
let resident_tli = tli.wal_residence_guard().await.unwrap();
|
||||
let end_watch = Env::write_wal(
|
||||
tli,
|
||||
start_lsn,
|
||||
SIZE,
|
||||
MSG_COUNT,
|
||||
c"neon-file:",
|
||||
Some(&mut next_record_lsns),
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
let end_watch =
|
||||
Env::write_wal(tli, start_lsn, SIZE, MSG_COUNT, Some(&mut next_record_lsns))
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
assert!(next_record_lsns.len() > 3);
|
||||
let shard_0_start_lsn = next_record_lsns[3];
|
||||
@@ -1141,88 +1124,4 @@ mod tests {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_shard_zero_does_not_skip_empty_records() {
|
||||
let _ = env_logger::builder().is_test(true).try_init();
|
||||
|
||||
const SIZE: usize = 8 * 1024;
|
||||
const MSG_COUNT: usize = 10;
|
||||
const PG_VERSION: u32 = 17;
|
||||
|
||||
let start_lsn = Lsn::from_str("0/149FD18").unwrap();
|
||||
let env = Env::new(true).unwrap();
|
||||
let tli = env
|
||||
.make_timeline(NodeId(1), TenantTimelineId::generate(), start_lsn)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let resident_tli = tli.wal_residence_guard().await.unwrap();
|
||||
let mut next_record_lsns = Vec::new();
|
||||
let end_watch = Env::write_wal(
|
||||
tli,
|
||||
start_lsn,
|
||||
SIZE,
|
||||
MSG_COUNT,
|
||||
// This is a logical message prefix that is not persisted to key value storage.
|
||||
// We use it in order to validate that shard zero receives emtpy interpreted records.
|
||||
c"test:",
|
||||
Some(&mut next_record_lsns),
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
let end_pos = end_watch.get();
|
||||
|
||||
let streaming_wal_reader = StreamingWalReader::new(
|
||||
resident_tli,
|
||||
None,
|
||||
start_lsn,
|
||||
end_pos,
|
||||
end_watch,
|
||||
MAX_SEND_SIZE,
|
||||
);
|
||||
|
||||
let shard = ShardIdentity::unsharded();
|
||||
let (records_tx, mut records_rx) = tokio::sync::mpsc::channel::<Batch>(MSG_COUNT * 2);
|
||||
|
||||
let handle = InterpretedWalReader::spawn(
|
||||
streaming_wal_reader,
|
||||
start_lsn,
|
||||
records_tx,
|
||||
shard,
|
||||
PG_VERSION,
|
||||
&Some("pageserver".to_string()),
|
||||
);
|
||||
|
||||
let mut interpreted_records = Vec::new();
|
||||
while let Some(batch) = records_rx.recv().await {
|
||||
interpreted_records.push(batch.records);
|
||||
if batch.wal_end_lsn == batch.available_wal_end_lsn {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
let received_next_record_lsns = interpreted_records
|
||||
.into_iter()
|
||||
.flat_map(|b| b.records)
|
||||
.map(|rec| rec.next_record_lsn)
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
// By default this also includes the start LSN. Trim it since it shouldn't be received.
|
||||
let next_record_lsns = next_record_lsns.into_iter().skip(1).collect::<Vec<_>>();
|
||||
|
||||
assert_eq!(received_next_record_lsns, next_record_lsns);
|
||||
|
||||
handle.abort();
|
||||
let mut done = false;
|
||||
for _ in 0..5 {
|
||||
if handle.current_position().is_none() {
|
||||
done = true;
|
||||
break;
|
||||
}
|
||||
tokio::time::sleep(Duration::from_millis(1)).await;
|
||||
}
|
||||
|
||||
assert!(done);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
use std::ffi::CStr;
|
||||
use std::sync::Arc;
|
||||
|
||||
use camino_tempfile::Utf8TempDir;
|
||||
@@ -125,7 +124,6 @@ impl Env {
|
||||
start_lsn: Lsn,
|
||||
msg_size: usize,
|
||||
msg_count: usize,
|
||||
prefix: &CStr,
|
||||
mut next_record_lsns: Option<&mut Vec<Lsn>>,
|
||||
) -> anyhow::Result<EndWatch> {
|
||||
let (msg_tx, msg_rx) = tokio::sync::mpsc::channel(receive_wal::MSG_QUEUE_SIZE);
|
||||
@@ -135,6 +133,7 @@ impl Env {
|
||||
|
||||
WalAcceptor::spawn(tli.wal_residence_guard().await?, msg_rx, reply_tx, Some(0));
|
||||
|
||||
let prefix = c"neon-file:";
|
||||
let prefixlen = prefix.to_bytes_with_nul().len();
|
||||
assert!(msg_size >= prefixlen);
|
||||
let message = vec![0; msg_size - prefixlen];
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user