Mirror of https://github.com/neondatabase/neon.git (synced 2026-01-18 10:52:55 +00:00)

Compare commits: statement_ ... release-47
257 Commits
| SHA1 |
|---|
| b9238059d6 |
| d0cb4b88c8 |
| 1ec3e39d4e |
| a1a74eef2c |
| 90e689adda |
| f0b2d4b053 |
| 299d9474c9 |
| 7234208b36 |
| 93450f11f5 |
| 2f0f9edf33 |
| d424f2b7c8 |
| 21315e80bc |
| 483b66d383 |
| aa72a22661 |
| 5c0264b591 |
| 9f13277729 |
| 54aa319805 |
| 4a227484bf |
| 2f83f85291 |
| d6cfcb0d93 |
| 392843ad2a |
| bd4dae8f4a |
| b05fe53cfd |
| c13a2f0df1 |
| 39be366fc5 |
| 6eda0a3158 |
| 306c7a1813 |
| 80be423a58 |
| 5dcfef82f2 |
| e67b8f69c0 |
| e546872ab4 |
| 322ea1cf7c |
| 3633742de9 |
| 079d3a37ba |
| a46e77b476 |
| a92702b01e |
| 8ff3253f20 |
| 04b82c92a7 |
| e5bf423e68 |
| 60af392e45 |
| 661fc41e71 |
| 702c488f32 |
| 45c5122754 |
| 558394f710 |
| 73b0898608 |
| e65be4c2dc |
| 40087b8164 |
| c762b59483 |
| 5d71601ca9 |
| a113c3e433 |
| e81fc598f4 |
| 48b845fa76 |
| 27096858dc |
| 4430d0ae7d |
| 6e183aa0de |
| fd6d0b7635 |
| 3710c32aae |
| be83bee49d |
| cf28e5922a |
| 7d384d6953 |
| 4b3b37b912 |
| 1d8d200f4d |
| 0d80d6ce18 |
| f653ee039f |
| e614a95853 |
| 850db4cc13 |
| 8a316b1277 |
| 4d13bae449 |
| 49377abd98 |
| a6b2f4e54e |
| face60d50b |
| 9768aa27f2 |
| 96b2e575e1 |
| 7222777784 |
| 5469fdede0 |
| 72aa6b9fdd |
| ae0634b7be |
| 70711f32fa |
| 52a88af0aa |
| b7a43bf817 |
| dce91b33a4 |
| 23ee4f3050 |
| 46857e8282 |
| 368ab0ce54 |
| a5987eebfd |
| 6686ede30f |
| 373c7057cc |
| 7d6ec16166 |
| 0e6fdc8a58 |
| 521438a5c6 |
| 07d7874bc8 |
| 1804111a02 |
| cd0178efed |
| 333574be57 |
| 79a799a143 |
| 9da06af6c9 |
| ce1753d036 |
| 67db8432b4 |
| 4e2e44e524 |
| ed786104f3 |
| 84b74f2bd1 |
| fec2ad6283 |
| 98eebd4682 |
| 2f74287c9b |
| aee1bf95e3 |
| b9de9d75ff |
| 7943b709e6 |
| d7d066d493 |
| e78ac22107 |
| 76a8f2bb44 |
| 8d59a8581f |
| b1ddd01289 |
| 6eae4fc9aa |
| 765455bca2 |
| 4204960942 |
| 67345d66ea |
| 2266ee5971 |
| b58445d855 |
| 36050e7f3d |
| 33360ed96d |
| 39a28d1108 |
| efa6aa134f |
| 2c724e56e2 |
| feff887c6f |
| 353d915fcf |
| 2e38098cbc |
| a6fe5ea1ac |
| 05b0aed0c1 |
| cd1705357d |
| 6bc7561290 |
| fbd3ac14b5 |
| e437787c8f |
| 3460dbf90b |
| 6b89d99677 |
| 6cc8ea86e4 |
| e62a492d6f |
| a475cdf642 |
| 7002c79a47 |
| ee6cf357b4 |
| e5c2086b5f |
| 5f1208296a |
| 88e8e473cd |
| b0a77844f6 |
| 1baf464307 |
| e9b8e81cea |
| 85d6194aa4 |
| 333a7a68ef |
| 6aa4e41bee |
| 840183e51f |
| cbccc94b03 |
| fce227df22 |
| bd787e800f |
| 4a7704b4a3 |
| ff1119da66 |
| 4c3ba1627b |
| 1407174fb2 |
| ec9dcb1889 |
| d11d781afc |
| 4e44565b71 |
| 4ed51ad33b |
| 1c1ebe5537 |
| c19cb7f386 |
| 4b97d31b16 |
| 923ade3dd7 |
| b04e711975 |
| afd0a6b39a |
| 99752286d8 |
| 15df93363c |
| bc0ab741af |
| 51d9dfeaa3 |
| f63cb18155 |
| 0de603d88e |
| 240913912a |
| 91a4ea0de2 |
| 8608704f49 |
| efef68ce99 |
| 8daefd24da |
| 46cc8b7982 |
| 38cd90dd0c |
| a51b269f15 |
| 43bf6d0a0f |
| 15273a9b66 |
| 78aca668d0 |
| acbf4148ea |
| 6508540561 |
| a41b5244a8 |
| 2b3189be95 |
| 248563c595 |
| 14cd6ca933 |
| eb36403e71 |
| 3c6f779698 |
| f67f0c1c11 |
| edb02d3299 |
| 664a69e65b |
| 478322ebf9 |
| 802f174072 |
| 47f9890bae |
| 262265daad |
| 300da5b872 |
| 7b22b5c433 |
| ffca97bc1e |
| cb356f3259 |
| c85374295f |
| 4992160677 |
| bd535b3371 |
| d90c5a03af |
| 2d02cc9079 |
| 49ad94b99f |
| 948a217398 |
| 125381eae7 |
| cd01bbc715 |
| d8b5e3b88d |
| 06d25f2186 |
| f759b561f3 |
| ece0555600 |
| 73ea0a0b01 |
| d8f6d6fd6f |
| d24de169a7 |
| 0816168296 |
| 277b44d57a |
| 68c2c3880e |
| 49da498f65 |
| 2c76ba3dd7 |
| dbe3dc69ad |
| 8e5bb3ed49 |
| ab0be7b8da |
| b4c55f5d24 |
| ede70d833c |
| 70c3d18bb0 |
| 7a491f52c4 |
| 323c4ecb4f |
| 3d2466607e |
| ed478b39f4 |
| 91585a558d |
| 93467eae1f |
| f3aac81d19 |
| 979ad60c19 |
| 9316cb1b1f |
| e7939a527a |
| 36d26665e1 |
| 873347f977 |
| e814ac16f9 |
| ad3055d386 |
| 94e03eb452 |
| 380f26ef79 |
| 3c5b7f59d7 |
| fee89f80b5 |
| 41cce8eaf1 |
| f88fe0218d |
| cc856eca85 |
| cf350c6002 |
| 0ce6b6a0a3 |
| 73f247d537 |
| 960be82183 |
| 806e5a6c19 |
| 8d5df07cce |
| df7a9d1407 |
@@ -44,10 +44,6 @@ inputs:
     description: 'Postgres version to use for tests'
     required: false
     default: 'v14'
-  benchmark_durations:
-    description: 'benchmark durations JSON'
-    required: false
-    default: '{}'
 
 runs:
   using: "composite"
@@ -164,7 +160,7 @@ runs:
       # We use pytest-split plugin to run benchmarks in parallel on different CI runners
       if [ "${TEST_SELECTION}" = "test_runner/performance" ] && [ "${{ inputs.build_type }}" != "remote" ]; then
         mkdir -p $TEST_OUTPUT
-        echo '${{ inputs.benchmark_durations || '{}' }}' > $TEST_OUTPUT/benchmark_durations.json
+        poetry run ./scripts/benchmark_durations.py "${TEST_RESULT_CONNSTR}" --days 10 --output "$TEST_OUTPUT/benchmark_durations.json"
 
         EXTRA_PARAMS="--durations-path $TEST_OUTPUT/benchmark_durations.json $EXTRA_PARAMS"
       fi
.github/workflows/build_and_test.yml (vendored): 41 lines changed
@@ -132,7 +132,7 @@ jobs:
 
   check-codestyle-rust:
     needs: [ check-permissions, build-buildtools-image ]
-    runs-on: [ self-hosted, gen3, small ]
+    runs-on: [ self-hosted, gen3, large ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
      options: --init
@@ -478,40 +478,8 @@ jobs:
        if: matrix.build_type == 'debug' && matrix.pg_version == 'v14'
        uses: ./.github/actions/save-coverage-data
 
-  get-benchmarks-durations:
-    outputs:
-      json: ${{ steps.get-benchmark-durations.outputs.json }}
-    needs: [ check-permissions, build-buildtools-image ]
-    runs-on: [ self-hosted, gen3, small ]
-    container:
-      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
-      options: --init
-    if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks')
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v3
-
-      - name: Cache poetry deps
-        uses: actions/cache@v3
-        with:
-          path: ~/.cache/pypoetry/virtualenvs
-          key: v1-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }}
-
-      - name: Install Python deps
-        run: ./scripts/pysync
-
-      - name: get benchmark durations
-        id: get-benchmark-durations
-        env:
-          TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
-        run: |
-          poetry run ./scripts/benchmark_durations.py "${TEST_RESULT_CONNSTR}" \
-            --days 10 \
-            --output /tmp/benchmark_durations.json
-          echo "json=$(jq --compact-output '.' /tmp/benchmark_durations.json)" >> $GITHUB_OUTPUT
-
   benchmarks:
-    needs: [ check-permissions, build-neon, build-buildtools-image, get-benchmarks-durations ]
+    needs: [ check-permissions, build-neon, build-buildtools-image ]
    runs-on: [ self-hosted, gen3, small ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${{ needs.build-buildtools-image.outputs.build-tools-tag }}
@@ -522,7 +490,7 @@ jobs:
      fail-fast: false
      matrix:
        # the amount of groups (N) should be reflected in `extra_params: --splits N ...`
-        pytest_split_group: [ 1, 2, 3, 4, 5 ]
+        pytest_split_group: [ 1, 2, 3, 4 ]
        build_type: [ release ]
    steps:
      - name: Checkout
@@ -535,8 +503,7 @@ jobs:
          test_selection: performance
          run_in_parallel: false
          save_perf_report: ${{ github.ref_name == 'main' }}
-          extra_params: --splits 5 --group ${{ matrix.pytest_split_group }}
-          benchmark_durations: ${{ needs.get-benchmarks-durations.outputs.json }}
+          extra_params: --splits 4 --group ${{ matrix.pytest_split_group }}
        env:
          VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
          PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
Cargo.lock (generated): 14 lines changed
@@ -281,7 +281,6 @@ dependencies = [
  "clap",
  "control_plane",
  "diesel",
- "diesel_migrations",
  "futures",
  "git-version",
  "hyper",
@@ -2719,16 +2718,6 @@ dependencies = [
  "libc",
 ]
 
-[[package]]
-name = "lasso"
-version = "0.7.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4644821e1c3d7a560fe13d842d13f587c07348a1a05d3a797152d41c90c56df2"
-dependencies = [
- "dashmap",
- "hashbrown 0.13.2",
-]
-
 [[package]]
 name = "lazy_static"
 version = "1.4.0"
@@ -4086,7 +4075,6 @@ dependencies = [
  "hyper-tungstenite",
  "ipnet",
  "itertools",
- "lasso",
  "md5",
  "metrics",
  "native-tls",
@@ -4103,7 +4091,6 @@ dependencies = [
  "pq_proto",
  "prometheus",
  "rand 0.8.5",
- "rand_distr",
  "rcgen",
  "redis",
  "regex",
@@ -6816,7 +6803,6 @@ dependencies = [
  "futures-sink",
  "futures-util",
  "getrandom 0.2.11",
- "hashbrown 0.13.2",
  "hashbrown 0.14.0",
  "hex",
  "hmac",
@@ -95,7 +95,6 @@ inotify = "0.10.2"
 ipnet = "2.9.0"
 itertools = "0.10"
 jsonwebtoken = "9"
-lasso = "0.7"
 libc = "0.2"
 md5 = "0.7.0"
 memoffset = "0.8"
@@ -207,7 +207,6 @@ fn maybe_cgexec(cmd: &str) -> Command {
 
 /// Create special neon_superuser role, that's a slightly nerfed version of a real superuser
 /// that we give to customers
-#[instrument(skip_all)]
 fn create_neon_superuser(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
     let roles = spec
         .cluster
@@ -138,34 +138,6 @@ fn watch_compute_activity(compute: &ComputeNode) {
                 }
             }
             //
-            // Don't suspend compute if there is an active logical replication subscription
-            //
-            // `where pid is not null` – to filter out read only computes and subscription on branches
-            //
-            let logical_subscriptions_query =
-                "select count(*) from pg_stat_subscription where pid is not null;";
-            match cli.query_one(logical_subscriptions_query, &[]) {
-                Ok(row) => match row.try_get::<&str, i64>("count") {
-                    Ok(num_subscribers) => {
-                        if num_subscribers > 0 {
-                            compute.update_last_active(Some(Utc::now()));
-                            continue;
-                        }
-                    }
-                    Err(e) => {
-                        warn!("failed to parse `pg_stat_subscription` count: {:?}", e);
-                        continue;
-                    }
-                },
-                Err(e) => {
-                    warn!(
-                        "failed to get list of active logical replication subscriptions: {:?}",
-                        e
-                    );
-                    continue;
-                }
-            }
-            //
             // Do not suspend compute if autovacuum is running
             //
             let autovacuum_count_query = "select count(*) from pg_stat_activity where backend_type = 'autovacuum worker'";
@@ -25,7 +25,6 @@ tokio-util.workspace = true
 tracing.workspace = true
 
 diesel = { version = "2.1.4", features = ["serde_json", "postgres"] }
-diesel_migrations = { version = "2.1.0" }
 
 utils = { path = "../../libs/utils/" }
 metrics = { path = "../../libs/metrics/" }
@@ -244,11 +244,9 @@ impl ComputeHook {
             3,
             10,
             "Send compute notification",
-            cancel,
+            backoff::Cancel::new(cancel.clone(), || NotifyError::ShuttingDown),
         )
         .await
-        .ok_or_else(|| NotifyError::ShuttingDown)
-        .and_then(|x| x)
     }
 
     /// Call this to notify the compute (postgres) tier of new pageservers to use
@@ -403,6 +403,10 @@ pub fn make_router(
         .put("/v1/tenant/:tenant_id/location_config", |r| {
             tenant_service_handler(r, handle_tenant_location_config)
         })
+        // Tenant Shard operations (low level/maintenance)
+        .put("/tenant/:tenant_shard_id/migrate", |r| {
+            tenant_service_handler(r, handle_tenant_shard_migrate)
+        })
         // Timeline operations
         .delete("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
             tenant_service_handler(r, handle_tenant_timeline_delete)
@@ -411,7 +415,7 @@ pub fn make_router(
             tenant_service_handler(r, handle_tenant_timeline_create)
         })
         // Tenant detail GET passthrough to shard zero
-        .get("/v1/tenant/:tenant_id", |r| {
+        .get("/v1/tenant/:tenant_id*", |r| {
             tenant_service_handler(r, handle_tenant_timeline_passthrough)
         })
         // Timeline GET passthrough to shard zero. Note that the `*` in the URL is a wildcard: any future
@@ -419,4 +423,8 @@ pub fn make_router(
         .get("/v1/tenant/:tenant_id/timeline*", |r| {
             tenant_service_handler(r, handle_tenant_timeline_passthrough)
         })
+        // Path aliases for tests_forward_compatibility
+        // TODO: remove these in future PR
+        .post("/re-attach", |r| request_span(r, handle_re_attach))
+        .post("/validate", |r| request_span(r, handle_validate))
 }
@@ -4,14 +4,13 @@
 /// This enables running & testing pageservers without a full-blown
 /// deployment of the Neon cloud platform.
 ///
-use anyhow::{anyhow, Context};
+use anyhow::anyhow;
 use attachment_service::http::make_router;
 use attachment_service::persistence::Persistence;
 use attachment_service::service::{Config, Service};
 use aws_config::{self, BehaviorVersion, Region};
 use camino::Utf8PathBuf;
 use clap::Parser;
-use diesel::Connection;
 use metrics::launch_timestamp::LaunchTimestamp;
 use std::sync::Arc;
 use tokio::signal::unix::SignalKind;
@@ -23,9 +22,6 @@ use utils::{project_build_tag, project_git_version, tcp_listener};
 project_git_version!(GIT_VERSION);
 project_build_tag!(BUILD_TAG);
 
-use diesel_migrations::{embed_migrations, EmbeddedMigrations};
-pub const MIGRATIONS: EmbeddedMigrations = embed_migrations!("./migrations");
-
 #[derive(Parser)]
 #[command(author, version, about, long_about = None)]
 #[command(arg_required_else_help(true))]
@@ -34,9 +30,9 @@ struct Cli {
     #[arg(short, long)]
     listen: std::net::SocketAddr,
 
-    /// Public key for JWT authentication of clients
+    /// Path to public key for JWT authentication of clients
     #[arg(long)]
-    public_key: Option<String>,
+    public_key: Option<camino::Utf8PathBuf>,
 
     /// Token for authenticating this service with the pageservers it controls
     #[arg(long)]
@@ -57,7 +53,7 @@ struct Cli {
 
     /// URL to connect to postgres, like postgresql://localhost:1234/attachment_service
     #[arg(long)]
-    database_url: Option<String>,
+    database_url: String,
 }
 
 /// Secrets may either be provided on the command line (for testing), or loaded from AWS SecretManager: this
@@ -78,9 +74,10 @@ impl Secrets {
     const PUBLIC_KEY_SECRET: &'static str = "neon-storage-controller-public-key";
 
     async fn load(args: &Cli) -> anyhow::Result<Self> {
-        match &args.database_url {
-            Some(url) => Self::load_cli(url, args),
-            None => Self::load_aws_sm().await,
+        if args.database_url.is_empty() {
+            Self::load_aws_sm().await
+        } else {
+            Self::load_cli(args)
         }
     }
@@ -156,13 +153,13 @@
         })
     }
 
-    fn load_cli(database_url: &str, args: &Cli) -> anyhow::Result<Self> {
+    fn load_cli(args: &Cli) -> anyhow::Result<Self> {
         let public_key = match &args.public_key {
             None => None,
-            Some(key) => Some(JwtAuth::from_key(key.clone()).context("Loading public key")?),
+            Some(key_path) => Some(JwtAuth::from_key_path(key_path)?),
         };
         Ok(Self {
-            database_url: database_url.to_owned(),
+            database_url: args.database_url.clone(),
             public_key,
             jwt_token: args.jwt_token.clone(),
             control_plane_jwt_token: args.control_plane_jwt_token.clone(),
@@ -170,19 +167,6 @@
         }
     }
 
-async fn migration_run(database_url: &str) -> anyhow::Result<()> {
-    use diesel::PgConnection;
-    use diesel_migrations::{HarnessWithOutput, MigrationHarness};
-    let mut conn = PgConnection::establish(database_url)?;
-
-    HarnessWithOutput::write_to_stdout(&mut conn)
-        .run_pending_migrations(MIGRATIONS)
-        .map(|_| ())
-        .map_err(|e| anyhow::anyhow!(e))?;
-
-    Ok(())
-}
-
 #[tokio::main]
 async fn main() -> anyhow::Result<()> {
     let launch_ts = Box::leak(Box::new(LaunchTimestamp::generate()));
@@ -211,11 +195,6 @@ async fn main() -> anyhow::Result<()> {
         compute_hook_url: args.compute_hook_url,
     };
 
-    // After loading secrets & config, but before starting anything else, apply database migrations
-    migration_run(&secrets.database_url)
-        .await
-        .context("Running database migrations")?;
-
     let json_path = args.path;
     let persistence = Arc::new(Persistence::new(secrets.database_url, json_path.clone()));
 
@@ -28,7 +28,7 @@ pub struct AttachmentService {
     listen: String,
     path: Utf8PathBuf,
     jwt_token: Option<String>,
-    public_key: Option<String>,
+    public_key_path: Option<Utf8PathBuf>,
     postgres_port: u16,
     client: reqwest::Client,
 }
@@ -207,7 +207,7 @@ impl AttachmentService {
             .pageservers
             .first()
             .expect("Config is validated to contain at least one pageserver");
-        let (jwt_token, public_key) = match ps_conf.http_auth_type {
+        let (jwt_token, public_key_path) = match ps_conf.http_auth_type {
             AuthType::Trust => (None, None),
             AuthType::NeonJWT => {
                 let jwt_token = env
@@ -219,26 +219,7 @@ impl AttachmentService {
                 let public_key_path =
                     camino::Utf8PathBuf::try_from(env.base_data_dir.join("auth_public_key.pem"))
                         .unwrap();
-
-                // This service takes keys as a string rather than as a path to a file/dir: read the key into memory.
-                let public_key = if std::fs::metadata(&public_key_path)
-                    .expect("Can't stat public key")
-                    .is_dir()
-                {
-                    // Our config may specify a directory: this is for the pageserver's ability to handle multiple
-                    // keys. We only use one key at a time, so, arbitrarily load the first one in the directory.
-                    let mut dir =
-                        std::fs::read_dir(&public_key_path).expect("Can't readdir public key path");
-                    let dent = dir
-                        .next()
-                        .expect("Empty key dir")
-                        .expect("Error reading key dir");
-
-                    std::fs::read_to_string(dent.path()).expect("Can't read public key")
-                } else {
-                    std::fs::read_to_string(&public_key_path).expect("Can't read public key")
-                };
-                (Some(jwt_token), Some(public_key))
+                (Some(jwt_token), Some(public_key_path))
             }
         };
 
@@ -247,7 +228,7 @@ impl AttachmentService {
             path,
             listen,
             jwt_token,
-            public_key,
+            public_key_path,
             postgres_port,
             client: reqwest::ClientBuilder::new()
                 .build()
@@ -472,8 +453,8 @@ impl AttachmentService {
             args.push(format!("--jwt-token={jwt_token}"));
         }
 
-        if let Some(public_key) = &self.public_key {
-            args.push(format!("--public-key=\"{public_key}\""));
+        if let Some(public_key_path) = &self.public_key_path {
+            args.push(format!("--public-key={public_key_path}"));
         }
 
         if let Some(control_plane_compute_hook_api) = &self.env.control_plane_compute_hook_api {
@@ -379,7 +379,7 @@ impl RemoteStorage for AzureBlobStorage {
         _prefix: Option<&RemotePath>,
         _timestamp: SystemTime,
         _done_if_after: SystemTime,
-        _cancel: &CancellationToken,
+        _cancel: CancellationToken,
     ) -> Result<(), TimeTravelError> {
         // TODO use Azure point in time recovery feature for this
         // https://learn.microsoft.com/en-us/azure/storage/blobs/point-in-time-restore-overview
@@ -218,7 +218,7 @@ pub trait RemoteStorage: Send + Sync + 'static {
         prefix: Option<&RemotePath>,
         timestamp: SystemTime,
         done_if_after: SystemTime,
-        cancel: &CancellationToken,
+        cancel: CancellationToken,
     ) -> Result<(), TimeTravelError>;
 }
 
@@ -442,7 +442,7 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
         prefix: Option<&RemotePath>,
         timestamp: SystemTime,
         done_if_after: SystemTime,
-        cancel: &CancellationToken,
+        cancel: CancellationToken,
     ) -> Result<(), TimeTravelError> {
         match self {
             Self::LocalFs(s) => {
@@ -431,7 +431,7 @@ impl RemoteStorage for LocalFs {
         _prefix: Option<&RemotePath>,
         _timestamp: SystemTime,
         _done_if_after: SystemTime,
-        _cancel: &CancellationToken,
+        _cancel: CancellationToken,
     ) -> Result<(), TimeTravelError> {
         Err(TimeTravelError::Unimplemented)
     }
@@ -638,7 +638,7 @@ impl RemoteStorage for S3Bucket {
         prefix: Option<&RemotePath>,
         timestamp: SystemTime,
         done_if_after: SystemTime,
-        cancel: &CancellationToken,
+        cancel: CancellationToken,
     ) -> Result<(), TimeTravelError> {
         let kind = RequestKind::TimeTravel;
         let _guard = self.permit(kind).await;
@@ -678,11 +678,9 @@ impl RemoteStorage for S3Bucket {
                 warn_threshold,
                 max_retries,
                 "listing object versions for time_travel_recover",
-                cancel,
+                backoff::Cancel::new(cancel.clone(), || TimeTravelError::Cancelled),
             )
-            .await
-            .ok_or_else(|| TimeTravelError::Cancelled)
-            .and_then(|x| x)?;
+            .await?;
 
             tracing::trace!(
                 "  Got List response version_id_marker={:?}, key_marker={:?}",
@@ -807,11 +805,9 @@ impl RemoteStorage for S3Bucket {
                     warn_threshold,
                     max_retries,
                     "copying object version for time_travel_recover",
-                    cancel,
+                    backoff::Cancel::new(cancel.clone(), || TimeTravelError::Cancelled),
                 )
-                .await
-                .ok_or_else(|| TimeTravelError::Cancelled)
-                .and_then(|x| x)?;
+                .await?;
                 tracing::info!(%version_id, %key, "Copied old version in S3");
             }
             VerOrDelete {
@@ -190,7 +190,7 @@ impl RemoteStorage for UnreliableWrapper {
         prefix: Option<&RemotePath>,
         timestamp: SystemTime,
         done_if_after: SystemTime,
-        cancel: &CancellationToken,
+        cancel: CancellationToken,
     ) -> Result<(), TimeTravelError> {
         self.attempt(RemoteOp::TimeTravelRecover(prefix.map(|p| p.to_owned())))
             .map_err(|e| TimeTravelError::Other(anyhow::Error::new(e)))?;
@@ -56,10 +56,9 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
             warn_threshold,
             max_retries,
             "test retry",
-            &CancellationToken::new(),
+            backoff::Cancel::new(CancellationToken::new(), || unreachable!()),
         )
         .await
-        .expect("never cancelled")
     }
 
     async fn time_point() -> SystemTime {
@@ -77,8 +76,6 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
         .collect::<HashSet<_>>())
     }
 
-    let cancel = CancellationToken::new();
-
     let path1 = RemotePath::new(Utf8Path::new(format!("{}/path1", ctx.base_prefix).as_str()))
         .with_context(|| "RemotePath conversion")?;
 
@@ -145,7 +142,7 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
     // No changes after recovery to t2 (no-op)
     let t_final = time_point().await;
     ctx.client
-        .time_travel_recover(None, t2, t_final, &cancel)
+        .time_travel_recover(None, t2, t_final, CancellationToken::new())
         .await?;
     let t2_files_recovered = list_files(&ctx.client).await?;
     println!("after recovery to t2: {t2_files_recovered:?}");
@@ -156,7 +153,7 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
     // after recovery to t1: path1 is back, path2 has the old content
     let t_final = time_point().await;
     ctx.client
-        .time_travel_recover(None, t1, t_final, &cancel)
+        .time_travel_recover(None, t1, t_final, CancellationToken::new())
         .await?;
     let t1_files_recovered = list_files(&ctx.client).await?;
     println!("after recovery to t1: {t1_files_recovered:?}");
@@ -167,7 +164,7 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
     // after recovery to t0: everything is gone except for path1
     let t_final = time_point().await;
     ctx.client
-        .time_travel_recover(None, t0, t_final, &cancel)
+        .time_travel_recover(None, t0, t_final, CancellationToken::new())
         .await?;
     let t0_files_recovered = list_files(&ctx.client).await?;
     println!("after recovery to t0: {t0_files_recovered:?}");
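The hunks above change `time_travel_recover` across every backend (the trait, Azure, LocalFs, S3, and the unreliable test wrapper) from borrowing `&CancellationToken` to taking the token by value. A minimal sketch of why that is cheap, assuming only the tokio and tokio-util crates; the stub function below is illustrative, not the crate's API:

```rust
use tokio_util::sync::CancellationToken;

// CancellationToken is a reference-counted handle, so an API can take it by
// value and the caller passes token.clone() without any lifetime juggling.
async fn time_travel_recover_stub(cancel: CancellationToken) -> Result<(), &'static str> {
    if cancel.is_cancelled() {
        return Err("cancelled");
    }
    Ok(())
}

#[tokio::main]
async fn main() {
    let cancel = CancellationToken::new();
    // Each call gets its own handle; cancelling one clone cancels them all.
    assert!(time_travel_recover_stub(cancel.clone()).await.is_ok());
    cancel.cancel();
    assert!(time_travel_recover_stub(cancel).await.is_err());
}
```

This is also why the tests above can simply pass a fresh `CancellationToken::new()` at each call site instead of keeping a shared `cancel` binding alive.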
@@ -37,53 +37,69 @@ pub fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_sec
     }
 }
 
-/// Retries passed operation until one of the following conditions are met:
-/// - encountered error is considered as permanent (non-retryable)
-/// - retries have been exhausted
-/// - cancellation token has been cancelled
-///
-/// `is_permanent` closure should be used to provide distinction between permanent/non-permanent
-/// errors. When attempts cross `warn_threshold` function starts to emit log warnings.
+/// Configure cancellation for a retried operation: when to cancel (the token), and
+/// what kind of error to return on cancellation
+pub struct Cancel<E, CF>
+where
+    E: Display + Debug + 'static,
+    CF: Fn() -> E,
+{
+    token: CancellationToken,
+    on_cancel: CF,
+}
+
+impl<E, CF> Cancel<E, CF>
+where
+    E: Display + Debug + 'static,
+    CF: Fn() -> E,
+{
+    pub fn new(token: CancellationToken, on_cancel: CF) -> Self {
+        Self { token, on_cancel }
+    }
+}
+
+/// retries passed operation until one of the following conditions are met:
+/// Encountered error is considered as permanent (non-retryable)
+/// Retries have been exhausted.
+/// `is_permanent` closure should be used to provide distinction between permanent/non-permanent errors
+/// When attempts cross `warn_threshold` function starts to emit log warnings.
 /// `description` argument is added to log messages. Its value should identify the `op` is doing
-/// `cancel` cancels new attempts and the backoff sleep.
 ///
 /// If attempts fail, they are being logged with `{:#}` which works for anyhow, but does not work
 /// for any other error type. Final failed attempt is logged with `{:?}`.
 ///
-/// Returns `None` if cancellation was noticed during backoff or the terminal result.
-pub async fn retry<T, O, F, E>(
+/// `cancel` argument is required: any time we are looping on retry, we should be using a CancellationToken
+/// to drop out promptly on shutdown.
+pub async fn retry<T, O, F, E, CF>(
     mut op: O,
     is_permanent: impl Fn(&E) -> bool,
     warn_threshold: u32,
     max_retries: u32,
     description: &str,
-    cancel: &CancellationToken,
-) -> Option<Result<T, E>>
+    cancel: Cancel<E, CF>,
+) -> Result<T, E>
 where
     // Not std::error::Error because anyhow::Error doesnt implement it.
     // For context see https://github.com/dtolnay/anyhow/issues/63
     E: Display + Debug + 'static,
     O: FnMut() -> F,
     F: Future<Output = Result<T, E>>,
+    CF: Fn() -> E,
 {
     let mut attempts = 0;
     loop {
-        if cancel.is_cancelled() {
-            return None;
+        if cancel.token.is_cancelled() {
+            return Err((cancel.on_cancel)());
         }
 
         let result = op().await;
-        match &result {
+        match result {
             Ok(_) => {
                 if attempts > 0 {
                     tracing::info!("{description} succeeded after {attempts} retries");
                 }
-                return Some(result);
+                return result;
             }
 
             // These are "permanent" errors that should not be retried.
-            Err(e) if is_permanent(e) => {
-                return Some(result);
+            Err(ref e) if is_permanent(e) => {
+                return result;
             }
             // Assume that any other failure might be transient, and the operation might
             // succeed if we just keep trying.
@@ -93,12 +109,12 @@ where
             Err(err) if attempts < max_retries => {
                 tracing::warn!("{description} failed, will retry (attempt {attempts}): {err:#}");
             }
-            Err(err) => {
+            Err(ref err) => {
                 // Operation failed `max_attempts` times. Time to give up.
                 tracing::warn!(
                     "{description} still failed after {attempts} retries, giving up: {err:?}"
                 );
-                return Some(result);
+                return result;
             }
         }
         // sleep and retry
@@ -106,7 +122,7 @@ where
             attempts,
             DEFAULT_BASE_BACKOFF_SECONDS,
             DEFAULT_MAX_BACKOFF_SECONDS,
-            cancel,
+            &cancel.token,
         )
         .await;
         attempts += 1;
@@ -115,10 +131,12 @@ where
 
 #[cfg(test)]
 mod tests {
-    use super::*;
     use std::io;
 
     use tokio::sync::Mutex;
 
+    use super::*;
+
     #[test]
     fn backoff_defaults_produce_growing_backoff_sequence() {
         let mut current_backoff_value = None;
@@ -148,7 +166,7 @@ mod tests {
     #[tokio::test(start_paused = true)]
     async fn retry_always_error() {
         let count = Mutex::new(0);
-        retry(
+        let err_result = retry(
             || async {
                 *count.lock().await += 1;
                 Result::<(), io::Error>::Err(io::Error::from(io::ErrorKind::Other))
@@ -157,11 +175,11 @@ mod tests {
             1,
             1,
             "work",
-            &CancellationToken::new(),
+            Cancel::new(CancellationToken::new(), || -> io::Error { unreachable!() }),
         )
-        .await
-        .expect("not cancelled")
-        .expect_err("it can only fail");
+        .await;
+
+        assert!(err_result.is_err());
 
         assert_eq!(*count.lock().await, 2);
     }
@@ -183,11 +201,10 @@ mod tests {
             2,
             2,
             "work",
-            &CancellationToken::new(),
+            Cancel::new(CancellationToken::new(), || -> io::Error { unreachable!() }),
         )
         .await
-        .expect("not cancelled")
-        .expect("success on second try");
+        .unwrap();
     }
 
     #[tokio::test(start_paused = true)]
@@ -207,11 +224,10 @@ mod tests {
             2,
             2,
             "work",
-            &CancellationToken::new(),
+            Cancel::new(CancellationToken::new(), || -> io::Error { unreachable!() }),
        )
        .await
-        .expect("was not cancellation")
-        .expect_err("it was permanent error");
+        .unwrap_err();
 
         assert_eq!(*count.lock().await, 1);
     }
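The rewrite above replaces `backoff::retry`'s `Option<Result<T, E>>` return with a plain `Result<T, E>` by bundling the cancellation token with an `on_cancel` error constructor, so callers no longer need `.ok_or_else(...).and_then(|x| x)` chains. Below is a self-contained sketch of the same pattern, simplified (no `is_permanent` or `warn_threshold`); names mirror the diff, but this is an illustration, not the crate's code:

```rust
use std::fmt::{Debug, Display};
use std::future::Future;

use tokio_util::sync::CancellationToken;

// Pairs "when to cancel" with "what error to return on cancellation".
pub struct Cancel<E, CF>
where
    E: Display + Debug + 'static,
    CF: Fn() -> E,
{
    token: CancellationToken,
    on_cancel: CF,
}

impl<E, CF> Cancel<E, CF>
where
    E: Display + Debug + 'static,
    CF: Fn() -> E,
{
    pub fn new(token: CancellationToken, on_cancel: CF) -> Self {
        Self { token, on_cancel }
    }
}

// Minimal retry loop in the new style: cancellation surfaces as `E`.
pub async fn retry<T, O, F, E, CF>(mut op: O, max_retries: u32, cancel: Cancel<E, CF>) -> Result<T, E>
where
    E: Display + Debug + 'static,
    O: FnMut() -> F,
    F: Future<Output = Result<T, E>>,
    CF: Fn() -> E,
{
    let mut attempts = 0;
    loop {
        if cancel.token.is_cancelled() {
            // The caller-supplied closure decides what "cancelled" means.
            return Err((cancel.on_cancel)());
        }
        match op().await {
            Ok(v) => return Ok(v),
            Err(e) if attempts >= max_retries => return Err(e),
            Err(_) => attempts += 1,
        }
        // The real implementation sleeps with exponential backoff here.
        tokio::time::sleep(std::time::Duration::from_millis(10)).await;
    }
}

#[tokio::main]
async fn main() {
    let token = CancellationToken::new();
    token.cancel();
    let res: Result<(), std::io::Error> = retry(
        || async { Ok(()) },
        3,
        Cancel::new(token, || std::io::Error::new(std::io::ErrorKind::Other, "shutting down")),
    )
    .await;
    assert!(res.is_err()); // already cancelled: the on_cancel error is returned
}
```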
@@ -1,6 +1,6 @@
 use std::sync::{
     atomic::{AtomicUsize, Ordering},
-    Arc,
+    Arc, Mutex, MutexGuard,
 };
 use tokio::sync::Semaphore;
 
@@ -12,7 +12,7 @@ use tokio::sync::Semaphore;
 ///
 /// [`OwnedSemaphorePermit`]: tokio::sync::OwnedSemaphorePermit
 pub struct OnceCell<T> {
-    inner: tokio::sync::RwLock<Inner<T>>,
+    inner: Mutex<Inner<T>>,
     initializers: AtomicUsize,
 }
 
@@ -50,7 +50,7 @@ impl<T> OnceCell<T> {
         let sem = Semaphore::new(1);
         sem.close();
         Self {
-            inner: tokio::sync::RwLock::new(Inner {
+            inner: Mutex::new(Inner {
                 init_semaphore: Arc::new(sem),
                 value: Some(value),
             }),
@@ -61,18 +61,18 @@ impl<T> OnceCell<T> {
     /// Returns a guard to an existing initialized value, or uniquely initializes the value before
     /// returning the guard.
     ///
-    /// Initializing might wait on any existing [`GuardMut::take_and_deinit`] deinitialization.
+    /// Initializing might wait on any existing [`Guard::take_and_deinit`] deinitialization.
     ///
     /// Initialization is panic-safe and cancellation-safe.
-    pub async fn get_mut_or_init<F, Fut, E>(&self, factory: F) -> Result<GuardMut<'_, T>, E>
+    pub async fn get_or_init<F, Fut, E>(&self, factory: F) -> Result<Guard<'_, T>, E>
     where
         F: FnOnce(InitPermit) -> Fut,
         Fut: std::future::Future<Output = Result<(T, InitPermit), E>>,
     {
         let sem = {
-            let guard = self.inner.write().await;
+            let guard = self.inner.lock().unwrap();
             if guard.value.is_some() {
-                return Ok(GuardMut(guard));
+                return Ok(Guard(guard));
             }
             guard.init_semaphore.clone()
         };
@@ -88,72 +88,29 @@ impl<T> OnceCell<T> {
                 let permit = InitPermit(permit);
                 let (value, _permit) = factory(permit).await?;
 
-                let guard = self.inner.write().await;
+                let guard = self.inner.lock().unwrap();
 
                 Ok(Self::set0(value, guard))
             }
             Err(_closed) => {
-                let guard = self.inner.write().await;
+                let guard = self.inner.lock().unwrap();
                 assert!(
                     guard.value.is_some(),
                     "semaphore got closed, must be initialized"
                 );
-                return Ok(GuardMut(guard));
+                return Ok(Guard(guard));
             }
         }
     }
 
-    /// Returns a guard to an existing initialized value, or uniquely initializes the value before
-    /// returning the guard.
-    ///
-    /// Initialization is panic-safe and cancellation-safe.
-    pub async fn get_or_init<F, Fut, E>(&self, factory: F) -> Result<GuardRef<'_, T>, E>
-    where
-        F: FnOnce(InitPermit) -> Fut,
-        Fut: std::future::Future<Output = Result<(T, InitPermit), E>>,
-    {
-        let sem = {
-            let guard = self.inner.read().await;
-            if guard.value.is_some() {
-                return Ok(GuardRef(guard));
-            }
-            guard.init_semaphore.clone()
-        };
-
-        let permit = {
-            // increment the count for the duration of queued
-            let _guard = CountWaitingInitializers::start(self);
-            sem.acquire_owned().await
-        };
-
-        match permit {
-            Ok(permit) => {
-                let permit = InitPermit(permit);
-                let (value, _permit) = factory(permit).await?;
-
-                let guard = self.inner.write().await;
-
-                Ok(Self::set0(value, guard).downgrade())
-            }
-            Err(_closed) => {
-                let guard = self.inner.read().await;
-                assert!(
-                    guard.value.is_some(),
-                    "semaphore got closed, must be initialized"
-                );
-                return Ok(GuardRef(guard));
-            }
-        }
-    }
-
-    /// Assuming a permit is held after previous call to [`GuardMut::take_and_deinit`], it can be used
+    /// Assuming a permit is held after previous call to [`Guard::take_and_deinit`], it can be used
     /// to complete initializing the inner value.
     ///
     /// # Panics
     ///
     /// If the inner has already been initialized.
-    pub async fn set(&self, value: T, _permit: InitPermit) -> GuardMut<'_, T> {
-        let guard = self.inner.write().await;
+    pub fn set(&self, value: T, _permit: InitPermit) -> Guard<'_, T> {
+        let guard = self.inner.lock().unwrap();
 
         // cannot assert that this permit is for self.inner.semaphore, but we can assert it cannot
         // give more permits right now.
@@ -165,31 +122,21 @@ impl<T> OnceCell<T> {
         Self::set0(value, guard)
     }
 
-    fn set0(value: T, mut guard: tokio::sync::RwLockWriteGuard<'_, Inner<T>>) -> GuardMut<'_, T> {
+    fn set0(value: T, mut guard: std::sync::MutexGuard<'_, Inner<T>>) -> Guard<'_, T> {
         if guard.value.is_some() {
             drop(guard);
             unreachable!("we won permit, must not be initialized");
         }
         guard.value = Some(value);
         guard.init_semaphore.close();
-        GuardMut(guard)
+        Guard(guard)
     }
 
     /// Returns a guard to an existing initialized value, if any.
-    pub async fn get_mut(&self) -> Option<GuardMut<'_, T>> {
-        let guard = self.inner.write().await;
-        if guard.value.is_some() {
-            Some(GuardMut(guard))
-        } else {
-            None
-        }
-    }
-
-    /// Returns a guard to an existing initialized value, if any.
-    pub async fn get(&self) -> Option<GuardRef<'_, T>> {
-        let guard = self.inner.read().await;
+    pub fn get(&self) -> Option<Guard<'_, T>> {
+        let guard = self.inner.lock().unwrap();
         if guard.value.is_some() {
-            Some(GuardRef(guard))
+            Some(Guard(guard))
         } else {
             None
         }
@@ -221,9 +168,9 @@ impl<'a, T> Drop for CountWaitingInitializers<'a, T> {
 /// Uninteresting guard object to allow short-lived access to inspect or clone the held,
 /// initialized value.
 #[derive(Debug)]
-pub struct GuardMut<'a, T>(tokio::sync::RwLockWriteGuard<'a, Inner<T>>);
+pub struct Guard<'a, T>(MutexGuard<'a, Inner<T>>);
 
-impl<T> std::ops::Deref for GuardMut<'_, T> {
+impl<T> std::ops::Deref for Guard<'_, T> {
     type Target = T;
 
     fn deref(&self) -> &Self::Target {
@@ -234,7 +181,7 @@ impl<T> std::ops::Deref for GuardMut<'_, T> {
     }
 }
 
-impl<T> std::ops::DerefMut for GuardMut<'_, T> {
+impl<T> std::ops::DerefMut for Guard<'_, T> {
     fn deref_mut(&mut self) -> &mut Self::Target {
         self.0
             .value
@@ -243,7 +190,7 @@ impl<T> std::ops::DerefMut for GuardMut<'_, T> {
     }
 }
 
-impl<'a, T> GuardMut<'a, T> {
+impl<'a, T> Guard<'a, T> {
     /// Take the current value, and a new permit for it's deinitialization.
     ///
     /// The permit will be on a semaphore part of the new internal value, and any following
@@ -261,24 +208,6 @@ impl<'a, T> Guard<'a, T> {
             .map(|v| (v, InitPermit(permit)))
             .expect("guard is not created unless value has been initialized")
     }
-
-    pub fn downgrade(self) -> GuardRef<'a, T> {
-        GuardRef(self.0.downgrade())
-    }
-}
-
-#[derive(Debug)]
-pub struct GuardRef<'a, T>(tokio::sync::RwLockReadGuard<'a, Inner<T>>);
-
-impl<T> std::ops::Deref for GuardRef<'_, T> {
-    type Target = T;
-
-    fn deref(&self) -> &Self::Target {
-        self.0
-            .value
-            .as_ref()
-            .expect("guard is not created unless value has been initialized")
-    }
 }
 
 /// Type held by OnceCell (de)initializing task.
@@ -319,7 +248,7 @@ mod tests {
             barrier.wait().await;
             let won = {
                 let g = cell
-                    .get_mut_or_init(|permit| {
+                    .get_or_init(|permit| {
                         counters.factory_got_to_run.fetch_add(1, Ordering::Relaxed);
                         async {
                             counters.future_polled.fetch_add(1, Ordering::Relaxed);
@@ -366,11 +295,7 @@ mod tests {
             let cell = cell.clone();
            let deinitialization_started = deinitialization_started.clone();
            async move {
-                let (answer, _permit) = cell
-                    .get_mut()
-                    .await
-                    .expect("initialized to value")
-                    .take_and_deinit();
+                let (answer, _permit) = cell.get().expect("initialized to value").take_and_deinit();
                assert_eq!(answer, initial);
 
                deinitialization_started.wait().await;
@@ -381,7 +306,7 @@ mod tests {
         deinitialization_started.wait().await;
 
         let started_at = tokio::time::Instant::now();
-        cell.get_mut_or_init(|permit| async { Ok::<_, Infallible>((reinit, permit)) })
+        cell.get_or_init(|permit| async { Ok::<_, Infallible>((reinit, permit)) })
             .await
             .unwrap();
 
@@ -393,21 +318,21 @@ mod tests {
 
         jh.await.unwrap();
 
-        assert_eq!(*cell.get_mut().await.unwrap(), reinit);
+        assert_eq!(*cell.get().unwrap(), reinit);
     }
 
-    #[tokio::test]
-    async fn reinit_with_deinit_permit() {
+    #[test]
+    fn reinit_with_deinit_permit() {
         let cell = Arc::new(OnceCell::new(42));
 
-        let (mol, permit) = cell.get_mut().await.unwrap().take_and_deinit();
-        cell.set(5, permit).await;
-        assert_eq!(*cell.get_mut().await.unwrap(), 5);
+        let (mol, permit) = cell.get().unwrap().take_and_deinit();
+        cell.set(5, permit);
+        assert_eq!(*cell.get().unwrap(), 5);
 
-        let (five, permit) = cell.get_mut().await.unwrap().take_and_deinit();
+        let (five, permit) = cell.get().unwrap().take_and_deinit();
         assert_eq!(5, five);
-        cell.set(mol, permit).await;
-        assert_eq!(*cell.get_mut().await.unwrap(), 42);
+        cell.set(mol, permit);
+        assert_eq!(*cell.get().unwrap(), 42);
     }
 
     #[tokio::test]
@@ -415,13 +340,13 @@ mod tests {
         let cell = OnceCell::default();
 
         for _ in 0..10 {
-            cell.get_mut_or_init(|_permit| async { Err("whatever error") })
+            cell.get_or_init(|_permit| async { Err("whatever error") })
                 .await
                 .unwrap_err();
         }
 
         let g = cell
-            .get_mut_or_init(|permit| async { Ok::<_, Infallible>(("finally success", permit)) })
+            .get_or_init(|permit| async { Ok::<_, Infallible>(("finally success", permit)) })
             .await
             .unwrap();
         assert_eq!(*g, "finally success");
@@ -433,7 +358,7 @@ mod tests {
 
         let barrier = tokio::sync::Barrier::new(2);
 
-        let initializer = cell.get_mut_or_init(|permit| async {
+        let initializer = cell.get_or_init(|permit| async {
             barrier.wait().await;
             futures::future::pending::<()>().await;
 
@@ -447,10 +372,10 @@ mod tests {
 
         // now initializer is dropped
 
-        assert!(cell.get_mut().await.is_none());
+        assert!(cell.get().is_none());
 
         let g = cell
-            .get_mut_or_init(|permit| async { Ok::<_, Infallible>(("now initialized", permit)) })
+            .get_or_init(|permit| async { Ok::<_, Infallible>(("now initialized", permit)) })
             .await
             .unwrap();
         assert_eq!(*g, "now initialized");
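The diff above swaps the cell's interior lock from `tokio::sync::RwLock` (with separate `GuardMut`/`GuardRef` types) to a single `std::sync::Mutex` guard. That is sound because the mutex is only held for short critical sections that never cross an `.await`; all waiting happens on the init semaphore instead. A much-simplified, self-contained sketch of that shape (not the crate's `OnceCell`; it drops the permit, guard, and deinit machinery):

```rust
use std::sync::{Arc, Mutex};
use tokio::sync::Semaphore;

struct Cell<T> {
    // Held only for brief, non-awaiting critical sections.
    inner: Mutex<Option<T>>,
    // The async wait point: one permit, closed once initialized.
    init: Arc<Semaphore>,
}

impl<T: Clone> Cell<T> {
    fn new() -> Self {
        Self { inner: Mutex::new(None), init: Arc::new(Semaphore::new(1)) }
    }

    async fn get_or_init<F>(&self, factory: F) -> T
    where
        F: FnOnce() -> T,
    {
        // Fast path: lock, check, unlock; never across an .await point.
        if let Some(v) = self.inner.lock().unwrap().clone() {
            return v;
        }
        // Slow path: one winner initializes; losers wait on the semaphore.
        match self.init.clone().acquire_owned().await {
            Ok(_permit) => {
                let mut guard = self.inner.lock().unwrap();
                if guard.is_none() {
                    *guard = Some(factory());
                    self.init.close(); // further acquires now fail fast
                }
                guard.clone().unwrap()
            }
            // A closed semaphore means someone already initialized the value.
            Err(_closed) => self.inner.lock().unwrap().clone().unwrap(),
        }
    }
}

#[tokio::main]
async fn main() {
    let cell = Cell::new();
    let a = cell.get_or_init(|| 42).await;
    let b = cell.get_or_init(|| 7).await; // factory is not run again
    assert_eq!((a, b), (42, 42));
}
```

Collapsing the two guard types into one also lets `get` and `set` become plain synchronous functions, which is why the `reinit_with_deinit_permit` test above turns from `#[tokio::test]` into a plain `#[test]`.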
@@ -20,13 +20,13 @@
 //!
 //! // Then, in the main code:
 //!
-//! let span = tracing::info_span!("TestSpan", tenant_id = 1);
+//! let span = tracing::info_span!("TestSpan", test_id = 1);
 //! let _guard = span.enter();
 //!
 //! // ... down the call stack
 //!
-//! use utils::tracing_span_assert::{check_fields_present, ConstExtractor};
-//! let extractor = ConstExtractor::new("tenant_id");
+//! use utils::tracing_span_assert::{check_fields_present, MultiNameExtractor};
+//! let extractor = MultiNameExtractor::new("TestExtractor", ["test", "test_id"]);
 //! if let Err(missing) = check_fields_present!([&extractor]) {
 //!     // if you copypaste this to a custom assert method, remember to add #[track_caller]
 //!     // to get the "user" code location for the panic.
@@ -45,26 +45,27 @@ pub enum ExtractionResult {
 }
 
 pub trait Extractor: Send + Sync + std::fmt::Debug {
-    fn id(&self) -> &str;
+    fn name(&self) -> &str;
     fn extract(&self, fields: &tracing::field::FieldSet) -> ExtractionResult;
 }
 
 #[derive(Debug)]
-pub struct ConstExtractor {
-    field_name: &'static str,
+pub struct MultiNameExtractor<const L: usize> {
+    name: &'static str,
+    field_names: [&'static str; L],
 }
 
-impl ConstExtractor {
-    pub const fn new(field_name: &'static str) -> ConstExtractor {
-        ConstExtractor { field_name }
+impl<const L: usize> MultiNameExtractor<L> {
+    pub fn new(name: &'static str, field_names: [&'static str; L]) -> MultiNameExtractor<L> {
+        MultiNameExtractor { name, field_names }
     }
 }
-impl Extractor for ConstExtractor {
-    fn id(&self) -> &str {
-        self.field_name
+impl<const L: usize> Extractor for MultiNameExtractor<L> {
+    fn name(&self) -> &str {
+        self.name
     }
     fn extract(&self, fields: &tracing::field::FieldSet) -> ExtractionResult {
-        if fields.iter().any(|f| f.name() == self.field_name) {
+        if fields.iter().any(|f| self.field_names.contains(&f.name())) {
             ExtractionResult::Present
         } else {
             ExtractionResult::Absent
@@ -202,19 +203,19 @@ mod tests {
     }
     impl<'a> fmt::Debug for MemoryIdentity<'a> {
         fn fmt(&self, f: &mut fmt::Formatter<'_>) -> std::fmt::Result {
-            write!(f, "{:p}: {}", self.as_ptr(), self.0.id())
+            write!(f, "{:p}: {}", self.as_ptr(), self.0.name())
         }
     }
 
     struct Setup {
         _current_thread_subscriber_guard: tracing::subscriber::DefaultGuard,
-        tenant_extractor: ConstExtractor,
-        timeline_extractor: ConstExtractor,
+        tenant_extractor: MultiNameExtractor<2>,
+        timeline_extractor: MultiNameExtractor<2>,
     }
 
     fn setup_current_thread() -> Setup {
-        let tenant_extractor = ConstExtractor::new("tenant_id");
-        let timeline_extractor = ConstExtractor::new("timeline_id");
+        let tenant_extractor = MultiNameExtractor::new("TenantId", ["tenant_id", "tenant"]);
+        let timeline_extractor = MultiNameExtractor::new("TimelineId", ["timeline_id", "timeline"]);
 
         let registry = tracing_subscriber::registry()
             .with(tracing_subscriber::fmt::layer())
@@ -342,12 +343,12 @@ mod tests {
         let span = tracing::info_span!("foo", e = "some value");
         let _guard = span.enter();
 
-        let extractor = ConstExtractor::new("e");
+        let extractor = MultiNameExtractor::new("E", ["e"]);
         let res = check_fields_present0([&extractor]);
         assert!(matches!(res, Ok(Summary::Unconfigured)), "{res:?}");
 
         // similarly for a not found key
-        let extractor = ConstExtractor::new("foobar");
+        let extractor = MultiNameExtractor::new("F", ["foobar"]);
         let res = check_fields_present0([&extractor]);
         assert!(matches!(res, Ok(Summary::Unconfigured)), "{res:?}");
     }
@@ -367,14 +368,16 @@ mod tests {
         // normally this would work, but without any tracing-subscriber configured, both
         // check_field_present find nothing
         let _guard = subspan.enter();
-        let extractors: [&dyn Extractor; 2] =
-            [&ConstExtractor::new("e"), &ConstExtractor::new("f")];
+        let extractors: [&dyn Extractor; 2] = [
+            &MultiNameExtractor::new("E", ["e"]),
+            &MultiNameExtractor::new("F", ["f"]),
+        ];
 
         let res = check_fields_present0(extractors);
         assert!(matches!(res, Ok(Summary::Unconfigured)), "{res:?}");
 
         // similarly for a not found key
-        let extractor = ConstExtractor::new("g");
+        let extractor = MultiNameExtractor::new("G", ["g"]);
         let res = check_fields_present0([&extractor]);
         assert!(matches!(res, Ok(Summary::Unconfigured)), "{res:?}");
     }
@@ -407,7 +410,7 @@ mod tests {
         let span = tracing::info_span!("foo", e = "some value");
         let _guard = span.enter();
 
-        let extractors: [&dyn Extractor; 1] = [&ConstExtractor::new("e")];
+        let extractors: [&dyn Extractor; 1] = [&MultiNameExtractor::new("E", ["e"])];
 
         if span.is_disabled() {
             // the tests are running single threaded, or we got lucky and no other tests subscriber
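The change above generalizes `ConstExtractor` (one exact field name) into `MultiNameExtractor` (any of several aliases, e.g. `tenant_id` or `tenant`), using a const-generic array of names. A tiny standalone version of the matching logic, with plain string slices standing in for `tracing::field::FieldSet`:

```rust
// Illustrative reimplementation of the alias matching; not the utils crate.
struct MultiNameExtractor<const L: usize> {
    name: &'static str,
    field_names: [&'static str; L],
}

impl<const L: usize> MultiNameExtractor<L> {
    fn new(name: &'static str, field_names: [&'static str; L]) -> Self {
        Self { name, field_names }
    }
    // Present if any alias appears among the span's field names.
    fn is_present(&self, fields: &[&str]) -> bool {
        self.field_names.iter().any(|n| fields.contains(n))
    }
}

fn main() {
    let tenant = MultiNameExtractor::new("TenantId", ["tenant_id", "tenant"]);
    assert!(tenant.is_present(&["tenant", "timeline_id"])); // alias matches
    assert!(!tenant.is_present(&["timeline_id"]));
    println!("extractor {} ok", tenant.name);
}
```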
@@ -79,12 +79,6 @@ impl KeyRange {
     }
 }
 
-#[derive(PartialEq, Eq, Hash, Copy, Clone)]
-struct WorkerId {
-    timeline: TenantTimelineId,
-    num_client: usize, // from 0..args.num_clients
-}
-
 #[derive(serde::Serialize)]
 struct Output {
     total: request_stats::Output,
@@ -212,7 +206,7 @@ async fn main_impl(
 
     let live_stats = Arc::new(LiveStats::default());
 
-    let num_client_tasks = args.num_clients.get() * timelines.len();
+    let num_client_tasks = timelines.len();
     let num_live_stats_dump = 1;
     let num_work_sender_tasks = 1;
     let num_main_impl = 1;
@@ -241,25 +235,19 @@ async fn main_impl(
 
     let cancel = CancellationToken::new();
 
-    let mut work_senders: HashMap<WorkerId, _> = HashMap::new();
+    let mut work_senders: HashMap<TenantTimelineId, _> = HashMap::new();
     let mut tasks = Vec::new();
-    for timeline in timelines.iter().cloned() {
-        for num_client in 0..args.num_clients.get() {
-            let (sender, receiver) = tokio::sync::mpsc::channel(10); // TODO: not sure what the implications of this are
-            let worker_id = WorkerId {
-                timeline,
-                num_client,
-            };
-            work_senders.insert(worker_id, sender);
-            tasks.push(tokio::spawn(client(
-                args,
-                worker_id,
-                Arc::clone(&start_work_barrier),
-                receiver,
-                Arc::clone(&live_stats),
-                cancel.clone(),
-            )));
-        }
+    for tl in &timelines {
+        let (sender, receiver) = tokio::sync::mpsc::channel(10); // TODO: not sure what the implications of this are
+        work_senders.insert(*tl, sender);
+        tasks.push(tokio::spawn(client(
+            args,
+            *tl,
+            Arc::clone(&start_work_barrier),
+            receiver,
+            Arc::clone(&live_stats),
+            cancel.clone(),
+        )));
     }
 
     let work_sender: Pin<Box<dyn Send + Future<Output = ()>>> = {
@@ -283,10 +271,7 @@ async fn main_impl(
                     let (rel_tag, block_no) =
                         key_to_rel_block(key).expect("we filter non-rel-block keys out above");
                     (
-                        WorkerId {
-                            timeline: r.timeline,
-                            num_client: rng.gen_range(0..args.num_clients.get()),
-                        },
+                        r.timeline,
                         PagestreamGetPageRequest {
                             latest: rng.gen_bool(args.req_latest_probability),
                             lsn: r.timeline_lsn,
@@ -304,54 +289,56 @@ async fn main_impl(
             }),
             Some(rps_limit) => Box::pin(async move {
                 let period = Duration::from_secs_f64(1.0 / (rps_limit as f64));
-                let make_task: &dyn Fn(WorkerId) -> Pin<Box<dyn Send + Future<Output = ()>>> =
-                    &|worker_id| {
-                        let sender = work_senders.get(&worker_id).unwrap();
-                        let ranges: Vec<KeyRange> = all_ranges
-                            .iter()
-                            .filter(|r| r.timeline == worker_id.timeline)
-                            .cloned()
-                            .collect();
-                        let weights = rand::distributions::weighted::WeightedIndex::new(
-                            ranges.iter().map(|v| v.len()),
-                        )
-                        .unwrap();
-
-                        let cancel = cancel.clone();
-                        Box::pin(async move {
-                            let mut ticker = tokio::time::interval(period);
-                            ticker.set_missed_tick_behavior(
-                                /* TODO review this choice */
-                                tokio::time::MissedTickBehavior::Burst,
-                            );
-                            while !cancel.is_cancelled() {
-                                ticker.tick().await;
-                                let req = {
-                                    let mut rng = rand::thread_rng();
-                                    let r = &ranges[weights.sample(&mut rng)];
-                                    let key: i128 = rng.gen_range(r.start..r.end);
-                                    let key = Key::from_i128(key);
-                                    assert!(is_rel_block_key(&key));
-                                    let (rel_tag, block_no) = key_to_rel_block(key)
-                                        .expect("we filter non-rel-block keys out above");
-                                    PagestreamGetPageRequest {
-                                        latest: rng.gen_bool(args.req_latest_probability),
-                                        lsn: r.timeline_lsn,
-                                        rel: rel_tag,
-                                        blkno: block_no,
-                                    }
-                                };
-                                if sender.send(req).await.is_err() {
-                                    assert!(
-                                        cancel.is_cancelled(),
-                                        "client has gone away unexpectedly"
-                                    );
-                                }
-                            }
-                        })
-                    };
+                let make_timeline_task: &dyn Fn(
+                    TenantTimelineId,
+                )
+                    -> Pin<Box<dyn Send + Future<Output = ()>>> = &|timeline| {
+                    let sender = work_senders.get(&timeline).unwrap();
+                    let ranges: Vec<KeyRange> = all_ranges
+                        .iter()
+                        .filter(|r| r.timeline == timeline)
+                        .cloned()
+                        .collect();
+                    let weights = rand::distributions::weighted::WeightedIndex::new(
+                        ranges.iter().map(|v| v.len()),
+                    )
+                    .unwrap();
+
+                    let cancel = cancel.clone();
+                    Box::pin(async move {
+                        let mut ticker = tokio::time::interval(period);
+                        ticker.set_missed_tick_behavior(
+                            /* TODO review this choice */
+                            tokio::time::MissedTickBehavior::Burst,
+                        );
+                        while !cancel.is_cancelled() {
+                            ticker.tick().await;
+                            let req = {
+                                let mut rng = rand::thread_rng();
+                                let r = &ranges[weights.sample(&mut rng)];
+                                let key: i128 = rng.gen_range(r.start..r.end);
+                                let key = Key::from_i128(key);
+                                assert!(is_rel_block_key(&key));
+                                let (rel_tag, block_no) = key_to_rel_block(key)
+                                    .expect("we filter non-rel-block keys out above");
+                                PagestreamGetPageRequest {
+                                    latest: rng.gen_bool(args.req_latest_probability),
+                                    lsn: r.timeline_lsn,
+                                    rel: rel_tag,
+                                    blkno: block_no,
+                                }
+                            };
+                            if sender.send(req).await.is_err() {
+                                assert!(cancel.is_cancelled(), "client has gone away unexpectedly");
+                            }
+                        }
+                    })
+                };
 
-                let tasks: Vec<_> = work_senders.keys().map(|tl| make_task(*tl)).collect();
+                let tasks: Vec<_> = work_senders
+                    .keys()
+                    .map(|tl| make_timeline_task(*tl))
+                    .collect();
 
                 start_work_barrier.wait().await;
 
@@ -403,16 +390,12 @@ async fn main_impl(
 #[instrument(skip_all)]
 async fn client(
     args: &'static Args,
-    id: WorkerId,
+    timeline: TenantTimelineId,
     start_work_barrier: Arc<Barrier>,
     mut work: tokio::sync::mpsc::Receiver<PagestreamGetPageRequest>,
     live_stats: Arc<LiveStats>,
     cancel: CancellationToken,
 ) {
-    let WorkerId {
-        timeline,
-        num_client: _,
-    } = id;
     let client = pageserver_client::page_service::Client::new(args.page_service_connstring.clone())
         .await
         .unwrap();
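The benchmark above collapses per-client `WorkerId` workers into one worker per `TenantTimelineId`; the work sender still picks a key range per tick with a `WeightedIndex`, so longer ranges are sampled proportionally more often. A runnable sketch of just that sampling step, assuming the rand 0.8 crate and arbitrary demo range bounds:

```rust
use rand::distributions::WeightedIndex;
use rand::prelude::*;

fn main() {
    // (start, end) key ranges; the longer range should be hit ~10x as often.
    let ranges = [(0u64, 10), (10, 110)];
    // Weight each range by its length, as the benchmark does with r.len().
    let weights = WeightedIndex::new(ranges.iter().map(|(s, e)| e - s)).unwrap();
    let mut rng = rand::thread_rng();
    for _ in 0..5 {
        let (start, end) = ranges[weights.sample(&mut rng)];
        let key = rng.gen_range(start..end); // uniform within the chosen range
        println!("sampled key {key} from range {start}..{end}");
    }
}
```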
@@ -262,33 +262,35 @@ async fn upload(
|
||||
) -> Result<(), UploadError> {
|
||||
let warn_after = 3;
|
||||
let max_attempts = 10;
|
||||
|
||||
// this is used only with tests so far
|
||||
let last_value = if is_last { "true" } else { "false" };
|
||||
|
||||
let res = utils::backoff::retry(
|
||||
|| async {
|
||||
let res = client
|
||||
.post(metric_collection_endpoint.clone())
|
||||
.header(reqwest::header::CONTENT_TYPE, "application/json")
|
||||
.header(LAST_IN_BATCH.clone(), last_value)
|
||||
.body(body.clone())
|
||||
.send()
|
||||
.await;
|
||||
move || {
|
||||
let body = body.clone();
|
||||
async move {
|
||||
let res = client
|
||||
.post(metric_collection_endpoint.clone())
|
||||
.header(reqwest::header::CONTENT_TYPE, "application/json")
|
||||
.header(
|
||||
LAST_IN_BATCH.clone(),
|
||||
if is_last { "true" } else { "false" },
|
||||
)
|
||||
.body(body)
|
||||
.send()
|
||||
.await;
|
||||
|
||||
let res = res.and_then(|res| res.error_for_status());
|
||||
let res = res.and_then(|res| res.error_for_status());
|
||||
|
||||
// 10 redirects are normally allowed, so we don't need worry about 3xx
|
||||
match res {
|
||||
Ok(_response) => Ok(()),
|
||||
Err(e) => {
|
||||
let status = e.status().filter(|s| s.is_client_error());
|
||||
if let Some(status) = status {
|
||||
// rejection used to be a thing when the server could reject a
|
||||
// whole batch of metrics if one metric was bad.
|
||||
Err(UploadError::Rejected(status))
|
||||
} else {
|
||||
Err(UploadError::Reqwest(e))
|
||||
// 10 redirects are normally allowed, so we don't need worry about 3xx
|
||||
match res {
|
||||
Ok(_response) => Ok(()),
|
||||
Err(e) => {
|
||||
let status = e.status().filter(|s| s.is_client_error());
|
||||
if let Some(status) = status {
|
||||
// rejection used to be a thing when the server could reject a
|
||||
// whole batch of metrics if one metric was bad.
|
||||
Err(UploadError::Rejected(status))
|
||||
} else {
|
||||
Err(UploadError::Reqwest(e))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -297,11 +299,9 @@ async fn upload(
|
||||
warn_after,
|
||||
max_attempts,
|
||||
"upload consumption_metrics",
|
||||
cancel,
|
||||
utils::backoff::Cancel::new(cancel.clone(), || UploadError::Cancelled),
|
||||
)
|
||||
.await
|
||||
.ok_or_else(|| UploadError::Cancelled)
|
||||
.and_then(|x| x);
|
||||
.await;
|
||||
|
||||
match &res {
|
||||
Ok(_) => {}
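The `utils::backoff::Cancel` change above recurs throughout this diff: `retry` previously returned `Option<Result<T>>` (`None` on cancellation), forcing callers to chain `.ok_or_else(..).and_then(|x| x)`; the new shape pairs a cancellation token with an error constructor and yields `Result<T, E>` directly. A simplified sketch of that shape, a stand-in for the real `utils::backoff::retry` rather than its actual implementation:

use tokio_util::sync::CancellationToken;

// Pairs a token with the error to surface when cancelled.
pub struct Cancel<E, F: Fn() -> E> {
    token: CancellationToken,
    make_error: F,
}

impl<E, F: Fn() -> E> Cancel<E, F> {
    pub fn new(token: CancellationToken, make_error: F) -> Self {
        Self { token, make_error }
    }
}

pub async fn retry<T, E, Op, Fut, F>(
    mut op: Op,
    max_attempts: u32,
    cancel: Cancel<E, F>,
) -> Result<T, E>
where
    Op: FnMut() -> Fut,
    Fut: std::future::Future<Output = Result<T, E>>,
    F: Fn() -> E,
{
    let mut last_err = None;
    for _ in 0..max_attempts {
        if cancel.token.is_cancelled() {
            return Err((cancel.make_error)());
        }
        match op().await {
            Ok(v) => return Ok(v),
            // A real implementation would sleep with exponential backoff here.
            Err(e) => last_err = Some(e),
        }
    }
    Err(last_err.unwrap_or_else(|| (cancel.make_error)()))
}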

@@ -82,29 +82,46 @@ impl ControlPlaneClient {
R: Serialize,
T: DeserializeOwned,
{
let res = backoff::retry(
#[derive(thiserror::Error, Debug)]
enum RemoteAttemptError {
#[error("shutdown")]
Shutdown,
#[error("remote: {0}")]
Remote(reqwest::Error),
}

match backoff::retry(
|| async {
let response = self
.http_client
.post(url.clone())
.json(&request)
.send()
.await?;
.await
.map_err(RemoteAttemptError::Remote)?;

response.error_for_status_ref()?;
response.json::<T>().await
response
.error_for_status_ref()
.map_err(RemoteAttemptError::Remote)?;
response
.json::<T>()
.await
.map_err(RemoteAttemptError::Remote)
},
|_| false,
3,
u32::MAX,
"calling control plane generation validation API",
&self.cancel,
backoff::Cancel::new(self.cancel.clone(), || RemoteAttemptError::Shutdown),
)
.await
.ok_or(RetryForeverError::ShuttingDown)?
.expect("We retry forever, this should never be reached");

Ok(res)
{
Err(RemoteAttemptError::Shutdown) => Err(RetryForeverError::ShuttingDown),
Err(RemoteAttemptError::Remote(_)) => {
panic!("We retry forever, this should never be reached");
}
Ok(r) => Ok(r),
}
}
}
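The hunk above replaces `?`-propagation of raw `reqwest::Error` with a dedicated attempt-error enum, so cancellation is distinguishable from remote failure when the retry loop runs forever. A reduced sketch of that control flow; the `retry_forever` helper here is hypothetical:

#[derive(Debug)]
enum AttemptError {
    Shutdown,
    Remote(String), // stand-in for reqwest::Error
}

#[derive(Debug)]
enum RetryForeverError {
    ShuttingDown,
}

async fn retry_forever<T, Op, Fut>(mut op: Op) -> Result<T, RetryForeverError>
where
    Op: FnMut() -> Fut,
    Fut: std::future::Future<Output = Result<T, AttemptError>>,
{
    loop {
        match op().await {
            Ok(v) => return Ok(v),
            // Shutdown is the only exit besides success; remote errors
            // are retried indefinitely.
            Err(AttemptError::Shutdown) => return Err(RetryForeverError::ShuttingDown),
            Err(AttemptError::Remote(e)) => eprintln!("retrying: {e}"),
        }
    }
}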

@@ -700,6 +700,8 @@ impl DeletionQueue {
}

pub async fn shutdown(&mut self, timeout: Duration) {
self.cancel.cancel();

match tokio::time::timeout(timeout, self.client.flush()).await {
Ok(Ok(())) => {
tracing::info!("Deletion queue flushed successfully on shutdown")
@@ -713,10 +715,6 @@ impl DeletionQueue {
tracing::warn!("Timed out flushing deletion queue on shutdown")
}
}

// We only cancel _after_ flushing: otherwise we would be shutting down the
// components that do the flush.
self.cancel.cancel();
}
}
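The hunk above moves `self.cancel.cancel()` from the start of `shutdown` to after the flush, because cancelling first would tear down the very components that perform the flush. A minimal sketch of the resulting ordering, with stand-in names:

use std::time::Duration;
use tokio_util::sync::CancellationToken;

async fn shutdown(
    cancel: &CancellationToken,
    flush: impl std::future::Future<Output = Result<(), String>>,
    timeout: Duration,
) {
    match tokio::time::timeout(timeout, flush).await {
        Ok(Ok(())) => println!("flushed successfully on shutdown"),
        Ok(Err(e)) => println!("failed to flush on shutdown: {e}"),
        Err(_) => println!("timed out flushing on shutdown"),
    }
    // Cancel only after the flush has had its chance to run, so the
    // components that perform the flush are still alive while it runs.
    cancel.cancel();
}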

@@ -77,11 +77,9 @@ impl Deleter {
3,
10,
"executing deletion batch",
&self.cancel,
backoff::Cancel::new(self.cancel.clone(), || anyhow::anyhow!("Shutting down")),
)
.await
.ok_or_else(|| anyhow::anyhow!("Shutting down"))
.and_then(|x| x)
}

/// Block until everything in accumulator has been executed

@@ -489,12 +489,6 @@ async fn timeline_create_handler(

tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;

if let Some(ancestor_id) = request_data.ancestor_timeline_id.as_ref() {
tracing::info!(%ancestor_id, "starting to branch");
} else {
tracing::info!("bootstrapping");
}

match tenant.create_timeline(
new_timeline_id,
request_data.ancestor_timeline_id.map(TimelineId::from),
@@ -535,7 +529,7 @@ async fn timeline_create_handler(
}
.instrument(info_span!("timeline_create",
tenant_id = %tenant_shard_id.tenant_id,
shard_id = %tenant_shard_id.shard_slug(),
shard = %tenant_shard_id.shard_slug(),
timeline_id = %new_timeline_id, lsn=?request_data.ancestor_start_lsn, pg_version=?request_data.pg_version))
.await
}
@@ -831,7 +825,7 @@ async fn timeline_delete_handler(
}
})?;
tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
tenant.delete_timeline(timeline_id).instrument(info_span!("timeline_delete", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id))
tenant.delete_timeline(timeline_id).instrument(info_span!("timeline_delete", tenant_id=%tenant_shard_id.tenant_id, shard=%tenant_shard_id.shard_slug(), %timeline_id))
.await?;

json_response(StatusCode::ACCEPTED, ())
@@ -856,7 +850,7 @@ async fn tenant_detach_handler(
detach_ignored.unwrap_or(false),
&state.deletion_queue_client,
)
.instrument(info_span!("tenant_detach", %tenant_id, shard_id=%tenant_shard_id.shard_slug()))
.instrument(info_span!("tenant_detach", %tenant_id))
.await?;

json_response(StatusCode::OK, ())
@@ -1007,7 +1001,7 @@ async fn tenant_delete_handler(
.delete_tenant(tenant_shard_id, ACTIVE_TENANT_TIMEOUT)
.instrument(info_span!("tenant_delete_handler",
tenant_id = %tenant_shard_id.tenant_id,
shard_id = %tenant_shard_id.shard_slug()
shard = %tenant_shard_id.shard_slug()
))
.await?;

@@ -1363,7 +1357,7 @@ async fn put_tenant_location_config_handler(
mgr::detach_tenant(conf, tenant_shard_id, true, &state.deletion_queue_client)
.instrument(info_span!("tenant_detach",
tenant_id = %tenant_shard_id.tenant_id,
shard_id = %tenant_shard_id.shard_slug()
shard = %tenant_shard_id.shard_slug()
))
.await
{

@@ -17,7 +17,6 @@ pub mod page_cache;
pub mod page_service;
pub mod pgdatadir_mapping;
pub mod repository;
pub mod span;
pub(crate) mod statvfs;
pub mod task_mgr;
pub mod tenant;

@@ -63,10 +63,9 @@ use crate::import_datadir::import_wal_from_tar;
use crate::metrics;
use crate::metrics::LIVE_CONNECTIONS_COUNT;
use crate::pgdatadir_mapping::Version;
use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id;
use crate::task_mgr;
use crate::task_mgr::TaskKind;
use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
use crate::tenant::mgr;
use crate::tenant::mgr::get_active_tenant_with_timeout;
use crate::tenant::mgr::GetActiveTenantError;
@@ -550,7 +549,7 @@ impl PageServerHandler {
where
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
{
debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id();
debug_assert_current_span_has_tenant_and_timeline_id();

let tenant = mgr::get_active_tenant_with_timeout(
tenant_id,
@@ -632,7 +631,6 @@ impl PageServerHandler {
)
}
PagestreamFeMessage::GetPage(req) => {
// shard_id is filled in by the handler
let span = tracing::info_span!("handle_get_page_at_lsn_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.lsn);
(
self.handle_get_page_at_lsn_request(tenant_id, timeline_id, &req, &ctx)
@@ -721,7 +719,7 @@ impl PageServerHandler {
where
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
{
debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id();
debug_assert_current_span_has_tenant_and_timeline_id();

// Create empty timeline
info!("creating new timeline");
@@ -774,7 +772,7 @@ impl PageServerHandler {
Ok(())
}

#[instrument(skip_all, fields(shard_id, %start_lsn, %end_lsn))]
#[instrument(skip_all, fields(%start_lsn, %end_lsn))]
async fn handle_import_wal<IO>(
&self,
pgb: &mut PostgresBackend<IO>,
@@ -787,6 +785,8 @@ impl PageServerHandler {
where
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
{
debug_assert_current_span_has_tenant_and_timeline_id();

let timeline = self
.get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero)
.await?;
@@ -893,7 +893,6 @@ impl PageServerHandler {
Ok(lsn)
}

#[instrument(skip_all, fields(shard_id))]
async fn handle_get_rel_exists_request(
&mut self,
tenant_id: TenantId,
@@ -920,7 +919,6 @@ impl PageServerHandler {
}))
}

#[instrument(skip_all, fields(shard_id))]
async fn handle_get_nblocks_request(
&mut self,
tenant_id: TenantId,
@@ -948,7 +946,6 @@ impl PageServerHandler {
}))
}

#[instrument(skip_all, fields(shard_id))]
async fn handle_db_size_request(
&mut self,
tenant_id: TenantId,
@@ -1099,7 +1096,6 @@ impl PageServerHandler {
}
}

#[instrument(skip_all, fields(shard_id))]
async fn handle_get_page_at_lsn_request(
&mut self,
tenant_id: TenantId,
@@ -1133,9 +1129,6 @@
}
};

// load_timeline_for_page sets shard_id, but get_cached_timeline_for_page doesn't
set_tracing_field_shard_id(timeline);

let _timer = timeline
.query_metrics
.start_timer(metrics::SmgrQueryType::GetPageAtLsn);
@@ -1154,7 +1147,6 @@ impl PageServerHandler {
}))
}

#[instrument(skip_all, fields(shard_id))]
async fn handle_get_slru_segment_request(
&mut self,
tenant_id: TenantId,
@@ -1183,7 +1175,7 @@ impl PageServerHandler {
}

#[allow(clippy::too_many_arguments)]
#[instrument(skip_all, fields(shard_id, ?lsn, ?prev_lsn, %full_backup))]
#[instrument(skip_all, fields(?lsn, ?prev_lsn, %full_backup))]
async fn handle_basebackup_request<IO>(
&mut self,
pgb: &mut PostgresBackend<IO>,
@@ -1198,6 +1190,8 @@ impl PageServerHandler {
where
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
{
debug_assert_current_span_has_tenant_and_timeline_id();

let started = std::time::Instant::now();

// check that the timeline exists
@@ -1319,7 +1313,6 @@ impl PageServerHandler {
.await
.map_err(GetActiveTimelineError::Tenant)?;
let timeline = tenant.get_timeline(timeline_id, true)?;
set_tracing_field_shard_id(&timeline);
Ok(timeline)
}
}
@@ -1484,29 +1477,21 @@ where
.record("timeline_id", field::display(timeline_id));

self.check_permission(Some(tenant_id))?;
async {
let timeline = self
.get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero)
.await?;
let timeline = self
.get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero)
.await?;

let end_of_timeline = timeline.get_last_record_rlsn();
let end_of_timeline = timeline.get_last_record_rlsn();

pgb.write_message_noflush(&BeMessage::RowDescription(&[
RowDescriptor::text_col(b"prev_lsn"),
RowDescriptor::text_col(b"last_lsn"),
]))?
.write_message_noflush(&BeMessage::DataRow(&[
Some(end_of_timeline.prev.to_string().as_bytes()),
Some(end_of_timeline.last.to_string().as_bytes()),
]))?
.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
anyhow::Ok(())
}
.instrument(info_span!(
"handle_get_last_record_lsn",
shard_id = tracing::field::Empty
))
.await?;
pgb.write_message_noflush(&BeMessage::RowDescription(&[
RowDescriptor::text_col(b"prev_lsn"),
RowDescriptor::text_col(b"last_lsn"),
]))?
.write_message_noflush(&BeMessage::DataRow(&[
Some(end_of_timeline.prev.to_string().as_bytes()),
Some(end_of_timeline.last.to_string().as_bytes()),
]))?
.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
}
// same as basebackup, but result includes relational data as well
else if query_string.starts_with("fullbackup ") {
@@ -1763,12 +1748,3 @@ impl From<GetActiveTimelineError> for QueryError {
}
}
}

fn set_tracing_field_shard_id(timeline: &Timeline) {
debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id();
tracing::Span::current().record(
"shard_id",
tracing::field::display(timeline.tenant_shard_id.shard_slug()),
);
debug_assert_current_span_has_tenant_and_timeline_id();
}
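The removed `set_tracing_field_shard_id` relies on tracing's deferred-field mechanism: a span declares a field as `Empty` at creation and records the value once it is known. A minimal sketch of that mechanism, independent of this codebase:

use tracing::{field, info_span};

fn handle_request() {
    // Declared empty now, filled in once the shard is known.
    let span = info_span!("handle_request", shard_id = field::Empty);
    let _guard = span.enter();

    // ... resolve the shard, then record it on the current span:
    tracing::Span::current().record("shard_id", field::display("0001"));
}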

@@ -10,7 +10,6 @@ use super::tenant::{PageReconstructError, Timeline};
use crate::context::RequestContext;
use crate::keyspace::{KeySpace, KeySpaceAccum};
use crate::repository::*;
use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id;
use crate::walrecord::NeonWalRecord;
use anyhow::{ensure, Context};
use bytes::{Buf, Bytes, BytesMut};
@@ -700,7 +699,7 @@ impl Timeline {
lsn: Lsn,
ctx: &RequestContext,
) -> Result<u64, CalculateLogicalSizeError> {
debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id();
crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id();

// Fetch list of database dirs and iterate them
let buf = self.get(DBDIR_KEY, lsn, ctx).await?;

@@ -1,43 +0,0 @@
use utils::tracing_span_assert::check_fields_present;

mod extractors {
use utils::tracing_span_assert::ConstExtractor;

pub(super) const TENANT_ID: ConstExtractor = ConstExtractor::new("tenant_id");
pub(super) const SHARD_ID: ConstExtractor = ConstExtractor::new("shard_id");
pub(super) const TIMELINE_ID: ConstExtractor = ConstExtractor::new("timeline_id");
}

#[track_caller]
pub(crate) fn debug_assert_current_span_has_tenant_id() {
if cfg!(debug_assertions) {
if let Err(missing) = check_fields_present!([&extractors::TENANT_ID, &extractors::SHARD_ID])
{
panic!("missing extractors: {missing:?}")
}
}
}

#[track_caller]
pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id() {
if cfg!(debug_assertions) {
if let Err(missing) = check_fields_present!([
&extractors::TENANT_ID,
&extractors::SHARD_ID,
&extractors::TIMELINE_ID,
]) {
panic!("missing extractors: {missing:?}")
}
}
}

#[track_caller]
pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id() {
if cfg!(debug_assertions) {
if let Err(missing) =
check_fields_present!([&extractors::TENANT_ID, &extractors::TIMELINE_ID,])
{
panic!("missing extractors: {missing:?}")
}
}
}
@@ -67,9 +67,7 @@ use crate::deletion_queue::DeletionQueueError;
use crate::import_datadir;
use crate::is_uninit_mark;
use crate::metrics::TENANT;
use crate::metrics::{
remove_tenant_metrics, BROKEN_TENANTS_SET, TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC,
};
use crate::metrics::{remove_tenant_metrics, TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC};
use crate::repository::GcResult;
use crate::task_mgr;
use crate::task_mgr::TaskKind;
@@ -100,7 +98,6 @@ use std::sync::Arc;
use std::sync::{Mutex, RwLock};
use std::time::{Duration, Instant};

use crate::span;
use crate::tenant::timeline::delete::DeleteTimelineFlow;
use crate::tenant::timeline::uninit::cleanup_timeline_directory;
use crate::virtual_file::VirtualFile;
@@ -151,6 +148,7 @@ pub mod block_io;
pub mod disk_btree;
pub(crate) mod ephemeral_file;
pub mod layer_map;
mod span;

pub mod metadata;
mod par_fsync;
@@ -168,7 +166,7 @@ pub(crate) mod timeline;

pub mod size;

pub(crate) use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
pub(crate) use timeline::span::debug_assert_current_span_has_tenant_and_timeline_id;
pub(crate) use timeline::{LogicalSizeCalculationCause, PageReconstructError, Timeline};

// re-export for use in remote_timeline_client.rs
@@ -207,7 +205,7 @@ impl AttachedTenantConf {
match &location_conf.mode {
LocationMode::Attached(attach_conf) => Ok(Self {
tenant_conf: location_conf.tenant_conf,
location: *attach_conf,
location: attach_conf.clone(),
}),
LocationMode::Secondary(_) => {
anyhow::bail!("Attempted to construct AttachedTenantConf from a LocationConf in secondary mode")
@@ -278,7 +276,7 @@ pub struct Tenant {
// with timelines, which in turn may cause dropping replication connection, expiration of wait_for_lsn
// timeout...
gc_cs: tokio::sync::Mutex<()>,
walredo_mgr: Option<Arc<WalRedoManager>>,
walredo_mgr: Arc<WalRedoManager>,

// provides access to timeline data sitting in the remote storage
pub(crate) remote_storage: Option<GenericRemoteStorage>,
@@ -627,15 +625,12 @@ impl Tenant {
deletion_queue_client,
} = resources;

let attach_mode = attached_conf.location.attach_mode;
let generation = attached_conf.location.generation;

let tenant = Arc::new(Tenant::new(
TenantState::Attaching,
conf,
attached_conf,
shard_identity,
Some(wal_redo_manager),
wal_redo_manager,
tenant_shard_id,
remote_storage.clone(),
deletion_queue_client,
@@ -659,12 +654,6 @@ impl Tenant {
"attach tenant",
false,
async move {

info!(
?attach_mode,
"Attaching tenant"
);

let _gate_guard = attach_gate_guard;

// Is this tenant being spawned as part of process startup?
@@ -876,7 +865,7 @@ impl Tenant {
Ok(())
}
.instrument({
let span = tracing::info_span!(parent: None, "attach", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), gen=?generation);
let span = tracing::info_span!(parent: None, "attach", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug());
span.follows_from(Span::current());
span
}),
@@ -1195,6 +1184,10 @@ impl Tenant {
tenant_shard_id: TenantShardId,
reason: String,
) -> Arc<Tenant> {
let wal_redo_manager = Arc::new(WalRedoManager::from(PostgresRedoManager::new(
conf,
tenant_shard_id,
)));
Arc::new(Tenant::new(
TenantState::Broken {
reason,
@@ -1205,7 +1198,7 @@ impl Tenant {
// Shard identity isn't meaningful for a broken tenant: it's just a placeholder
// to occupy the slot for this TenantShardId.
ShardIdentity::broken(tenant_shard_id.shard_number, tenant_shard_id.shard_count),
None,
wal_redo_manager,
tenant_shard_id,
None,
DeletionQueueClient::broken(),
@@ -1974,7 +1967,7 @@ impl Tenant {
}

pub(crate) fn wal_redo_manager_status(&self) -> Option<WalRedoManagerStatus> {
self.walredo_mgr.as_ref().and_then(|mgr| mgr.status())
self.walredo_mgr.status()
}

/// Changes tenant status to active, unless shutdown was already requested.
@@ -2361,7 +2354,12 @@ impl Tenant {
}

pub(crate) fn get_attach_mode(&self) -> AttachmentMode {
self.tenant_conf.read().unwrap().location.attach_mode
self.tenant_conf
.read()
.unwrap()
.location
.attach_mode
.clone()
}

/// For API access: generate a LocationConfig equivalent to the one that would be used to
@@ -2609,7 +2607,7 @@ impl Tenant {
self.tenant_shard_id,
self.generation,
self.shard_identity,
self.walredo_mgr.as_ref().map(Arc::clone),
Arc::clone(&self.walredo_mgr),
resources,
pg_version,
state,
@@ -2627,7 +2625,7 @@ impl Tenant {
conf: &'static PageServerConf,
attached_conf: AttachedTenantConf,
shard_identity: ShardIdentity,
walredo_mgr: Option<Arc<WalRedoManager>>,
walredo_mgr: Arc<WalRedoManager>,
tenant_shard_id: TenantShardId,
remote_storage: Option<GenericRemoteStorage>,
deletion_queue_client: DeletionQueueClient,
@@ -2635,16 +2633,9 @@ impl Tenant {
let (state, mut rx) = watch::channel(state);

tokio::spawn(async move {
// reflect tenant state in metrics:
// - global per tenant state: TENANT_STATE_METRIC
// - "set" of broken tenants: BROKEN_TENANTS_SET
//
// set of broken tenants should not have zero counts so that it remains accessible for
// alerting.

// Strings for metric labels
let tid = tenant_shard_id.to_string();
let shard_id = tenant_shard_id.shard_slug().to_string();
let set_key = &[tid.as_str(), shard_id.as_str()][..];
let shard_id_str = format!("{}", tenant_shard_id.shard_slug());

fn inspect_state(state: &TenantState) -> ([&'static str; 1], bool) {
([state.into()], matches!(state, TenantState::Broken { .. }))
@@ -2653,13 +2644,21 @@ impl Tenant {
let mut tuple = inspect_state(&rx.borrow_and_update());

let is_broken = tuple.1;
let mut counted_broken = if is_broken {
// add the id to the set right away; there should not be any updates on the channel
// before the tenant is removed, if ever
BROKEN_TENANTS_SET.with_label_values(set_key).set(1);
true
} else {
let mut counted_broken = if !is_broken {
// the tenant might be ignored and reloaded, so first remove any previous set
// element. it most likely has already been scraped, as these are manual operations
// right now. most likely we will add it back very soon.
drop(
crate::metrics::BROKEN_TENANTS_SET.remove_label_values(&[&tid, &shard_id_str]),
);
false
} else {
// add the id to the set right away, there should not be any updates on the channel
// after
crate::metrics::BROKEN_TENANTS_SET
.with_label_values(&[&tid, &shard_id_str])
.set(1);
true
};

loop {
@@ -2668,9 +2667,10 @@ impl Tenant {
current.inc();

if rx.changed().await.is_err() {
// tenant has been dropped
// tenant has been dropped; decrement the counter because a tenant with that
// state is no longer in the tenant map, but allow any broken set item to exist
// still.
current.dec();
drop(BROKEN_TENANTS_SET.remove_label_values(set_key));
break;
}

@@ -2680,9 +2680,10 @@ impl Tenant {
let is_broken = tuple.1;
if is_broken && !counted_broken {
counted_broken = true;
// insert the tenant_id (back) into the set while avoiding needless counter
// access
BROKEN_TENANTS_SET.with_label_values(set_key).set(1);
// insert the tenant_id (back) into the set
crate::metrics::BROKEN_TENANTS_SET
.with_label_values(&[&tid, &shard_id_str])
.inc();
}
}
});
@@ -3224,6 +3225,8 @@ impl Tenant {
.context("branch initial metadata upload")?;
}

info!("branched timeline {dst_id} from {src_id} at {start_lsn}");

Ok(new_timeline)
}

@@ -3290,11 +3293,11 @@ impl Tenant {
3,
u32::MAX,
"persist_initdb_tar_zst",
&self.cancel,
backoff::Cancel::new(self.cancel.clone(), || anyhow::anyhow!("Cancelled")),
)
.await
.ok_or_else(|| anyhow::anyhow!("Cancelled"))
.and_then(|x| x)
.await?;

Ok(())
}

/// - run initdb to init temporary instance and get bootstrap data
@@ -3441,6 +3444,12 @@ impl Tenant {
// All done!
let timeline = raw_timeline.finish_creation()?;

info!(
"created root timeline {} timeline.lsn {}",
timeline_id,
timeline.get_last_record_lsn()
);

Ok(timeline)
}

@@ -3994,10 +4003,6 @@ pub(crate) mod harness {
})
}

pub fn span(&self) -> tracing::Span {
info_span!("TenantHarness", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug())
}

pub async fn load(&self) -> (Arc<Tenant>, RequestContext) {
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
(
@@ -4051,7 +4056,7 @@ pub(crate) mod harness {
.unwrap(),
// This is a legacy/test code path: sharding isn't supported here.
ShardIdentity::unsharded(),
Some(walredo_mgr),
walredo_mgr,
self.tenant_shard_id,
Some(self.remote_storage.clone()),
self.deletion_queue.new_client(),
@@ -4602,7 +4607,7 @@ mod tests {
// so that all uploads finish & we can call harness.load() below again
tenant
.shutdown(Default::default(), true)
.instrument(harness.span())
.instrument(info_span!("test_shutdown", tenant_id=%tenant.tenant_shard_id))
.await
.ok()
.unwrap();
@@ -4643,7 +4648,7 @@ mod tests {
// so that all uploads finish & we can call harness.load() below again
tenant
.shutdown(Default::default(), true)
.instrument(harness.span())
.instrument(info_span!("test_shutdown", tenant_id=%tenant.tenant_shard_id))
.await
.ok()
.unwrap();
@@ -4705,7 +4710,7 @@ mod tests {
// so that all uploads finish & we can call harness.try_load() below again
tenant
.shutdown(Default::default(), true)
.instrument(harness.span())
.instrument(info_span!("test_shutdown", tenant_id=%tenant.tenant_shard_id))
.await
.ok()
.unwrap();
@@ -5238,7 +5243,7 @@ mod tests {
let raw_tline = tline.raw_timeline().unwrap();
raw_tline
.shutdown()
.instrument(info_span!("test_shutdown", tenant_id=%raw_tline.tenant_shard_id, shard_id=%raw_tline.tenant_shard_id.shard_slug(), timeline_id=%TIMELINE_ID))
.instrument(info_span!("test_shutdown", tenant_id=%raw_tline.tenant_shard_id, timeline_id=%TIMELINE_ID))
.await;
std::mem::forget(tline);
}

@@ -51,7 +51,7 @@ pub mod defaults {
pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;
}

#[derive(Debug, Copy, Clone, Serialize, Deserialize, PartialEq, Eq)]
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub(crate) enum AttachmentMode {
/// Our generation is current as far as we know, and as far as we know we are the only attached
/// pageserver. This is the "normal" attachment mode.
@@ -66,7 +66,7 @@ pub(crate) enum AttachmentMode {
Stale,
}

#[derive(Debug, Copy, Clone, Serialize, Deserialize, PartialEq, Eq)]
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub(crate) struct AttachedLocationConfig {
pub(crate) generation: Generation,
pub(crate) attach_mode: AttachmentMode,

@@ -91,11 +91,9 @@ async fn create_remote_delete_mark(
FAILED_UPLOAD_WARN_THRESHOLD,
FAILED_REMOTE_OP_RETRIES,
"mark_upload",
cancel,
backoff::Cancel::new(cancel.clone(), || anyhow::anyhow!("Cancelled")),
)
.await
.ok_or_else(|| anyhow::anyhow!("Cancelled"))
.and_then(|x| x)
.context("mark_upload")?;

Ok(())
@@ -189,11 +187,9 @@ async fn remove_tenant_remote_delete_mark(
FAILED_UPLOAD_WARN_THRESHOLD,
FAILED_REMOTE_OP_RETRIES,
"remove_tenant_remote_delete_mark",
cancel,
backoff::Cancel::new(cancel.clone(), || anyhow::anyhow!("Cancelled")),
)
.await
.ok_or_else(|| anyhow::anyhow!("Cancelled"))
.and_then(|x| x)
.context("remove_tenant_remote_delete_mark")?;
}
Ok(())

@@ -607,6 +607,13 @@ pub(crate) fn tenant_spawn(
"Cannot load tenant, ignore mark found at {tenant_ignore_mark:?}"
);

info!(
tenant_id = %tenant_shard_id.tenant_id,
shard_id = %tenant_shard_id.shard_slug(),
generation = ?location_conf.location.generation,
attach_mode = ?location_conf.location.attach_mode,
"Attaching tenant"
);
let tenant = match Tenant::spawn(
conf,
tenant_shard_id,
@@ -684,7 +691,7 @@ async fn shutdown_all_tenants0(tenants: &std::sync::RwLock<TenantsMap>) {
// going to log too many lines
debug!("tenant successfully stopped");
}
.instrument(info_span!("shutdown", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug())),
.instrument(info_span!("shutdown", tenant_id=%tenant_shard_id.tenant_id, shard=%tenant_shard_id.shard_slug())),
);

total_attached += 1;
@@ -1720,7 +1727,6 @@ pub(crate) async fn ignore_tenant(
ignore_tenant0(conf, &TENANTS, tenant_id).await
}

#[instrument(skip_all, fields(shard_id))]
async fn ignore_tenant0(
conf: &'static PageServerConf,
tenants: &std::sync::RwLock<TenantsMap>,
@@ -1728,10 +1734,6 @@ async fn ignore_tenant0(
) -> Result<(), TenantStateError> {
// This is a legacy API (replaced by `/location_conf`). It does not support sharding
let tenant_shard_id = TenantShardId::unsharded(tenant_id);
tracing::Span::current().record(
"shard_id",
tracing::field::display(tenant_shard_id.shard_slug()),
);

remove_tenant_from_memory(tenants, tenant_shard_id, async {
let ignore_mark_file = conf.tenant_ignore_mark_file_path(&tenant_shard_id);
@@ -2127,7 +2129,7 @@ fn tenant_map_acquire_slot_impl(
METRICS.tenant_slot_writes.inc();

let mut locked = tenants.write().unwrap();
let span = tracing::info_span!("acquire_slot", tenant_id=%tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug());
let span = tracing::info_span!("acquire_slot", tenant_id=%tenant_shard_id.tenant_id, shard = %tenant_shard_id.shard_slug());
let _guard = span.enter();

let m = match &mut *locked {
@@ -2363,7 +2365,7 @@ pub(crate) async fn immediate_gc(
mod tests {
use std::collections::BTreeMap;
use std::sync::Arc;
use tracing::Instrument;
use tracing::{info_span, Instrument};

use crate::tenant::mgr::TenantSlot;

@@ -2374,16 +2376,17 @@ mod tests {
// Test that if an InProgress tenant is in the map during shutdown, the shutdown will gracefully
// wait for it to complete before proceeding.

let h = TenantHarness::create("shutdown_awaits_in_progress_tenant").unwrap();
let (t, _ctx) = h.load().await;
let (t, _ctx) = TenantHarness::create("shutdown_awaits_in_progress_tenant")
.unwrap()
.load()
.await;

// harness loads it to active, which is forced and nothing is running on the tenant

let id = t.tenant_shard_id();

// tenant harness configures the logging and we cannot escape it
let span = h.span();
let _e = span.enter();
let _e = info_span!("testing", tenant_id = %id).entered();

let tenants = BTreeMap::from([(id, TenantSlot::Attached(t.clone()))]);
let tenants = Arc::new(std::sync::RwLock::new(TenantsMap::Open(tenants)));
@@ -2404,7 +2407,7 @@ mod tests {
};
super::remove_tenant_from_memory(&tenants, id, cleanup).await
}
.instrument(h.span())
.instrument(info_span!("foobar", tenant_id = %id))
});

// now the long cleanup should be in place, with the stopping state

@@ -1046,11 +1046,9 @@ impl RemoteTimelineClient {
// when executed as part of tenant deletion this happens in the background
2,
"persist_index_part_with_deleted_flag",
&self.cancel,
backoff::Cancel::new(self.cancel.clone(), || anyhow::anyhow!("Cancelled")),
)
.await
.ok_or_else(|| anyhow::anyhow!("Cancelled"))
.and_then(|x| x)?;
.await?;

// all good, disarm the guard and mark as success
ScopeGuard::into_inner(undo_deleted_at);
@@ -1085,11 +1083,9 @@ impl RemoteTimelineClient {
FAILED_DOWNLOAD_WARN_THRESHOLD,
FAILED_REMOTE_OP_RETRIES,
"preserve_initdb_tar_zst",
&cancel.clone(),
backoff::Cancel::new(cancel.clone(), || anyhow::anyhow!("Cancelled!")),
)
.await
.ok_or_else(|| anyhow::anyhow!("Cancelled"))
.and_then(|x| x)
.context("backing up initdb archive")?;
Ok(())
}
@@ -1145,8 +1141,6 @@ impl RemoteTimelineClient {
// taking the burden of listing all the layers that we already know we should delete.
self.deletion_queue_client.flush_immediate().await?;

let cancel = shutdown_token();

let remaining = backoff::retry(
|| async {
self.storage_impl
@@ -1157,11 +1151,9 @@ impl RemoteTimelineClient {
FAILED_DOWNLOAD_WARN_THRESHOLD,
FAILED_REMOTE_OP_RETRIES,
"list_prefixes",
&cancel,
backoff::Cancel::new(shutdown_token(), || anyhow::anyhow!("Cancelled!")),
)
.await
.ok_or_else(|| anyhow::anyhow!("Cancelled!"))
.and_then(|x| x)
.context("list prefixes")?;

// We will delete the current index_part object last, since it acts as a deletion
@@ -1952,7 +1944,6 @@ mod tests {
tracing::info_span!(
"test",
tenant_id = %self.harness.tenant_shard_id.tenant_id,
shard_id = %self.harness.tenant_shard_id.shard_slug(),
timeline_id = %TIMELINE_ID
)
}

@@ -17,11 +17,11 @@ use utils::timeout::timeout_cancellable;
use utils::{backoff, crashsafe};

use crate::config::PageServerConf;
use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
use crate::tenant::remote_timeline_client::{
download_cancellable, remote_layer_path, remote_timelines_path, DOWNLOAD_TIMEOUT,
};
use crate::tenant::storage_layer::LayerFileName;
use crate::tenant::timeline::span::debug_assert_current_span_has_tenant_and_timeline_id;
use crate::tenant::Generation;
use crate::virtual_file::on_fatal_io_error;
use crate::TEMP_FILE_SUFFIX;
@@ -76,6 +76,7 @@ pub async fn download_layer_file<'a>(
// If pageserver crashes the temp file will be deleted on startup and re-downloaded.
let temp_file_path = path_with_suffix_extension(&local_path, TEMP_DOWNLOAD_EXTENSION);

let cancel_inner = cancel.clone();
let (mut destination_file, bytes_amount) = download_retry(
|| async {
let destination_file = tokio::fs::File::create(&temp_file_path)
@@ -86,7 +87,7 @@ pub async fn download_layer_file<'a>(
// Cancellation safety: it is safe to cancel this future, because it isn't writing to a local
// file: the write to local file doesn't start until after the request header is returned
// and we start draining the body stream below
let download = download_cancellable(cancel, storage.download(&remote_path))
let download = download_cancellable(&cancel_inner, storage.download(&remote_path))
.await
.with_context(|| {
format!(
@@ -106,7 +107,7 @@ pub async fn download_layer_file<'a>(
// we will imminently try to write to again.
let bytes_amount: u64 = match timeout_cancellable(
DOWNLOAD_TIMEOUT,
cancel,
&cancel_inner,
tokio::io::copy_buf(&mut reader, &mut destination_file),
)
.await
@@ -385,11 +386,9 @@ pub(super) async fn download_index_part(
FAILED_DOWNLOAD_WARN_THRESHOLD,
FAILED_REMOTE_OP_RETRIES,
"listing index_part files",
&cancel,
backoff::Cancel::new(cancel.clone(), || anyhow::anyhow!("Cancelled")),
)
.await
.ok_or_else(|| anyhow::anyhow!("Cancelled"))
.and_then(|x| x)
.map_err(DownloadError::Other)?;

// General case logic for which index to use: the latest index whose generation
@@ -472,7 +471,7 @@ pub(crate) async fn download_initdb_tar_zst(
Err(other) => Err(other)?,
};
let mut download = tokio_util::io::StreamReader::new(download.download_stream);
let mut writer = tokio::io::BufWriter::with_capacity(super::BUFFER_SIZE, file);
let mut writer = tokio::io::BufWriter::with_capacity(8 * 1024, file);

// TODO: this consumption of the response body should be subject to timeout + cancellation, but
// not without thinking carefully about how to recover safely from cancelling a write to
@@ -511,7 +510,7 @@ pub(crate) async fn download_initdb_tar_zst(

/// Helper function to handle retries for a download operation.
///
/// Remote operations can fail due to rate limits (S3), spurious network
/// Remote operations can fail due to rate limits (IAM, S3), spurious network
/// problems, or other external reasons. Retry FAILED_DOWNLOAD_RETRIES times,
/// with backoff.
///
@@ -531,11 +530,9 @@ where
FAILED_DOWNLOAD_WARN_THRESHOLD,
FAILED_REMOTE_OP_RETRIES,
description,
cancel,
backoff::Cancel::new(cancel.clone(), || DownloadError::Cancelled),
)
.await
.ok_or_else(|| DownloadError::Cancelled)
.and_then(|x| x)
}

async fn download_retry_forever<T, O, F>(
@@ -553,9 +550,7 @@ where
FAILED_DOWNLOAD_WARN_THRESHOLD,
u32::MAX,
description,
&cancel,
backoff::Cancel::new(cancel, || DownloadError::Cancelled),
)
.await
.ok_or_else(|| DownloadError::Cancelled)
.and_then(|x| x)
}

@@ -188,18 +188,16 @@ pub(crate) async fn time_travel_recover_tenant(
backoff::retry(
|| async {
storage
.time_travel_recover(Some(prefix), timestamp, done_if_after, cancel)
.time_travel_recover(Some(prefix), timestamp, done_if_after, cancel.clone())
.await
},
|e| !matches!(e, TimeTravelError::Other(_)),
warn_after,
max_attempts,
"time travel recovery of tenant prefix",
cancel,
backoff::Cancel::new(cancel.clone(), || TimeTravelError::Cancelled),
)
.await
.ok_or_else(|| TimeTravelError::Cancelled)
.and_then(|x| x)?;
.await?;
}
Ok(())
}

@@ -537,11 +537,11 @@ impl<'a> TenantDownloader<'a> {
FAILED_DOWNLOAD_WARN_THRESHOLD,
FAILED_REMOTE_OP_RETRIES,
"download heatmap",
&self.secondary_state.cancel,
backoff::Cancel::new(self.secondary_state.cancel.clone(), || {
UpdateError::Cancelled
}),
)
.await
.ok_or_else(|| UpdateError::Cancelled)
.and_then(|x| x)?;
.await?;

SECONDARY_MODE.download_heatmap.inc();

@@ -371,6 +371,8 @@ async fn upload_tenant_heatmap(
};
let timelines = tenant.timelines.lock().unwrap().clone();

let tenant_cancel = tenant.cancel.clone();

// Ensure that Tenant::shutdown waits for any upload in flight: this is needed because otherwise
// when we delete a tenant, we might race with an upload in flight and end up leaving a heatmap behind
// in remote storage.
@@ -399,7 +401,6 @@ async fn upload_tenant_heatmap(

// Serialize the heatmap
let bytes = serde_json::to_vec(&heatmap).map_err(|e| anyhow::anyhow!(e))?;
let bytes = bytes::Bytes::from(bytes);
let size = bytes.len();

// Drop out early if nothing changed since our last upload
@@ -410,12 +411,13 @@ async fn upload_tenant_heatmap(

let path = remote_heatmap_path(tenant.get_tenant_shard_id());

let cancel = &tenant.cancel;

// Write the heatmap.
tracing::debug!("Uploading {size} byte heatmap to {path}");
if let Err(e) = backoff::retry(
|| async {
let bytes = futures::stream::once(futures::future::ready(Ok(bytes.clone())));
let bytes = futures::stream::once(futures::future::ready(Ok(bytes::Bytes::from(
bytes.clone(),
))));
remote_storage
.upload_storage_object(bytes, size, &path)
.await
@@ -424,13 +426,11 @@ async fn upload_tenant_heatmap(
3,
u32::MAX,
"Uploading heatmap",
cancel,
backoff::Cancel::new(tenant_cancel.clone(), || anyhow::anyhow!("Shutting down")),
)
.await
.ok_or_else(|| anyhow::anyhow!("Shutting down"))
.and_then(|x| x)
{
if cancel.is_cancelled() {
if tenant_cancel.is_cancelled() {
return Err(UploadHeatmapError::Cancelled);
} else {
return Err(e.into());

pageserver/src/tenant/span.rs (new file)
@@ -0,0 +1,17 @@
#[cfg(debug_assertions)]
use utils::tracing_span_assert::{check_fields_present, MultiNameExtractor};

#[cfg(not(debug_assertions))]
pub(crate) fn debug_assert_current_span_has_tenant_id() {}

#[cfg(debug_assertions)]
pub(crate) static TENANT_ID_EXTRACTOR: once_cell::sync::Lazy<MultiNameExtractor<1>> =
once_cell::sync::Lazy::new(|| MultiNameExtractor::new("TenantId", ["tenant_id"]));

#[cfg(debug_assertions)]
#[track_caller]
pub(crate) fn debug_assert_current_span_has_tenant_id() {
if let Err(missing) = check_fields_present!([&*TENANT_ID_EXTRACTOR]) {
panic!("missing extractors: {missing:?}")
}
}
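The new span.rs uses the usual cfg-gated assertion shape: a no-op in release builds, a `#[track_caller]` panic in debug builds. A self-contained sketch of the same shape, with the field check stubbed out (the real code uses `utils::tracing_span_assert`):

#[cfg(not(debug_assertions))]
pub fn debug_assert_span_has_request_id() {}

#[cfg(debug_assertions)]
#[track_caller]
pub fn debug_assert_span_has_request_id() {
    if !current_span_has_field("request_id") {
        panic!("current span is missing the request_id field");
    }
}

#[cfg(debug_assertions)]
fn current_span_has_field(_name: &str) -> bool {
    true // stub: the real implementation walks the tracing span hierarchy
}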
@@ -15,7 +15,6 @@ use utils::sync::heavier_once_cell;
use crate::config::PageServerConf;
use crate::context::RequestContext;
use crate::repository::Key;
use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
use crate::tenant::{remote_timeline_client::LayerFileMetadata, Timeline};

use super::delta_layer::{self, DeltaEntry};
@@ -300,8 +299,8 @@ impl Layer {
})
}

pub(crate) async fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
self.0.info(reset).await
pub(crate) fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
self.0.info(reset)
}

pub(crate) fn access_stats(&self) -> &LayerAccessStats {
@@ -612,10 +611,10 @@ impl LayerInner {
let mut rx = self.status.subscribe();

let strong = {
match self.inner.get_mut().await {
match self.inner.get() {
Some(mut either) => {
self.wanted_evicted.store(true, Ordering::Relaxed);
ResidentOrWantedEvicted::downgrade(&mut either)
either.downgrade()
}
None => return Err(EvictionError::NotFound),
}
@@ -641,7 +640,7 @@ impl LayerInner {
// use however late (compared to the initial expressing of wanted) as the
// "outcome" now
LAYER_IMPL_METRICS.inc_broadcast_lagged();
match self.inner.get_mut().await {
match self.inner.get() {
Some(_) => Err(EvictionError::Downloaded),
None => Ok(()),
}
@@ -759,7 +758,7 @@ impl LayerInner {
// use the already held initialization permit because it is impossible to hit the
// below paths anymore, essentially limiting the max loop iterations to 2.
let (value, init_permit) = download(init_permit).await?;
let mut guard = self.inner.set(value, init_permit).await;
let mut guard = self.inner.set(value, init_permit);
let (strong, _upgraded) = guard
.get_and_upgrade()
.expect("init creates strong reference, we held the init permit");
@@ -767,7 +766,7 @@ impl LayerInner {
}

let (weak, permit) = {
let mut locked = self.inner.get_mut_or_init(download).await?;
let mut locked = self.inner.get_or_init(download).await?;

if let Some((strong, upgraded)) = locked.get_and_upgrade() {
if upgraded {
@@ -837,8 +836,6 @@ impl LayerInner {
timeline: Arc<Timeline>,
permit: heavier_once_cell::InitPermit,
) -> Result<heavier_once_cell::InitPermit, DownloadError> {
debug_assert_current_span_has_tenant_and_timeline_id();

let task_name = format!("download layer {}", self);

let (tx, rx) = tokio::sync::oneshot::channel();
@@ -989,12 +986,12 @@ impl LayerInner {
}
}

async fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
let layer_file_name = self.desc.filename().file_name();

// this is not accurate: we could have the file locally but there was a cancellation
// and now we are not in sync, or we are currently downloading it.
let remote = self.inner.get_mut().await.is_none();
let remote = self.inner.get().is_none();

let access_stats = self.access_stats.as_api_model(reset);

@@ -1053,7 +1050,7 @@ impl LayerInner {
LAYER_IMPL_METRICS.inc_eviction_cancelled(EvictionCancelled::LayerGone);
return;
};
match tokio::runtime::Handle::current().block_on(this.evict_blocking(version)) {
match this.evict_blocking(version) {
Ok(()) => LAYER_IMPL_METRICS.inc_completed_evictions(),
Err(reason) => LAYER_IMPL_METRICS.inc_eviction_cancelled(reason),
}
@@ -1061,7 +1058,7 @@ impl LayerInner {
}
}

async fn evict_blocking(&self, only_version: usize) -> Result<(), EvictionCancelled> {
fn evict_blocking(&self, only_version: usize) -> Result<(), EvictionCancelled> {
// deleted or detached timeline, don't do anything.
let Some(timeline) = self.timeline.upgrade() else {
return Err(EvictionCancelled::TimelineGone);
@@ -1070,7 +1067,7 @@ impl LayerInner {
// to avoid starting a new download while we evict, keep holding on to the
// permit.
let _permit = {
let maybe_downloaded = self.inner.get_mut().await;
let maybe_downloaded = self.inner.get();

let (_weak, permit) = match maybe_downloaded {
Some(mut guard) => {

@@ -199,9 +199,7 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {

// Perhaps we did no work and the walredo process has been idle for some time:
// give it a chance to shut down to avoid leaving walredo process running indefinitely.
if let Some(walredo_mgr) = &tenant.walredo_mgr {
walredo_mgr.maybe_quiesce(period * 10);
}
tenant.walredo_mgr.maybe_quiesce(period * 10);

// Sleep
if tokio::time::timeout(sleep_duration, cancel.cancelled())

@@ -215,8 +215,8 @@ pub struct Timeline {
// Atomic would be more appropriate here.
last_freeze_ts: RwLock<Instant>,

// WAL redo manager. `None` only for broken tenants.
walredo_mgr: Option<Arc<super::WalRedoManager>>,
// WAL redo manager
walredo_mgr: Arc<super::WalRedoManager>,

/// Remote storage client.
/// See [`remote_timeline_client`](super::remote_timeline_client) module comment for details.
@@ -1138,7 +1138,7 @@ impl Timeline {
/// Shut down immediately, without waiting for any open layers to flush to disk. This is a subset of
/// the graceful [`Timeline::flush_and_shutdown`] function.
pub(crate) async fn shutdown(&self) {
debug_assert_current_span_has_tenant_and_timeline_id();
span::debug_assert_current_span_has_tenant_and_timeline_id();

// Signal any subscribers to our cancellation token to drop out
tracing::debug!("Cancelling CancellationToken");
@@ -1268,7 +1268,7 @@ impl Timeline {
let mut historic_layers = Vec::new();
for historic_layer in layer_map.iter_historic_layers() {
let historic_layer = guard.get_from_desc(&historic_layer);
historic_layers.push(historic_layer.info(reset).await);
historic_layers.push(historic_layer.info(reset));
}

LayerMapInfo {
@@ -1427,7 +1427,7 @@ impl Timeline {
tenant_shard_id: TenantShardId,
generation: Generation,
shard_identity: ShardIdentity,
walredo_mgr: Option<Arc<super::WalRedoManager>>,
walredo_mgr: Arc<super::WalRedoManager>,
resources: TimelineResources,
pg_version: u32,
state: TimelineState,
@@ -1964,7 +1964,7 @@ impl Timeline {
.await;
Ok(())
}
.instrument(info_span!(parent: None, "initial_size_calculation", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%self.timeline_id)),
.instrument(info_span!(parent: None, "initial_size_calculation", tenant_id=%self.tenant_shard_id.tenant_id, timeline_id=%self.timeline_id)),
);
}

@@ -2151,7 +2151,7 @@ impl Timeline {
cause: LogicalSizeCalculationCause,
ctx: &RequestContext,
) -> Result<u64, CalculateLogicalSizeError> {
crate::span::debug_assert_current_span_has_tenant_and_timeline_id();
span::debug_assert_current_span_has_tenant_and_timeline_id();
// We should never be calculating logical sizes on shard !=0, because these shards do not have
// accurate relation sizes, and they do not emit consumption metrics.
debug_assert!(self.tenant_shard_id.is_zero());
@@ -2849,7 +2849,7 @@ impl Timeline {
frozen_layer: Arc<InMemoryLayer>,
ctx: &RequestContext,
) -> Result<(), FlushLayerError> {
debug_assert_current_span_has_tenant_and_timeline_id();
span::debug_assert_current_span_has_tenant_and_timeline_id();
// As a special case, when we have just imported an image into the repository,
// instead of writing out a L0 delta layer, we directly write out image layer
// files instead. This is possible as long as *all* the data imported into the
@@ -4457,9 +4457,6 @@ impl Timeline {

let img = match self
.walredo_mgr
.as_ref()
.context("timeline has no walredo manager")
.map_err(PageReconstructError::WalRedo)?
.request_redo(key, request_lsn, data.img, data.records, self.pg_version)
.await
.context("reconstruct a page image")

@@ -1 +1,20 @@
#[cfg(debug_assertions)]
use utils::tracing_span_assert::{check_fields_present, Extractor, MultiNameExtractor};

#[cfg(not(debug_assertions))]
pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id() {}

#[cfg(debug_assertions)]
#[track_caller]
pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id() {
static TIMELINE_ID_EXTRACTOR: once_cell::sync::Lazy<MultiNameExtractor<1>> =
once_cell::sync::Lazy::new(|| MultiNameExtractor::new("TimelineId", ["timeline_id"]));

let fields: [&dyn Extractor; 2] = [
&*crate::tenant::span::TENANT_ID_EXTRACTOR,
&*TIMELINE_ID_EXTRACTOR,
];
if let Err(missing) = check_fields_present!(fields) {
panic!("missing extractors: {missing:?}")
}
}

@@ -17,30 +17,71 @@
//! records. It achieves it by dropping privileges before replaying
//! any WAL records, so that even if an attacker hijacks the Postgres
//! process, they cannot escape out of it.

/// Process lifecycle and abstraction for the IPC protocol.
mod process;

/// Code to apply [`NeonWalRecord`]s.
mod apply_neon;

use crate::config::PageServerConf;
use crate::metrics::{
WAL_REDO_BYTES_HISTOGRAM, WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM,
WAL_REDO_RECORDS_HISTOGRAM, WAL_REDO_TIME,
};
use crate::repository::Key;
use crate::walrecord::NeonWalRecord;
//!
use anyhow::Context;
use bytes::{Bytes, BytesMut};
use pageserver_api::key::key_to_rel_block;
use byteorder::{ByteOrder, LittleEndian};
use bytes::{BufMut, Bytes, BytesMut};
use nix::poll::*;
use pageserver_api::models::WalRedoManagerStatus;
use pageserver_api::shard::TenantShardId;
use std::sync::{Arc, RwLock};
use serde::Serialize;
use std::collections::VecDeque;
use std::io;
use std::io::prelude::*;
use std::ops::{Deref, DerefMut};
use std::os::unix::io::AsRawFd;
use std::process::Stdio;
use std::process::{Child, ChildStdin, ChildStdout, Command};
use std::sync::{Arc, Mutex, MutexGuard, RwLock};
use std::time::Duration;
use std::time::Instant;
use tracing::*;
use utils::lsn::Lsn;
use utils::{bin_ser::BeSer, lsn::Lsn, nonblock::set_nonblock};

#[cfg(feature = "testing")]
use std::sync::atomic::{AtomicUsize, Ordering};

use crate::config::PageServerConf;
use crate::metrics::{
WalRedoKillCause, WAL_REDO_BYTES_HISTOGRAM, WAL_REDO_PROCESS_COUNTERS,
WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM, WAL_REDO_RECORDS_HISTOGRAM,
WAL_REDO_RECORD_COUNTER, WAL_REDO_TIME,
};
use crate::repository::Key;
use crate::walrecord::NeonWalRecord;

use pageserver_api::key::{key_to_rel_block, key_to_slru_block};
use pageserver_api::reltag::{RelTag, SlruKind};
use postgres_ffi::pg_constants;
use postgres_ffi::relfile_utils::VISIBILITYMAP_FORKNUM;
use postgres_ffi::v14::nonrelfile_utils::{
mx_offset_to_flags_bitshift, mx_offset_to_flags_offset, mx_offset_to_member_offset,
transaction_id_set_status,
};
use postgres_ffi::BLCKSZ;

///
/// `RelTag` + block number (`blknum`) gives us a unique id of the page in the cluster.
///
/// In Postgres `BufferTag` structure is used for exactly the same purpose.
/// [See more related comments here](https://github.com/postgres/postgres/blob/99c5852e20a0987eca1c38ba0c09329d4076b6a0/src/include/storage/buf_internals.h#L91).
///
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Serialize)]
pub(crate) struct BufferTag {
pub rel: RelTag,
pub blknum: u32,
}
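`BufferTag` identifies a page for the wal-redo request protocol. A hedged sketch of how such a tag might be framed for the redo process's pipe; field order and widths here are illustrative, not the exact wire format:

use bytes::{BufMut, BytesMut};

struct RelTag {
    spcnode: u32,
    dbnode: u32,
    relnode: u32,
    forknum: u8,
}

struct BufferTag {
    rel: RelTag,
    blknum: u32,
}

fn put_buffer_tag(buf: &mut BytesMut, tag: &BufferTag) {
    // Big-endian u32 fields, mirroring Postgres' BufferTag layout.
    buf.put_u32(tag.rel.spcnode);
    buf.put_u32(tag.rel.dbnode);
    buf.put_u32(tag.rel.relnode);
    buf.put_u32(u32::from(tag.rel.forknum));
    buf.put_u32(tag.blknum);
}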

struct ProcessInput {
    stdin: ChildStdin,
    n_requests: usize,
}

struct ProcessOutput {
    stdout: ChildStdout,
    pending_responses: VecDeque<Option<Bytes>>,
    n_processed_responses: usize,
}

///
/// This is the real implementation that uses a Postgres process to
@@ -53,7 +94,22 @@ pub struct PostgresRedoManager {
    tenant_shard_id: TenantShardId,
    conf: &'static PageServerConf,
    last_redo_at: std::sync::Mutex<Option<Instant>>,
    redo_process: RwLock<Option<Arc<process::WalRedoProcess>>>,
    redo_process: RwLock<Option<Arc<WalRedoProcess>>>,
}

/// Can this request be served by neon redo functions,
/// or do we need to pass it to the wal-redo postgres process?
fn can_apply_in_neon(rec: &NeonWalRecord) -> bool {
    // Currently, we don't have bespoke Rust code to replay any
    // Postgres WAL records. But everything else is handled in neon.
    #[allow(clippy::match_like_matches_macro)]
    match rec {
        NeonWalRecord::Postgres {
            will_init: _,
            rec: _,
        } => false,
        _ => true,
    }
}
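
// A minimal sketch of how callers use this predicate, mirroring the batching
// loop in the hunk that follows (`records` is assumed to be a non-empty
// `&[(Lsn, NeonWalRecord)]`):
//
//     let mut batch_neon = can_apply_in_neon(&records[0].1);
//     for (_i, record) in records.iter().enumerate().skip(1) {
//         if can_apply_in_neon(&record.1) != batch_neon {
//             // flush the current batch, either through apply_in_neon() or
//             // through the wal-redo postgres process, then start a new batch
//             batch_neon = !batch_neon;
//         }
//     }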

///
@@ -83,10 +139,10 @@ impl PostgresRedoManager {

        let base_img_lsn = base_img.as_ref().map(|p| p.0).unwrap_or(Lsn::INVALID);
        let mut img = base_img.map(|p| p.1);
        let mut batch_neon = apply_neon::can_apply_in_neon(&records[0].1);
        let mut batch_neon = can_apply_in_neon(&records[0].1);
        let mut batch_start = 0;
        for (i, record) in records.iter().enumerate().skip(1) {
            let rec_neon = apply_neon::can_apply_in_neon(&record.1);
            let rec_neon = can_apply_in_neon(&record.1);

            if rec_neon != batch_neon {
                let result = if batch_neon {
@@ -192,7 +248,7 @@ impl PostgresRedoManager {
        let mut n_attempts = 0u32;
        loop {
            // launch the WAL redo process on first use
            let proc: Arc<process::WalRedoProcess> = {
            let proc: Arc<WalRedoProcess> = {
                let proc_guard = self.redo_process.read().unwrap();
                match &*proc_guard {
                    None => {
@@ -203,7 +259,7 @@ impl PostgresRedoManager {
                    None => {
                        let start = Instant::now();
                        let proc = Arc::new(
                            process::WalRedoProcess::launch(
                            WalRedoProcess::launch(
                                self.conf,
                                self.tenant_shard_id,
                                pg_version,
@@ -231,8 +287,9 @@ impl PostgresRedoManager {
        let started_at = std::time::Instant::now();

        // Relational WAL records are applied using wal-redo-postgres
        let buf_tag = BufferTag { rel, blknum };
        let result = proc
            .apply_wal_records(rel, blknum, &base_img, records, wal_redo_timeout)
            .apply_wal_records(buf_tag, &base_img, records, wal_redo_timeout)
            .context("apply_wal_records");

        let duration = started_at.elapsed();
@@ -359,12 +416,732 @@ impl PostgresRedoManager {
        _record_lsn: Lsn,
        record: &NeonWalRecord,
    ) -> anyhow::Result<()> {
        apply_neon::apply_in_neon(record, key, page)?;
        match record {
            NeonWalRecord::Postgres {
                will_init: _,
                rec: _,
            } => {
                anyhow::bail!("tried to pass postgres wal record to neon WAL redo");
            }
            NeonWalRecord::ClearVisibilityMapFlags {
                new_heap_blkno,
                old_heap_blkno,
                flags,
            } => {
                // sanity check that this is modifying the correct relation
                let (rel, blknum) = key_to_rel_block(key).context("invalid record")?;
                assert!(
                    rel.forknum == VISIBILITYMAP_FORKNUM,
                    "ClearVisibilityMapFlags record on unexpected rel {}",
                    rel
                );
                if let Some(heap_blkno) = *new_heap_blkno {
                    // Calculate the VM block and offset that correspond to the heap block.
                    let map_block = pg_constants::HEAPBLK_TO_MAPBLOCK(heap_blkno);
                    let map_byte = pg_constants::HEAPBLK_TO_MAPBYTE(heap_blkno);
                    let map_offset = pg_constants::HEAPBLK_TO_OFFSET(heap_blkno);

                    // Check that we're modifying the correct VM block.
                    assert!(map_block == blknum);

                    // equivalent to PageGetContents(page)
                    let map = &mut page[pg_constants::MAXALIGN_SIZE_OF_PAGE_HEADER_DATA..];

                    map[map_byte as usize] &= !(flags << map_offset);
                }

                // Repeat for 'old_heap_blkno', if any
                if let Some(heap_blkno) = *old_heap_blkno {
                    let map_block = pg_constants::HEAPBLK_TO_MAPBLOCK(heap_blkno);
                    let map_byte = pg_constants::HEAPBLK_TO_MAPBYTE(heap_blkno);
                    let map_offset = pg_constants::HEAPBLK_TO_OFFSET(heap_blkno);

                    assert!(map_block == blknum);

                    let map = &mut page[pg_constants::MAXALIGN_SIZE_OF_PAGE_HEADER_DATA..];

                    map[map_byte as usize] &= !(flags << map_offset);
                }
            }
            // Non-relational WAL records are handled here, with custom code that has the
            // same effects as the corresponding Postgres WAL redo function.
            NeonWalRecord::ClogSetCommitted { xids, timestamp } => {
                let (slru_kind, segno, blknum) =
                    key_to_slru_block(key).context("invalid record")?;
                assert_eq!(
                    slru_kind,
                    SlruKind::Clog,
                    "ClogSetCommitted record with unexpected key {}",
                    key
                );
                for &xid in xids {
                    let pageno = xid / pg_constants::CLOG_XACTS_PER_PAGE;
                    let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
                    let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;

                    // Check that we're modifying the correct CLOG block.
                    assert!(
                        segno == expected_segno,
                        "ClogSetCommitted record for XID {} with unexpected key {}",
                        xid,
                        key
                    );
                    assert!(
                        blknum == expected_blknum,
                        "ClogSetCommitted record for XID {} with unexpected key {}",
                        xid,
                        key
                    );

                    transaction_id_set_status(
                        xid,
                        pg_constants::TRANSACTION_STATUS_COMMITTED,
                        page,
                    );
                }

                // Append the timestamp
                if page.len() == BLCKSZ as usize + 8 {
                    page.truncate(BLCKSZ as usize);
                }
                if page.len() == BLCKSZ as usize {
                    page.extend_from_slice(&timestamp.to_be_bytes());
                } else {
                    warn!(
                        "CLOG blk {} in seg {} has invalid size {}",
                        blknum,
                        segno,
                        page.len()
                    );
                }
            }
            NeonWalRecord::ClogSetAborted { xids } => {
                let (slru_kind, segno, blknum) =
                    key_to_slru_block(key).context("invalid record")?;
                assert_eq!(
                    slru_kind,
                    SlruKind::Clog,
                    "ClogSetAborted record with unexpected key {}",
                    key
                );
                for &xid in xids {
                    let pageno = xid / pg_constants::CLOG_XACTS_PER_PAGE;
                    let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
                    let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;

                    // Check that we're modifying the correct CLOG block.
                    assert!(
                        segno == expected_segno,
                        "ClogSetAborted record for XID {} with unexpected key {}",
                        xid,
                        key
                    );
                    assert!(
                        blknum == expected_blknum,
                        "ClogSetAborted record for XID {} with unexpected key {}",
                        xid,
                        key
                    );

                    transaction_id_set_status(xid, pg_constants::TRANSACTION_STATUS_ABORTED, page);
                }
            }
            NeonWalRecord::MultixactOffsetCreate { mid, moff } => {
                let (slru_kind, segno, blknum) =
                    key_to_slru_block(key).context("invalid record")?;
                assert_eq!(
                    slru_kind,
                    SlruKind::MultiXactOffsets,
                    "MultixactOffsetCreate record with unexpected key {}",
                    key
                );
                // Compute the block and offset to modify.
                // See RecordNewMultiXact in PostgreSQL sources.
                let pageno = mid / pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32;
                let entryno = mid % pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32;
                let offset = (entryno * 4) as usize;

                // Check that we're modifying the correct multixact-offsets block.
                let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
                let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
                assert!(
                    segno == expected_segno,
                    "MultiXactOffsetsCreate record for multi-xid {} with unexpected key {}",
                    mid,
                    key
                );
                assert!(
                    blknum == expected_blknum,
                    "MultiXactOffsetsCreate record for multi-xid {} with unexpected key {}",
                    mid,
                    key
                );

                LittleEndian::write_u32(&mut page[offset..offset + 4], *moff);
            }
            NeonWalRecord::MultixactMembersCreate { moff, members } => {
                let (slru_kind, segno, blknum) =
                    key_to_slru_block(key).context("invalid record")?;
                assert_eq!(
                    slru_kind,
                    SlruKind::MultiXactMembers,
                    "MultixactMembersCreate record with unexpected key {}",
                    key
                );
                for (i, member) in members.iter().enumerate() {
                    let offset = moff + i as u32;

                    // Compute the block and offset to modify.
                    // See RecordNewMultiXact in PostgreSQL sources.
                    let pageno = offset / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32;
                    let memberoff = mx_offset_to_member_offset(offset);
                    let flagsoff = mx_offset_to_flags_offset(offset);
                    let bshift = mx_offset_to_flags_bitshift(offset);

                    // Check that we're modifying the correct multixact-members block.
                    let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
                    let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
                    assert!(
                        segno == expected_segno,
                        "MultiXactMembersCreate record for offset {} with unexpected key {}",
                        moff,
                        key
                    );
                    assert!(
                        blknum == expected_blknum,
                        "MultiXactMembersCreate record for offset {} with unexpected key {}",
                        moff,
                        key
                    );

                    let mut flagsval = LittleEndian::read_u32(&page[flagsoff..flagsoff + 4]);
                    flagsval &= !(((1 << pg_constants::MXACT_MEMBER_BITS_PER_XACT) - 1) << bshift);
                    flagsval |= member.status << bshift;
                    LittleEndian::write_u32(&mut page[flagsoff..flagsoff + 4], flagsval);
                    LittleEndian::write_u32(&mut page[memberoff..memberoff + 4], member.xid);
                }
            }
        }

        Ok(())
    }
}

struct WalRedoProcess {
    #[allow(dead_code)]
    conf: &'static PageServerConf,
    tenant_shard_id: TenantShardId,
    // Some() on construction, only becomes None on Drop.
    child: Option<NoLeakChild>,
    stdout: Mutex<ProcessOutput>,
    stdin: Mutex<ProcessInput>,
    /// Counter to separate same-sized walredo inputs failing at the same millisecond.
    #[cfg(feature = "testing")]
    dump_sequence: AtomicUsize,
}

impl WalRedoProcess {
    //
    // Start postgres binary in special WAL redo mode.
    //
    #[instrument(skip_all,fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), pg_version=pg_version))]
    fn launch(
        conf: &'static PageServerConf,
        tenant_shard_id: TenantShardId,
        pg_version: u32,
    ) -> anyhow::Result<Self> {
        let pg_bin_dir_path = conf.pg_bin_dir(pg_version).context("pg_bin_dir")?; // TODO these should be infallible.
        let pg_lib_dir_path = conf.pg_lib_dir(pg_version).context("pg_lib_dir")?;

        // Start postgres itself
        let child = Command::new(pg_bin_dir_path.join("postgres"))
            // the first arg must be --wal-redo so the child process enters into walredo mode
            .arg("--wal-redo")
            // the child doesn't process this arg, but having it in the argv helps identify the
            // walredo process for a particular tenant when debugging a pageserver
            .args(["--tenant-shard-id", &format!("{tenant_shard_id}")])
            .stdin(Stdio::piped())
            .stderr(Stdio::piped())
            .stdout(Stdio::piped())
            .env_clear()
            .env("LD_LIBRARY_PATH", &pg_lib_dir_path)
            .env("DYLD_LIBRARY_PATH", &pg_lib_dir_path)
            // NB: The redo process is not trusted after we have sent it the first
            // walredo work. Before that, it is trusted. Specifically, we trust
            // it to
            // 1. close all file descriptors except stdin, stdout, stderr because
            //    pageserver might not be 100% diligent in setting FD_CLOEXEC on all
            //    the files it opens, and
            // 2. use seccomp to sandbox itself before processing the first
            //    walredo request.
            .spawn_no_leak_child(tenant_shard_id)
            .context("spawn process")?;
        WAL_REDO_PROCESS_COUNTERS.started.inc();
        let mut child = scopeguard::guard(child, |child| {
            error!("killing wal-redo-postgres process due to a problem during launch");
            child.kill_and_wait(WalRedoKillCause::Startup);
        });

        let stdin = child.stdin.take().unwrap();
        let stdout = child.stdout.take().unwrap();
        let stderr = child.stderr.take().unwrap();
        let stderr = tokio::process::ChildStderr::from_std(stderr)
            .context("convert to tokio::ChildStderr")?;
        macro_rules! set_nonblock_or_log_err {
            ($file:ident) => {{
                let res = set_nonblock($file.as_raw_fd());
                if let Err(e) = &res {
                    error!(error = %e, file = stringify!($file), pid = child.id(), "set_nonblock failed");
                }
                res
            }};
        }
        set_nonblock_or_log_err!(stdin)?;
        set_nonblock_or_log_err!(stdout)?;

        // all fallible operations post-spawn are complete, so get rid of the guard
        let child = scopeguard::ScopeGuard::into_inner(child);

        tokio::spawn(
            async move {
                scopeguard::defer! {
                    debug!("wal-redo-postgres stderr_logger_task finished");
                    crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_finished.inc();
                }
                debug!("wal-redo-postgres stderr_logger_task started");
                crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_started.inc();

                use tokio::io::AsyncBufReadExt;
                let mut stderr_lines = tokio::io::BufReader::new(stderr);
                let mut buf = Vec::new();
                let res = loop {
                    buf.clear();
                    // TODO we don't trust the process to cap its stderr length.
                    // Currently it can do unbounded Vec allocation.
                    match stderr_lines.read_until(b'\n', &mut buf).await {
                        Ok(0) => break Ok(()), // eof
                        Ok(num_bytes) => {
                            let output = String::from_utf8_lossy(&buf[..num_bytes]);
                            error!(%output, "received output");
                        }
                        Err(e) => {
                            break Err(e);
                        }
                    }
                };
                match res {
                    Ok(()) => (),
                    Err(e) => {
                        error!(error=?e, "failed to read from walredo stderr");
                    }
                }
            }.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %pg_version))
        );

        Ok(Self {
            conf,
            tenant_shard_id,
            child: Some(child),
            stdin: Mutex::new(ProcessInput {
                stdin,
                n_requests: 0,
            }),
            stdout: Mutex::new(ProcessOutput {
                stdout,
                pending_responses: VecDeque::new(),
                n_processed_responses: 0,
            }),
            #[cfg(feature = "testing")]
            dump_sequence: AtomicUsize::default(),
        })
    }

    fn id(&self) -> u32 {
        self.child
            .as_ref()
            .expect("must not call this during Drop")
            .id()
    }

    // Apply given WAL records ('records') over an old page image. Returns
    // the new page image.
    //
    #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), pid=%self.id()))]
    fn apply_wal_records(
        &self,
        tag: BufferTag,
        base_img: &Option<Bytes>,
        records: &[(Lsn, NeonWalRecord)],
        wal_redo_timeout: Duration,
    ) -> anyhow::Result<Bytes> {
        let input = self.stdin.lock().unwrap();

        // Serialize all the messages to send to the WAL redo process first.
        //
        // This could be problematic if there are millions of records to replay,
        // but in practice the number of records is usually so small that it doesn't
        // matter, and it's better to keep this code simple.
        //
        // Most requests start with a before-image with BLCKSZ bytes, followed
        // by some other WAL records. Start with a buffer that can hold that
        // comfortably.
        let mut writebuf: Vec<u8> = Vec::with_capacity((BLCKSZ as usize) * 3);
        build_begin_redo_for_block_msg(tag, &mut writebuf);
        if let Some(img) = base_img {
            build_push_page_msg(tag, img, &mut writebuf);
        }
        for (lsn, rec) in records.iter() {
            if let NeonWalRecord::Postgres {
                will_init: _,
                rec: postgres_rec,
            } = rec
            {
                build_apply_record_msg(*lsn, postgres_rec, &mut writebuf);
            } else {
                anyhow::bail!("tried to pass neon wal record to postgres WAL redo");
            }
        }
        build_get_page_msg(tag, &mut writebuf);
        WAL_REDO_RECORD_COUNTER.inc_by(records.len() as u64);

        let res = self.apply_wal_records0(&writebuf, input, wal_redo_timeout);

        if res.is_err() {
            // not all of these can be caused by this particular input, however they
            // are so rare in tests that we capture them all.
            self.record_and_log(&writebuf);
        }

        res
    }

    fn apply_wal_records0(
        &self,
        writebuf: &[u8],
        input: MutexGuard<ProcessInput>,
        wal_redo_timeout: Duration,
    ) -> anyhow::Result<Bytes> {
        let mut proc = { input }; // TODO: remove this legacy rename, but for now it keeps the patch small.
        let mut nwrite = 0usize;

        while nwrite < writebuf.len() {
            let mut stdin_pollfds = [PollFd::new(&proc.stdin, PollFlags::POLLOUT)];
            let n = loop {
                match nix::poll::poll(&mut stdin_pollfds[..], wal_redo_timeout.as_millis() as i32) {
                    Err(nix::errno::Errno::EINTR) => continue,
                    res => break res,
                }
            }?;

            if n == 0 {
                anyhow::bail!("WAL redo timed out");
            }

            // If 'stdin' is writeable, do write.
            let in_revents = stdin_pollfds[0].revents().unwrap();
            if in_revents & (PollFlags::POLLERR | PollFlags::POLLOUT) != PollFlags::empty() {
                nwrite += proc.stdin.write(&writebuf[nwrite..])?;
            }
            if in_revents.contains(PollFlags::POLLHUP) {
                // We still have more data to write, but the process closed the pipe.
                anyhow::bail!("WAL redo process closed its stdin unexpectedly");
            }
        }
        let request_no = proc.n_requests;
        proc.n_requests += 1;
        drop(proc);

        // To improve walredo performance we separate sending requests and receiving
        // responses. They are protected by different mutexes (input and output).
        // If threads T1, T2, T3 send requests D1, D2, D3 to the walredo process,
        // there is no guarantee that T1 will be granted the output mutex lock first.
        // To address this issue we maintain the number of sent requests, the number
        // of processed responses, and a ring buffer of pending responses. After
        // sending a request (under the input mutex), a thread remembers its request
        // number. It then releases the input mutex, locks the output mutex, and
        // fetches into the ring buffer all responses up to its stored request
        // number. Then it takes the corresponding element from the pending-responses
        // ring buffer and truncates all empty elements from the front, advancing the
        // processed-responses number.

        let mut output = self.stdout.lock().unwrap();
        let n_processed_responses = output.n_processed_responses;
        while n_processed_responses + output.pending_responses.len() <= request_no {
            // We expect the WAL redo process to respond with an 8k page image. We read it
            // into this buffer.
            let mut resultbuf = vec![0; BLCKSZ.into()];
            let mut nresult: usize = 0; // # of bytes read into 'resultbuf' so far
            while nresult < BLCKSZ.into() {
                let mut stdout_pollfds = [PollFd::new(&output.stdout, PollFlags::POLLIN)];
                // We do two things simultaneously: read the response from stdout,
                // and forward any logging information that the child writes to its stderr to the page server's log.
                let n = loop {
                    match nix::poll::poll(
                        &mut stdout_pollfds[..],
                        wal_redo_timeout.as_millis() as i32,
                    ) {
                        Err(nix::errno::Errno::EINTR) => continue,
                        res => break res,
                    }
                }?;

                if n == 0 {
                    anyhow::bail!("WAL redo timed out");
                }

                // If we have some data in stdout, read it to the result buffer.
                let out_revents = stdout_pollfds[0].revents().unwrap();
                if out_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() {
                    nresult += output.stdout.read(&mut resultbuf[nresult..])?;
                }
                if out_revents.contains(PollFlags::POLLHUP) {
                    anyhow::bail!("WAL redo process closed its stdout unexpectedly");
                }
            }
            output
                .pending_responses
                .push_back(Some(Bytes::from(resultbuf)));
        }
        // Replace our request's response with None in `pending_responses`.
        // Then make space in the ring buffer by clearing out any sequence of contiguous
        // `None`s from the front of `pending_responses`.
        // NB: We can't pop_front() other requests' responses because another
        // requester might have grabbed the output mutex before us:
        // T1: grab input mutex
        // T1: send request_no 23
        // T1: release input mutex
        // T2: grab input mutex
        // T2: send request_no 24
        // T2: release input mutex
        // T2: grab output mutex
        // T2: n_processed_responses + output.pending_responses.len() <= request_no
        //                   23                                0                   24
        // T2: enters poll loop that reads stdout
        // T2: put response for 23 into pending_responses
        // T2: put response for 24 into pending_responses
        //     pending_responses now looks like this: Front Some(response_23) Some(response_24) Back
        // T2: takes its response_24
        //     pending_responses now looks like this: Front Some(response_23) None Back
        // T2: does the while loop below
        //     pending_responses now looks like this: Front Some(response_23) None Back
        // T2: releases output mutex
        // T1: grabs output mutex
        // T1: n_processed_responses + output.pending_responses.len() > request_no
        //                   23                                2                   23
        // T1: skips poll loop that reads stdout
        // T1: takes its response_23
        //     pending_responses now looks like this: Front None None Back
        // T1: does the while loop below
        //     pending_responses now looks like this: Front Back
        //     n_processed_responses now has value 25
        let res = output.pending_responses[request_no - n_processed_responses]
            .take()
            .expect("we own this request_no, nobody else is supposed to take it");
        while let Some(front) = output.pending_responses.front() {
            if front.is_none() {
                output.pending_responses.pop_front();
                output.n_processed_responses += 1;
            } else {
                break;
            }
        }
        Ok(res)
    }

    #[cfg(feature = "testing")]
    fn record_and_log(&self, writebuf: &[u8]) {
        let millis = std::time::SystemTime::now()
            .duration_since(std::time::SystemTime::UNIX_EPOCH)
            .unwrap()
            .as_millis();

        let seq = self.dump_sequence.fetch_add(1, Ordering::Relaxed);

        // these files will be collected to an allure report
        let filename = format!("walredo-{millis}-{}-{seq}.walredo", writebuf.len());

        let path = self.conf.tenant_path(&self.tenant_shard_id).join(&filename);

        let res = std::fs::OpenOptions::new()
            .write(true)
            .create_new(true)
            .read(true)
            .open(path)
            .and_then(|mut f| f.write_all(writebuf));

        // trip up allowed_errors
        if let Err(e) = res {
            tracing::error!(target=%filename, length=writebuf.len(), "failed to write out the walredo errored input: {e}");
        } else {
            tracing::error!(filename, "erroring walredo input saved");
        }
    }

    #[cfg(not(feature = "testing"))]
    fn record_and_log(&self, _: &[u8]) {}
}

impl Drop for WalRedoProcess {
    fn drop(&mut self) {
        self.child
            .take()
            .expect("we only do this once")
            .kill_and_wait(WalRedoKillCause::WalRedoProcessDrop);
        // no way to wait for stderr_logger_task from Drop because that is async only
    }
}

/// Wrapper type around `std::process::Child` which guarantees that the child
/// will be killed and waited-for by this process before being dropped.
struct NoLeakChild {
    tenant_id: TenantShardId,
    child: Option<Child>,
}
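
// Usage sketch (the only real caller in this file is launch() above; the
// command shown here is abbreviated):
//
//     let child = Command::new(pg_bin_dir_path.join("postgres"))
//         .arg("--wal-redo")
//         .spawn_no_leak_child(tenant_shard_id)?; // wraps Command::spawn()
//     // ... use it like a plain std::process::Child via Deref/DerefMut ...
//     child.kill_and_wait(WalRedoKillCause::Startup); // explicit teardown, or
//     // just drop it: Drop offloads kill+wait onto the tokio runtime.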

impl Deref for NoLeakChild {
    type Target = Child;

    fn deref(&self) -> &Self::Target {
        self.child.as_ref().expect("must not use from drop")
    }
}

impl DerefMut for NoLeakChild {
    fn deref_mut(&mut self) -> &mut Self::Target {
        self.child.as_mut().expect("must not use from drop")
    }
}

impl NoLeakChild {
    fn spawn(tenant_id: TenantShardId, command: &mut Command) -> io::Result<Self> {
        let child = command.spawn()?;
        Ok(NoLeakChild {
            tenant_id,
            child: Some(child),
        })
    }

    fn kill_and_wait(mut self, cause: WalRedoKillCause) {
        let child = match self.child.take() {
            Some(child) => child,
            None => return,
        };
        Self::kill_and_wait_impl(child, cause);
    }

    #[instrument(skip_all, fields(pid=child.id(), ?cause))]
    fn kill_and_wait_impl(mut child: Child, cause: WalRedoKillCause) {
        scopeguard::defer! {
            WAL_REDO_PROCESS_COUNTERS.killed_by_cause[cause].inc();
        }
        let res = child.kill();
        if let Err(e) = res {
            // This branch is very unlikely because:
            // - We (= pageserver) spawned this process successfully, so, we're allowed to kill it.
            // - This is the only place that calls .kill()
            // - We consume `self`, so, .kill() can't be called twice.
            // - If the process exited by itself or was killed by someone else,
            //   .kill() will still succeed because we haven't wait()'ed yet.
            //
            // So, if we arrive here, we have really no idea what happened,
            // whether the PID stored in self.child is still valid, etc.
            // If this function were fallible, we'd return an error, but
            // since it isn't, all we can do is log an error and proceed
            // with the wait().
            error!(error = %e, "failed to SIGKILL; subsequent wait() might fail or wait for wrong process");
        }

        match child.wait() {
            Ok(exit_status) => {
                info!(exit_status = %exit_status, "wait successful");
            }
            Err(e) => {
                error!(error = %e, "wait error; might leak the child process; it will show as zombie (defunct)");
            }
        }
    }
}

impl Drop for NoLeakChild {
    fn drop(&mut self) {
        let child = match self.child.take() {
            Some(child) => child,
            None => return,
        };
        let tenant_shard_id = self.tenant_id;
        // Offload the kill+wait of the child process into the background.
        // If someone stops the runtime, we'll leak the child process.
        // We can ignore that case because we only stop the runtime on pageserver exit.
        tokio::runtime::Handle::current().spawn(async move {
            tokio::task::spawn_blocking(move || {
                // Intentionally don't inherit the tracing context from whoever is dropping us.
                // This thread here is going to outlive our dropper.
                let span = tracing::info_span!(
                    "walredo",
                    tenant_id = %tenant_shard_id.tenant_id,
                    shard_id = %tenant_shard_id.shard_slug()
                );
                let _entered = span.enter();
                Self::kill_and_wait_impl(child, WalRedoKillCause::NoLeakChildDrop);
            })
            .await
        });
    }
}

trait NoLeakChildCommandExt {
    fn spawn_no_leak_child(&mut self, tenant_id: TenantShardId) -> io::Result<NoLeakChild>;
}

impl NoLeakChildCommandExt for Command {
    fn spawn_no_leak_child(&mut self, tenant_id: TenantShardId) -> io::Result<NoLeakChild> {
        NoLeakChild::spawn(tenant_id, self)
    }
}

// Functions for constructing messages to send to the postgres WAL redo
// process. See pgxn/neon_walredo/walredoproc.c for
// explanation of the protocol.

fn build_begin_redo_for_block_msg(tag: BufferTag, buf: &mut Vec<u8>) {
    let len = 4 + 1 + 4 * 4;

    buf.put_u8(b'B');
    buf.put_u32(len as u32);

    tag.ser_into(buf)
        .expect("serialize BufferTag should always succeed");
}

fn build_push_page_msg(tag: BufferTag, base_img: &[u8], buf: &mut Vec<u8>) {
    assert!(base_img.len() == 8192);

    let len = 4 + 1 + 4 * 4 + base_img.len();

    buf.put_u8(b'P');
    buf.put_u32(len as u32);
    tag.ser_into(buf)
        .expect("serialize BufferTag should always succeed");
    buf.put(base_img);
}

fn build_apply_record_msg(endlsn: Lsn, rec: &[u8], buf: &mut Vec<u8>) {
    let len = 4 + 8 + rec.len();

    buf.put_u8(b'A');
    buf.put_u32(len as u32);
    buf.put_u64(endlsn.0);
    buf.put(rec);
}

fn build_get_page_msg(tag: BufferTag, buf: &mut Vec<u8>) {
    let len = 4 + 1 + 4 * 4;

    buf.put_u8(b'G');
    buf.put_u32(len as u32);
    tag.ser_into(buf)
        .expect("serialize BufferTag should always succeed");
}
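
// For illustration, the byte stream for one redo request as assembled by
// apply_wal_records() above (a sketch of the framing; the authoritative
// definition is in pgxn/neon_walredo/walredoproc.c):
//
//     'B' len=21            BufferTag                    begin redo for block
//     'P' len=21+8192       BufferTag + 8192-byte image  push base page (optional)
//     'A' len=12+rec.len()  end LSN (u64) + WAL record   one per record
//     'G' len=21            BufferTag                    get page; the child
//                                                        replies with one
//                                                        8192-byte page image
//
// Each `len` is a big-endian u32 counting itself and the payload, but not the
// one-byte message tag.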

#[cfg(test)]
mod tests {
    use super::PostgresRedoManager;
@@ -373,7 +1150,6 @@ mod tests {
    use bytes::Bytes;
    use pageserver_api::shard::TenantShardId;
    use std::str::FromStr;
    use tracing::Instrument;
    use utils::{id::TenantId, lsn::Lsn};

    #[tokio::test]
@@ -398,7 +1174,6 @@ mod tests {
                short_records(),
                14,
            )
            .instrument(h.span())
            .await
            .unwrap();

@@ -426,7 +1201,6 @@ mod tests {
                short_records(),
                14,
            )
            .instrument(h.span())
            .await
            .unwrap();

@@ -447,7 +1221,6 @@ mod tests {
                short_records(),
                16, /* 16 currently produces stderr output on startup, which adds a nice extra edge */
            )
            .instrument(h.span())
            .await
            .unwrap_err();
    }
@@ -476,7 +1249,6 @@ mod tests {
        // underscored because unused, except for removal at drop
        _repo_dir: camino_tempfile::Utf8TempDir,
        manager: PostgresRedoManager,
        tenant_shard_id: TenantShardId,
    }

    impl RedoHarness {
@@ -493,11 +1265,7 @@ mod tests {
            Ok(RedoHarness {
                _repo_dir: repo_dir,
                manager,
                tenant_shard_id,
            })
        }
        fn span(&self) -> tracing::Span {
            tracing::info_span!("RedoHarness", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug())
        }
    }
}

@@ -1,235 +0,0 @@
use crate::walrecord::NeonWalRecord;
use anyhow::Context;
use byteorder::{ByteOrder, LittleEndian};
use bytes::BytesMut;
use pageserver_api::key::{key_to_rel_block, key_to_slru_block, Key};
use pageserver_api::reltag::SlruKind;
use postgres_ffi::pg_constants;
use postgres_ffi::relfile_utils::VISIBILITYMAP_FORKNUM;
use postgres_ffi::v14::nonrelfile_utils::{
    mx_offset_to_flags_bitshift, mx_offset_to_flags_offset, mx_offset_to_member_offset,
    transaction_id_set_status,
};
use postgres_ffi::BLCKSZ;
use tracing::*;

/// Can this request be served by neon redo functions,
/// or do we need to pass it to the wal-redo postgres process?
pub(crate) fn can_apply_in_neon(rec: &NeonWalRecord) -> bool {
    // Currently, we don't have bespoke Rust code to replay any
    // Postgres WAL records. But everything else is handled in neon.
    #[allow(clippy::match_like_matches_macro)]
    match rec {
        NeonWalRecord::Postgres {
            will_init: _,
            rec: _,
        } => false,
        _ => true,
    }
}

pub(crate) fn apply_in_neon(
    record: &NeonWalRecord,
    key: Key,
    page: &mut BytesMut,
) -> Result<(), anyhow::Error> {
    match record {
        NeonWalRecord::Postgres {
            will_init: _,
            rec: _,
        } => {
            anyhow::bail!("tried to pass postgres wal record to neon WAL redo");
        }
        NeonWalRecord::ClearVisibilityMapFlags {
            new_heap_blkno,
            old_heap_blkno,
            flags,
        } => {
            // sanity check that this is modifying the correct relation
            let (rel, blknum) = key_to_rel_block(key).context("invalid record")?;
            assert!(
                rel.forknum == VISIBILITYMAP_FORKNUM,
                "ClearVisibilityMapFlags record on unexpected rel {}",
                rel
            );
            if let Some(heap_blkno) = *new_heap_blkno {
                // Calculate the VM block and offset that correspond to the heap block.
                let map_block = pg_constants::HEAPBLK_TO_MAPBLOCK(heap_blkno);
                let map_byte = pg_constants::HEAPBLK_TO_MAPBYTE(heap_blkno);
                let map_offset = pg_constants::HEAPBLK_TO_OFFSET(heap_blkno);

                // Check that we're modifying the correct VM block.
                assert!(map_block == blknum);

                // equivalent to PageGetContents(page)
                let map = &mut page[pg_constants::MAXALIGN_SIZE_OF_PAGE_HEADER_DATA..];

                map[map_byte as usize] &= !(flags << map_offset);
            }

            // Repeat for 'old_heap_blkno', if any
            if let Some(heap_blkno) = *old_heap_blkno {
                let map_block = pg_constants::HEAPBLK_TO_MAPBLOCK(heap_blkno);
                let map_byte = pg_constants::HEAPBLK_TO_MAPBYTE(heap_blkno);
                let map_offset = pg_constants::HEAPBLK_TO_OFFSET(heap_blkno);

                assert!(map_block == blknum);

                let map = &mut page[pg_constants::MAXALIGN_SIZE_OF_PAGE_HEADER_DATA..];

                map[map_byte as usize] &= !(flags << map_offset);
            }
        }
        // Non-relational WAL records are handled here, with custom code that has the
        // same effects as the corresponding Postgres WAL redo function.
        NeonWalRecord::ClogSetCommitted { xids, timestamp } => {
            let (slru_kind, segno, blknum) = key_to_slru_block(key).context("invalid record")?;
            assert_eq!(
                slru_kind,
                SlruKind::Clog,
                "ClogSetCommitted record with unexpected key {}",
                key
            );
            for &xid in xids {
                let pageno = xid / pg_constants::CLOG_XACTS_PER_PAGE;
                let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
                let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;

                // Check that we're modifying the correct CLOG block.
                assert!(
                    segno == expected_segno,
                    "ClogSetCommitted record for XID {} with unexpected key {}",
                    xid,
                    key
                );
                assert!(
                    blknum == expected_blknum,
                    "ClogSetCommitted record for XID {} with unexpected key {}",
                    xid,
                    key
                );

                transaction_id_set_status(xid, pg_constants::TRANSACTION_STATUS_COMMITTED, page);
            }

            // Append the timestamp
            if page.len() == BLCKSZ as usize + 8 {
                page.truncate(BLCKSZ as usize);
            }
            if page.len() == BLCKSZ as usize {
                page.extend_from_slice(&timestamp.to_be_bytes());
            } else {
                warn!(
                    "CLOG blk {} in seg {} has invalid size {}",
                    blknum,
                    segno,
                    page.len()
                );
            }
        }
        NeonWalRecord::ClogSetAborted { xids } => {
            let (slru_kind, segno, blknum) = key_to_slru_block(key).context("invalid record")?;
            assert_eq!(
                slru_kind,
                SlruKind::Clog,
                "ClogSetAborted record with unexpected key {}",
                key
            );
            for &xid in xids {
                let pageno = xid / pg_constants::CLOG_XACTS_PER_PAGE;
                let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
                let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;

                // Check that we're modifying the correct CLOG block.
                assert!(
                    segno == expected_segno,
                    "ClogSetAborted record for XID {} with unexpected key {}",
                    xid,
                    key
                );
                assert!(
                    blknum == expected_blknum,
                    "ClogSetAborted record for XID {} with unexpected key {}",
                    xid,
                    key
                );

                transaction_id_set_status(xid, pg_constants::TRANSACTION_STATUS_ABORTED, page);
            }
        }
        NeonWalRecord::MultixactOffsetCreate { mid, moff } => {
            let (slru_kind, segno, blknum) = key_to_slru_block(key).context("invalid record")?;
            assert_eq!(
                slru_kind,
                SlruKind::MultiXactOffsets,
                "MultixactOffsetCreate record with unexpected key {}",
                key
            );
            // Compute the block and offset to modify.
            // See RecordNewMultiXact in PostgreSQL sources.
            let pageno = mid / pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32;
            let entryno = mid % pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32;
            let offset = (entryno * 4) as usize;

            // Check that we're modifying the correct multixact-offsets block.
            let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
            let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
            assert!(
                segno == expected_segno,
                "MultiXactOffsetsCreate record for multi-xid {} with unexpected key {}",
                mid,
                key
            );
            assert!(
                blknum == expected_blknum,
                "MultiXactOffsetsCreate record for multi-xid {} with unexpected key {}",
                mid,
                key
            );

            LittleEndian::write_u32(&mut page[offset..offset + 4], *moff);
        }
        NeonWalRecord::MultixactMembersCreate { moff, members } => {
            let (slru_kind, segno, blknum) = key_to_slru_block(key).context("invalid record")?;
            assert_eq!(
                slru_kind,
                SlruKind::MultiXactMembers,
                "MultixactMembersCreate record with unexpected key {}",
                key
            );
            for (i, member) in members.iter().enumerate() {
                let offset = moff + i as u32;

                // Compute the block and offset to modify.
                // See RecordNewMultiXact in PostgreSQL sources.
                let pageno = offset / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32;
                let memberoff = mx_offset_to_member_offset(offset);
                let flagsoff = mx_offset_to_flags_offset(offset);
                let bshift = mx_offset_to_flags_bitshift(offset);

                // Check that we're modifying the correct multixact-members block.
                let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
                let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
                assert!(
                    segno == expected_segno,
                    "MultiXactMembersCreate record for offset {} with unexpected key {}",
                    moff,
                    key
                );
                assert!(
                    blknum == expected_blknum,
                    "MultiXactMembersCreate record for offset {} with unexpected key {}",
                    moff,
                    key
                );

                let mut flagsval = LittleEndian::read_u32(&page[flagsoff..flagsoff + 4]);
                flagsval &= !(((1 << pg_constants::MXACT_MEMBER_BITS_PER_XACT) - 1) << bshift);
                flagsval |= member.status << bshift;
                LittleEndian::write_u32(&mut page[flagsoff..flagsoff + 4], flagsval);
                LittleEndian::write_u32(&mut page[memberoff..memberoff + 4], member.xid);
            }
        }
    }
    Ok(())
}
@@ -1,408 +0,0 @@
use self::no_leak_child::NoLeakChild;
use crate::{
    config::PageServerConf,
    metrics::{WalRedoKillCause, WAL_REDO_PROCESS_COUNTERS, WAL_REDO_RECORD_COUNTER},
    walrecord::NeonWalRecord,
};
use anyhow::Context;
use bytes::Bytes;
use nix::poll::{PollFd, PollFlags};
use pageserver_api::{reltag::RelTag, shard::TenantShardId};
use postgres_ffi::BLCKSZ;
use std::os::fd::AsRawFd;
#[cfg(feature = "testing")]
use std::sync::atomic::AtomicUsize;
use std::{
    collections::VecDeque,
    io::{Read, Write},
    process::{ChildStdin, ChildStdout, Command, Stdio},
    sync::{Mutex, MutexGuard},
    time::Duration,
};
use tracing::{debug, error, instrument, Instrument};
use utils::{lsn::Lsn, nonblock::set_nonblock};

mod no_leak_child;
/// The IPC protocol that pageserver and walredo process speak over their shared pipe.
mod protocol;

pub struct WalRedoProcess {
    #[allow(dead_code)]
    conf: &'static PageServerConf,
    tenant_shard_id: TenantShardId,
    // Some() on construction, only becomes None on Drop.
    child: Option<NoLeakChild>,
    stdout: Mutex<ProcessOutput>,
    stdin: Mutex<ProcessInput>,
    /// Counter to separate same-sized walredo inputs failing at the same millisecond.
    #[cfg(feature = "testing")]
    dump_sequence: AtomicUsize,
}

struct ProcessInput {
    stdin: ChildStdin,
    n_requests: usize,
}

struct ProcessOutput {
    stdout: ChildStdout,
    pending_responses: VecDeque<Option<Bytes>>,
    n_processed_responses: usize,
}

impl WalRedoProcess {
    //
    // Start postgres binary in special WAL redo mode.
    //
    #[instrument(skip_all,fields(pg_version=pg_version))]
    pub(crate) fn launch(
        conf: &'static PageServerConf,
        tenant_shard_id: TenantShardId,
        pg_version: u32,
    ) -> anyhow::Result<Self> {
        crate::span::debug_assert_current_span_has_tenant_id();

        let pg_bin_dir_path = conf.pg_bin_dir(pg_version).context("pg_bin_dir")?; // TODO these should be infallible.
        let pg_lib_dir_path = conf.pg_lib_dir(pg_version).context("pg_lib_dir")?;

        use no_leak_child::NoLeakChildCommandExt;
        // Start postgres itself
        let child = Command::new(pg_bin_dir_path.join("postgres"))
            // the first arg must be --wal-redo so the child process enters into walredo mode
            .arg("--wal-redo")
            // the child doesn't process this arg, but having it in the argv helps identify the
            // walredo process for a particular tenant when debugging a pageserver
            .args(["--tenant-shard-id", &format!("{tenant_shard_id}")])
            .stdin(Stdio::piped())
            .stderr(Stdio::piped())
            .stdout(Stdio::piped())
            .env_clear()
            .env("LD_LIBRARY_PATH", &pg_lib_dir_path)
            .env("DYLD_LIBRARY_PATH", &pg_lib_dir_path)
            // NB: The redo process is not trusted after we have sent it the first
            // walredo work. Before that, it is trusted. Specifically, we trust
            // it to
            // 1. close all file descriptors except stdin, stdout, stderr because
            //    pageserver might not be 100% diligent in setting FD_CLOEXEC on all
            //    the files it opens, and
            // 2. use seccomp to sandbox itself before processing the first
            //    walredo request.
            .spawn_no_leak_child(tenant_shard_id)
            .context("spawn process")?;
        WAL_REDO_PROCESS_COUNTERS.started.inc();
        let mut child = scopeguard::guard(child, |child| {
            error!("killing wal-redo-postgres process due to a problem during launch");
            child.kill_and_wait(WalRedoKillCause::Startup);
        });

        let stdin = child.stdin.take().unwrap();
        let stdout = child.stdout.take().unwrap();
        let stderr = child.stderr.take().unwrap();
        let stderr = tokio::process::ChildStderr::from_std(stderr)
            .context("convert to tokio::ChildStderr")?;
        macro_rules! set_nonblock_or_log_err {
            ($file:ident) => {{
                let res = set_nonblock($file.as_raw_fd());
                if let Err(e) = &res {
                    error!(error = %e, file = stringify!($file), pid = child.id(), "set_nonblock failed");
                }
                res
            }};
        }
        set_nonblock_or_log_err!(stdin)?;
        set_nonblock_or_log_err!(stdout)?;

        // all fallible operations post-spawn are complete, so get rid of the guard
        let child = scopeguard::ScopeGuard::into_inner(child);

        tokio::spawn(
            async move {
                scopeguard::defer! {
                    debug!("wal-redo-postgres stderr_logger_task finished");
                    crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_finished.inc();
                }
                debug!("wal-redo-postgres stderr_logger_task started");
                crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_started.inc();

                use tokio::io::AsyncBufReadExt;
                let mut stderr_lines = tokio::io::BufReader::new(stderr);
                let mut buf = Vec::new();
                let res = loop {
                    buf.clear();
                    // TODO we don't trust the process to cap its stderr length.
                    // Currently it can do unbounded Vec allocation.
                    match stderr_lines.read_until(b'\n', &mut buf).await {
                        Ok(0) => break Ok(()), // eof
                        Ok(num_bytes) => {
                            let output = String::from_utf8_lossy(&buf[..num_bytes]);
                            error!(%output, "received output");
                        }
                        Err(e) => {
                            break Err(e);
                        }
                    }
                };
                match res {
                    Ok(()) => (),
                    Err(e) => {
                        error!(error=?e, "failed to read from walredo stderr");
                    }
                }
            }.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %pg_version))
        );

        Ok(Self {
            conf,
            tenant_shard_id,
            child: Some(child),
            stdin: Mutex::new(ProcessInput {
                stdin,
                n_requests: 0,
            }),
            stdout: Mutex::new(ProcessOutput {
                stdout,
                pending_responses: VecDeque::new(),
                n_processed_responses: 0,
            }),
            #[cfg(feature = "testing")]
            dump_sequence: AtomicUsize::default(),
        })
    }

    pub(crate) fn id(&self) -> u32 {
        self.child
            .as_ref()
            .expect("must not call this during Drop")
            .id()
    }

    // Apply given WAL records ('records') over an old page image. Returns
    // the new page image.
    //
    #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), pid=%self.id()))]
    pub(crate) fn apply_wal_records(
        &self,
        rel: RelTag,
        blknum: u32,
        base_img: &Option<Bytes>,
        records: &[(Lsn, NeonWalRecord)],
        wal_redo_timeout: Duration,
    ) -> anyhow::Result<Bytes> {
        let tag = protocol::BufferTag { rel, blknum };
        let input = self.stdin.lock().unwrap();

        // Serialize all the messages to send to the WAL redo process first.
        //
        // This could be problematic if there are millions of records to replay,
        // but in practice the number of records is usually so small that it doesn't
        // matter, and it's better to keep this code simple.
        //
        // Most requests start with a before-image with BLCKSZ bytes, followed
        // by some other WAL records. Start with a buffer that can hold that
        // comfortably.
        let mut writebuf: Vec<u8> = Vec::with_capacity((BLCKSZ as usize) * 3);
        protocol::build_begin_redo_for_block_msg(tag, &mut writebuf);
        if let Some(img) = base_img {
            protocol::build_push_page_msg(tag, img, &mut writebuf);
        }
        for (lsn, rec) in records.iter() {
            if let NeonWalRecord::Postgres {
                will_init: _,
                rec: postgres_rec,
            } = rec
            {
                protocol::build_apply_record_msg(*lsn, postgres_rec, &mut writebuf);
            } else {
                anyhow::bail!("tried to pass neon wal record to postgres WAL redo");
            }
        }
        protocol::build_get_page_msg(tag, &mut writebuf);
        WAL_REDO_RECORD_COUNTER.inc_by(records.len() as u64);

        let res = self.apply_wal_records0(&writebuf, input, wal_redo_timeout);

        if res.is_err() {
            // not all of these can be caused by this particular input, however they
            // are so rare in tests that we capture them all.
            self.record_and_log(&writebuf);
        }

        res
    }

    fn apply_wal_records0(
        &self,
        writebuf: &[u8],
        input: MutexGuard<ProcessInput>,
        wal_redo_timeout: Duration,
    ) -> anyhow::Result<Bytes> {
        let mut proc = { input }; // TODO: remove this legacy rename, but for now it keeps the patch small.
        let mut nwrite = 0usize;

        while nwrite < writebuf.len() {
            let mut stdin_pollfds = [PollFd::new(&proc.stdin, PollFlags::POLLOUT)];
            let n = loop {
                match nix::poll::poll(&mut stdin_pollfds[..], wal_redo_timeout.as_millis() as i32) {
                    Err(nix::errno::Errno::EINTR) => continue,
                    res => break res,
                }
            }?;

            if n == 0 {
                anyhow::bail!("WAL redo timed out");
            }

            // If 'stdin' is writeable, do write.
            let in_revents = stdin_pollfds[0].revents().unwrap();
            if in_revents & (PollFlags::POLLERR | PollFlags::POLLOUT) != PollFlags::empty() {
                nwrite += proc.stdin.write(&writebuf[nwrite..])?;
            }
            if in_revents.contains(PollFlags::POLLHUP) {
                // We still have more data to write, but the process closed the pipe.
                anyhow::bail!("WAL redo process closed its stdin unexpectedly");
            }
        }
        let request_no = proc.n_requests;
        proc.n_requests += 1;
        drop(proc);

        // To improve walredo performance we separate sending requests and receiving
        // responses. They are protected by different mutexes (input and output).
        // If threads T1, T2, T3 send requests D1, D2, D3 to the walredo process,
        // there is no guarantee that T1 will be granted the output mutex lock first.
        // To address this issue we maintain the number of sent requests, the number
        // of processed responses, and a ring buffer of pending responses. After
        // sending a request (under the input mutex), a thread remembers its request
        // number. It then releases the input mutex, locks the output mutex, and
        // fetches into the ring buffer all responses up to its stored request
        // number. Then it takes the corresponding element from the pending-responses
        // ring buffer and truncates all empty elements from the front, advancing the
        // processed-responses number.

        let mut output = self.stdout.lock().unwrap();
        let n_processed_responses = output.n_processed_responses;
        while n_processed_responses + output.pending_responses.len() <= request_no {
            // We expect the WAL redo process to respond with an 8k page image. We read it
            // into this buffer.
            let mut resultbuf = vec![0; BLCKSZ.into()];
            let mut nresult: usize = 0; // # of bytes read into 'resultbuf' so far
            while nresult < BLCKSZ.into() {
                let mut stdout_pollfds = [PollFd::new(&output.stdout, PollFlags::POLLIN)];
                // We do two things simultaneously: read the response from stdout,
                // and forward any logging information that the child writes to its stderr to the page server's log.
                let n = loop {
                    match nix::poll::poll(
                        &mut stdout_pollfds[..],
                        wal_redo_timeout.as_millis() as i32,
                    ) {
                        Err(nix::errno::Errno::EINTR) => continue,
                        res => break res,
                    }
                }?;

                if n == 0 {
                    anyhow::bail!("WAL redo timed out");
                }

                // If we have some data in stdout, read it to the result buffer.
                let out_revents = stdout_pollfds[0].revents().unwrap();
                if out_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() {
                    nresult += output.stdout.read(&mut resultbuf[nresult..])?;
                }
                if out_revents.contains(PollFlags::POLLHUP) {
                    anyhow::bail!("WAL redo process closed its stdout unexpectedly");
                }
            }
            output
                .pending_responses
                .push_back(Some(Bytes::from(resultbuf)));
        }
        // Replace our request's response with None in `pending_responses`.
        // Then make space in the ring buffer by clearing out any sequence of contiguous
        // `None`s from the front of `pending_responses`.
        // NB: We can't pop_front() other requests' responses because another
        // requester might have grabbed the output mutex before us:
        // T1: grab input mutex
        // T1: send request_no 23
        // T1: release input mutex
        // T2: grab input mutex
        // T2: send request_no 24
        // T2: release input mutex
        // T2: grab output mutex
        // T2: n_processed_responses + output.pending_responses.len() <= request_no
        //                   23                                0                   24
        // T2: enters poll loop that reads stdout
        // T2: put response for 23 into pending_responses
        // T2: put response for 24 into pending_responses
        //     pending_responses now looks like this: Front Some(response_23) Some(response_24) Back
        // T2: takes its response_24
        //     pending_responses now looks like this: Front Some(response_23) None Back
        // T2: does the while loop below
        //     pending_responses now looks like this: Front Some(response_23) None Back
        // T2: releases output mutex
        // T1: grabs output mutex
        // T1: n_processed_responses + output.pending_responses.len() > request_no
        //                   23                                2                   23
        // T1: skips poll loop that reads stdout
        // T1: takes its response_23
        //     pending_responses now looks like this: Front None None Back
        // T1: does the while loop below
        //     pending_responses now looks like this: Front Back
        //     n_processed_responses now has value 25
        let res = output.pending_responses[request_no - n_processed_responses]
            .take()
            .expect("we own this request_no, nobody else is supposed to take it");
        while let Some(front) = output.pending_responses.front() {
            if front.is_none() {
                output.pending_responses.pop_front();
                output.n_processed_responses += 1;
            } else {
                break;
            }
        }
        Ok(res)
    }

    #[cfg(feature = "testing")]
    fn record_and_log(&self, writebuf: &[u8]) {
        use std::sync::atomic::Ordering;

        let millis = std::time::SystemTime::now()
            .duration_since(std::time::SystemTime::UNIX_EPOCH)
            .unwrap()
            .as_millis();

        let seq = self.dump_sequence.fetch_add(1, Ordering::Relaxed);

        // these files will be collected to an allure report
        let filename = format!("walredo-{millis}-{}-{seq}.walredo", writebuf.len());

        let path = self.conf.tenant_path(&self.tenant_shard_id).join(&filename);

        let res = std::fs::OpenOptions::new()
            .write(true)
            .create_new(true)
            .read(true)
            .open(path)
            .and_then(|mut f| f.write_all(writebuf));

        // trip up allowed_errors
        if let Err(e) = res {
            tracing::error!(target=%filename, length=writebuf.len(), "failed to write out the walredo errored input: {e}");
        } else {
            tracing::error!(filename, "erroring walredo input saved");
        }
    }

    #[cfg(not(feature = "testing"))]
    fn record_and_log(&self, _: &[u8]) {}
}

impl Drop for WalRedoProcess {
    fn drop(&mut self) {
        self.child
            .take()
            .expect("we only do this once")
            .kill_and_wait(WalRedoKillCause::WalRedoProcessDrop);
        // no way to wait for stderr_logger_task from Drop because that is async only
    }
}
@@ -1,126 +0,0 @@
|
||||
use tracing;
|
||||
use tracing::error;
|
||||
use tracing::info;
|
||||
use tracing::instrument;
|
||||
|
||||
use crate::metrics::WalRedoKillCause;
|
||||
use crate::metrics::WAL_REDO_PROCESS_COUNTERS;
|
||||
|
||||
use std::io;
|
||||
use std::process::Command;
|
||||
|
||||
use std::ops::DerefMut;
|
||||
|
||||
use std::ops::Deref;
|
||||
|
||||
use std::process::Child;
|
||||
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
|
||||
/// Wrapper type around `std::process::Child` which guarantees that the child
|
||||
/// will be killed and waited-for by this process before being dropped.
|
||||
pub(crate) struct NoLeakChild {
|
||||
pub(crate) tenant_id: TenantShardId,
|
||||
pub(crate) child: Option<Child>,
|
||||
}
|
||||
|
||||
impl Deref for NoLeakChild {
|
||||
type Target = Child;
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
self.child.as_ref().expect("must not use from drop")
|
||||
}
|
||||
}
|
||||
|
||||
impl DerefMut for NoLeakChild {
|
||||
fn deref_mut(&mut self) -> &mut Self::Target {
|
||||
self.child.as_mut().expect("must not use from drop")
|
||||
}
|
||||
}
|
||||
|
||||
impl NoLeakChild {
|
||||
pub(crate) fn spawn(tenant_id: TenantShardId, command: &mut Command) -> io::Result<Self> {
|
||||
let child = command.spawn()?;
|
||||
Ok(NoLeakChild {
|
||||
tenant_id,
|
||||
child: Some(child),
|
||||
})
|
||||
}
|
||||
|
||||
pub(crate) fn kill_and_wait(mut self, cause: WalRedoKillCause) {
|
||||
let child = match self.child.take() {
|
||||
Some(child) => child,
|
||||
None => return,
|
||||
};
|
||||
Self::kill_and_wait_impl(child, cause);
|
||||
}
|
||||
|
||||
#[instrument(skip_all, fields(pid=child.id(), ?cause))]
|
||||
pub(crate) fn kill_and_wait_impl(mut child: Child, cause: WalRedoKillCause) {
|
||||
scopeguard::defer! {
|
||||
WAL_REDO_PROCESS_COUNTERS.killed_by_cause[cause].inc();
|
||||
}
|
||||
let res = child.kill();
|
||||
if let Err(e) = res {
|
||||
// This branch is very unlikely because:
|
||||
// - We (= pageserver) spawned this process successfully, so, we're allowed to kill it.
|
||||
// - This is the only place that calls .kill()
|
||||
// - We consume `self`, so, .kill() can't be called twice.
|
||||
// - If the process exited by itself or was killed by someone else,
|
||||
// .kill() will still succeed because we haven't wait()'ed yet.
|
||||
//
|
||||
// So, if we arrive here, we have really no idea what happened,
|
||||
// whether the PID stored in self.child is still valid, etc.
|
||||
// If this function were fallible, we'd return an error, but
|
||||
// since it isn't, all we can do is log an error and proceed
|
||||
// with the wait().
|
||||
error!(error = %e, "failed to SIGKILL; subsequent wait() might fail or wait for wrong process");
|
||||
}
|
||||
|
||||
match child.wait() {
|
||||
Ok(exit_status) => {
|
||||
info!(exit_status = %exit_status, "wait successful");
|
||||
}
|
||||
Err(e) => {
|
||||
error!(error = %e, "wait error; might leak the child process; it will show as zombie (defunct)");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for NoLeakChild {
|
||||
fn drop(&mut self) {
|
||||
let child = match self.child.take() {
|
||||
Some(child) => child,
|
||||
None => return,
|
||||
};
|
||||
let tenant_shard_id = self.tenant_id;
|
||||
// Offload the kill+wait of the child process into the background.
|
||||
// If someone stops the runtime, we'll leak the child process.
|
||||
// We can ignore that case because we only stop the runtime on pageserver exit.
|
||||
tokio::runtime::Handle::current().spawn(async move {
|
||||
tokio::task::spawn_blocking(move || {
                // Intentionally don't inherit the tracing context from whoever is dropping us.
                // This thread here is going to outlive our dropper.
                let span = tracing::info_span!(
                    "walredo",
                    tenant_id = %tenant_shard_id.tenant_id,
                    shard_id = %tenant_shard_id.shard_slug()
                );
                let _entered = span.enter();
                Self::kill_and_wait_impl(child, WalRedoKillCause::NoLeakChildDrop);
            })
            .await
        });
    }
}

pub(crate) trait NoLeakChildCommandExt {
    fn spawn_no_leak_child(&mut self, tenant_id: TenantShardId) -> io::Result<NoLeakChild>;
}

impl NoLeakChildCommandExt for Command {
    fn spawn_no_leak_child(&mut self, tenant_id: TenantShardId) -> io::Result<NoLeakChild> {
        NoLeakChild::spawn(tenant_id, self)
    }
}
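As a usage illustration of the deleted API, a hedged sketch (the spawned command is a placeholder; `WalRedoKillCause::WalRedoProcessDrop` is one of the causes visible above):

use std::process::Command;

fn spawn_example(tenant_id: TenantShardId) -> std::io::Result<()> {
    // Requires NoLeakChildCommandExt to be in scope.
    let child = Command::new("sleep").arg("60").spawn_no_leak_child(tenant_id)?;
    // Either kill explicitly, recording a cause in the metrics ...
    child.kill_and_wait(WalRedoKillCause::WalRedoProcessDrop);
    // ... or simply drop the NoLeakChild: Drop offloads kill+wait to a blocking task.
    Ok(())
}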

@@ -1,57 +0,0 @@
use bytes::BufMut;
use pageserver_api::reltag::RelTag;
use serde::Serialize;
use utils::bin_ser::BeSer;
use utils::lsn::Lsn;

///
/// `RelTag` + block number (`blknum`) gives us a unique id of the page in the cluster.
///
/// In Postgres `BufferTag` structure is used for exactly the same purpose.
/// [See more related comments here](https://github.com/postgres/postgres/blob/99c5852e20a0987eca1c38ba0c09329d4076b6a0/src/include/storage/buf_internals.h#L91).
///
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Serialize)]
pub(crate) struct BufferTag {
    pub rel: RelTag,
    pub blknum: u32,
}

pub(crate) fn build_begin_redo_for_block_msg(tag: BufferTag, buf: &mut Vec<u8>) {
    let len = 4 + 1 + 4 * 4;

    buf.put_u8(b'B');
    buf.put_u32(len as u32);

    tag.ser_into(buf)
        .expect("serialize BufferTag should always succeed");
}

pub(crate) fn build_push_page_msg(tag: BufferTag, base_img: &[u8], buf: &mut Vec<u8>) {
    assert!(base_img.len() == 8192);

    let len = 4 + 1 + 4 * 4 + base_img.len();

    buf.put_u8(b'P');
    buf.put_u32(len as u32);
    tag.ser_into(buf)
        .expect("serialize BufferTag should always succeed");
    buf.put(base_img);
}

pub(crate) fn build_apply_record_msg(endlsn: Lsn, rec: &[u8], buf: &mut Vec<u8>) {
    let len = 4 + 8 + rec.len();

    buf.put_u8(b'A');
    buf.put_u32(len as u32);
    buf.put_u64(endlsn.0);
    buf.put(rec);
}

pub(crate) fn build_get_page_msg(tag: BufferTag, buf: &mut Vec<u8>) {
    let len = 4 + 1 + 4 * 4;

    buf.put_u8(b'G');
    buf.put_u32(len as u32);
    tag.ser_into(buf)
        .expect("serialize BufferTag should always succeed");
}
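Taken together, these builders encode one wal-redo request: a `B` (begin) frame, an optional `P` (push base image) frame, one `A` (apply record) frame per WAL record, and a final `G` (get page) frame. A hedged sketch of the composition (the helper name and argument values are hypothetical):

fn build_request(tag: BufferTag, base_img: Option<&[u8]>, records: &[(Lsn, Vec<u8>)]) -> Vec<u8> {
    let mut buf = Vec::new();
    build_begin_redo_for_block_msg(tag, &mut buf);
    if let Some(img) = base_img {
        build_push_page_msg(tag, img, &mut buf); // img must be exactly 8192 bytes
    }
    for (endlsn, rec) in records {
        build_apply_record_msg(*endlsn, rec, &mut buf);
    }
    build_get_page_msg(tag, &mut buf);
    buf // written to the wal-redo process's stdin as one request
}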

@@ -158,10 +158,7 @@ static XLogReaderState *reader_state;
#include <unistd.h>
#include <sys/syscall.h>
#include <errno.h>

static int
close_range_syscall(unsigned int start_fd, unsigned int count, unsigned int flags)
{
int close_range(unsigned int start_fd, unsigned int count, unsigned int flags) {
    return syscall(__NR_close_range, start_fd, count, flags);
}

@@ -175,7 +172,7 @@ enter_seccomp_mode(void)
 * wal records. See the comment in the Rust code that launches this process.
 */
int err;
if (err = close_range_syscall(3, ~0U, 0)) {
if (err = close_range(3, ~0U, 0)) {
    ereport(FATAL, (errcode(ERRCODE_SYSTEM_ERROR), errmsg("seccomp: could not close files >= fd 3")));
}

poetry.lock (generated, 65 lines changed)
@@ -836,56 +836,47 @@ files = [

[[package]]
name = "cryptography"
version = "42.0.0"
version = "41.0.6"
description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers."
optional = false
python-versions = ">=3.7"
files = [
    {file = "cryptography-42.0.0-cp37-abi3-macosx_10_12_universal2.whl", hash = "sha256:c640b0ef54138fde761ec99a6c7dc4ce05e80420262c20fa239e694ca371d434"},
    {file = "cryptography-42.0.0-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:678cfa0d1e72ef41d48993a7be75a76b0725d29b820ff3cfd606a5b2b33fda01"},
    {file = "cryptography-42.0.0-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:146e971e92a6dd042214b537a726c9750496128453146ab0ee8971a0299dc9bd"},
    {file = "cryptography-42.0.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:87086eae86a700307b544625e3ba11cc600c3c0ef8ab97b0fda0705d6db3d4e3"},
    {file = "cryptography-42.0.0-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:0a68bfcf57a6887818307600c3c0ebc3f62fbb6ccad2240aa21887cda1f8df1b"},
    {file = "cryptography-42.0.0-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:5a217bca51f3b91971400890905a9323ad805838ca3fa1e202a01844f485ee87"},
    {file = "cryptography-42.0.0-cp37-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:ca20550bb590db16223eb9ccc5852335b48b8f597e2f6f0878bbfd9e7314eb17"},
    {file = "cryptography-42.0.0-cp37-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:33588310b5c886dfb87dba5f013b8d27df7ffd31dc753775342a1e5ab139e59d"},
    {file = "cryptography-42.0.0-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:9515ea7f596c8092fdc9902627e51b23a75daa2c7815ed5aa8cf4f07469212ec"},
    {file = "cryptography-42.0.0-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:35cf6ed4c38f054478a9df14f03c1169bb14bd98f0b1705751079b25e1cb58bc"},
    {file = "cryptography-42.0.0-cp37-abi3-win32.whl", hash = "sha256:8814722cffcfd1fbd91edd9f3451b88a8f26a5fd41b28c1c9193949d1c689dc4"},
    {file = "cryptography-42.0.0-cp37-abi3-win_amd64.whl", hash = "sha256:a2a8d873667e4fd2f34aedab02ba500b824692c6542e017075a2efc38f60a4c0"},
    {file = "cryptography-42.0.0-cp39-abi3-macosx_10_12_universal2.whl", hash = "sha256:8fedec73d590fd30c4e3f0d0f4bc961aeca8390c72f3eaa1a0874d180e868ddf"},
    {file = "cryptography-42.0.0-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:be41b0c7366e5549265adf2145135dca107718fa44b6e418dc7499cfff6b4689"},
    {file = "cryptography-42.0.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3ca482ea80626048975360c8e62be3ceb0f11803180b73163acd24bf014133a0"},
    {file = "cryptography-42.0.0-cp39-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:c58115384bdcfe9c7f644c72f10f6f42bed7cf59f7b52fe1bf7ae0a622b3a139"},
    {file = "cryptography-42.0.0-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:56ce0c106d5c3fec1038c3cca3d55ac320a5be1b44bf15116732d0bc716979a2"},
    {file = "cryptography-42.0.0-cp39-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:324721d93b998cb7367f1e6897370644751e5580ff9b370c0a50dc60a2003513"},
    {file = "cryptography-42.0.0-cp39-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:d97aae66b7de41cdf5b12087b5509e4e9805ed6f562406dfcf60e8481a9a28f8"},
    {file = "cryptography-42.0.0-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:85f759ed59ffd1d0baad296e72780aa62ff8a71f94dc1ab340386a1207d0ea81"},
    {file = "cryptography-42.0.0-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:206aaf42e031b93f86ad60f9f5d9da1b09164f25488238ac1dc488334eb5e221"},
    {file = "cryptography-42.0.0-cp39-abi3-win32.whl", hash = "sha256:74f18a4c8ca04134d2052a140322002fef535c99cdbc2a6afc18a8024d5c9d5b"},
    {file = "cryptography-42.0.0-cp39-abi3-win_amd64.whl", hash = "sha256:14e4b909373bc5bf1095311fa0f7fcabf2d1a160ca13f1e9e467be1ac4cbdf94"},
    {file = "cryptography-42.0.0-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:3005166a39b70c8b94455fdbe78d87a444da31ff70de3331cdec2c568cf25b7e"},
    {file = "cryptography-42.0.0-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:be14b31eb3a293fc6e6aa2807c8a3224c71426f7c4e3639ccf1a2f3ffd6df8c3"},
    {file = "cryptography-42.0.0-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:bd7cf7a8d9f34cc67220f1195884151426ce616fdc8285df9054bfa10135925f"},
    {file = "cryptography-42.0.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:c310767268d88803b653fffe6d6f2f17bb9d49ffceb8d70aed50ad45ea49ab08"},
    {file = "cryptography-42.0.0-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:bdce70e562c69bb089523e75ef1d9625b7417c6297a76ac27b1b8b1eb51b7d0f"},
    {file = "cryptography-42.0.0-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:e9326ca78111e4c645f7e49cbce4ed2f3f85e17b61a563328c85a5208cf34440"},
    {file = "cryptography-42.0.0-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:69fd009a325cad6fbfd5b04c711a4da563c6c4854fc4c9544bff3088387c77c0"},
    {file = "cryptography-42.0.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:988b738f56c665366b1e4bfd9045c3efae89ee366ca3839cd5af53eaa1401bce"},
    {file = "cryptography-42.0.0.tar.gz", hash = "sha256:6cf9b76d6e93c62114bd19485e5cb003115c134cf9ce91f8ac924c44f8c8c3f4"},
    {file = "cryptography-41.0.6-cp37-abi3-macosx_10_12_universal2.whl", hash = "sha256:0f27acb55a4e77b9be8d550d762b0513ef3fc658cd3eb15110ebbcbd626db12c"},
    {file = "cryptography-41.0.6-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:ae236bb8760c1e55b7a39b6d4d32d2279bc6c7c8500b7d5a13b6fb9fc97be35b"},
    {file = "cryptography-41.0.6-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:afda76d84b053923c27ede5edc1ed7d53e3c9f475ebaf63c68e69f1403c405a8"},
    {file = "cryptography-41.0.6-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:da46e2b5df770070412c46f87bac0849b8d685c5f2679771de277a422c7d0b86"},
    {file = "cryptography-41.0.6-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:ff369dd19e8fe0528b02e8df9f2aeb2479f89b1270d90f96a63500afe9af5cae"},
    {file = "cryptography-41.0.6-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:b648fe2a45e426aaee684ddca2632f62ec4613ef362f4d681a9a6283d10e079d"},
    {file = "cryptography-41.0.6-cp37-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:5daeb18e7886a358064a68dbcaf441c036cbdb7da52ae744e7b9207b04d3908c"},
    {file = "cryptography-41.0.6-cp37-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:068bc551698c234742c40049e46840843f3d98ad7ce265fd2bd4ec0d11306596"},
    {file = "cryptography-41.0.6-cp37-abi3-win32.whl", hash = "sha256:2132d5865eea673fe6712c2ed5fb4fa49dba10768bb4cc798345748380ee3660"},
    {file = "cryptography-41.0.6-cp37-abi3-win_amd64.whl", hash = "sha256:48783b7e2bef51224020efb61b42704207dde583d7e371ef8fc2a5fb6c0aabc7"},
    {file = "cryptography-41.0.6-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:8efb2af8d4ba9dbc9c9dd8f04d19a7abb5b49eab1f3694e7b5a16a5fc2856f5c"},
    {file = "cryptography-41.0.6-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:c5a550dc7a3b50b116323e3d376241829fd326ac47bc195e04eb33a8170902a9"},
    {file = "cryptography-41.0.6-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:85abd057699b98fce40b41737afb234fef05c67e116f6f3650782c10862c43da"},
    {file = "cryptography-41.0.6-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:f39812f70fc5c71a15aa3c97b2bbe213c3f2a460b79bd21c40d033bb34a9bf36"},
    {file = "cryptography-41.0.6-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:742ae5e9a2310e9dade7932f9576606836ed174da3c7d26bc3d3ab4bd49b9f65"},
    {file = "cryptography-41.0.6-pp38-pypy38_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:35f3f288e83c3f6f10752467c48919a7a94b7d88cc00b0668372a0d2ad4f8ead"},
    {file = "cryptography-41.0.6-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:4d03186af98b1c01a4eda396b137f29e4e3fb0173e30f885e27acec8823c1b09"},
    {file = "cryptography-41.0.6-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:b27a7fd4229abef715e064269d98a7e2909ebf92eb6912a9603c7e14c181928c"},
    {file = "cryptography-41.0.6-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:398ae1fc711b5eb78e977daa3cbf47cec20f2c08c5da129b7a296055fbb22aed"},
    {file = "cryptography-41.0.6-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:7e00fb556bda398b99b0da289ce7053639d33b572847181d6483ad89835115f6"},
    {file = "cryptography-41.0.6-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:60e746b11b937911dc70d164060d28d273e31853bb359e2b2033c9e93e6f3c43"},
    {file = "cryptography-41.0.6-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:3288acccef021e3c3c10d58933f44e8602cf04dba96d9796d70d537bb2f4bbc4"},
    {file = "cryptography-41.0.6.tar.gz", hash = "sha256:422e3e31d63743855e43e5a6fcc8b4acab860f560f9321b0ee6269cc7ed70cc3"},
]

[package.dependencies]
cffi = {version = ">=1.12", markers = "platform_python_implementation != \"PyPy\""}
cffi = ">=1.12"

[package.extras]
docs = ["sphinx (>=5.3.0)", "sphinx-rtd-theme (>=1.1.1)"]
docstest = ["pyenchant (>=1.6.11)", "readme-renderer", "sphinxcontrib-spelling (>=4.0.1)"]
docstest = ["pyenchant (>=1.6.11)", "sphinxcontrib-spelling (>=4.0.1)", "twine (>=1.12.0)"]
nox = ["nox"]
pep8test = ["check-sdist", "click", "mypy", "ruff"]
pep8test = ["black", "check-sdist", "mypy", "ruff"]
sdist = ["build"]
ssh = ["bcrypt (>=3.1.5)"]
test = ["certifi", "pretend", "pytest (>=6.2.0)", "pytest-benchmark", "pytest-cov", "pytest-xdist"]
test = ["pretend", "pytest (>=6.2.0)", "pytest-benchmark", "pytest-cov", "pytest-xdist"]
test-randomorder = ["pytest-randomly"]

[[package]]

@@ -31,7 +31,6 @@ hyper-tungstenite.workspace = true
hyper.workspace = true
ipnet.workspace = true
itertools.workspace = true
lasso = { workspace = true, features = ["multi-threaded"] }
md5.workspace = true
metrics.workspace = true
once_cell.workspace = true

@@ -93,4 +92,3 @@ rcgen.workspace = true
rstest.workspace = true
tokio-postgres-rustls.workspace = true
walkdir.workspace = true
rand_distr = "0.4"

proxy/src/cache/project_info.rs (84 lines changed)
@@ -12,18 +12,15 @@ use tokio::time::Instant;
use tracing::{debug, info};

use crate::{
    auth::IpPattern,
    config::ProjectInfoCacheOptions,
    console::AuthSecret,
    intern::{EndpointIdInt, ProjectIdInt, RoleNameInt},
    EndpointId, ProjectId, RoleName,
    auth::IpPattern, config::ProjectInfoCacheOptions, console::AuthSecret, EndpointId, ProjectId,
    RoleName,
};

use super::{Cache, Cached};

pub trait ProjectInfoCache {
    fn invalidate_allowed_ips_for_project(&self, project_id: ProjectIdInt);
    fn invalidate_role_secret_for_project(&self, project_id: ProjectIdInt, role_name: RoleNameInt);
    fn invalidate_allowed_ips_for_project(&self, project_id: &ProjectId);
    fn invalidate_role_secret_for_project(&self, project_id: &ProjectId, role_name: &RoleName);
    fn enable_ttl(&self);
    fn disable_ttl(&self);
}

@@ -50,7 +47,7 @@ impl<T> From<T> for Entry<T> {

#[derive(Default)]
struct EndpointInfo {
    secret: std::collections::HashMap<RoleNameInt, Entry<Option<AuthSecret>>>,
    secret: std::collections::HashMap<RoleName, Entry<Option<AuthSecret>>>,
    allowed_ips: Option<Entry<Arc<Vec<IpPattern>>>>,
}

@@ -63,11 +60,11 @@ impl EndpointInfo {
    }
    pub fn get_role_secret(
        &self,
        role_name: RoleNameInt,
        role_name: &RoleName,
        valid_since: Instant,
        ignore_cache_since: Option<Instant>,
    ) -> Option<(Option<AuthSecret>, bool)> {
        if let Some(secret) = self.secret.get(&role_name) {
        if let Some(secret) = self.secret.get(role_name) {
            if valid_since < secret.created_at {
                return Some((
                    secret.value.clone(),
@@ -96,8 +93,8 @@ impl EndpointInfo {
    pub fn invalidate_allowed_ips(&mut self) {
        self.allowed_ips = None;
    }
    pub fn invalidate_role_secret(&mut self, role_name: RoleNameInt) {
        self.secret.remove(&role_name);
    pub fn invalidate_role_secret(&mut self, role_name: &RoleName) {
        self.secret.remove(role_name);
    }
}

@@ -109,9 +106,9 @@ impl EndpointInfo {
/// One may ask, why the data is stored per project, when on the user request there is only data about the endpoint available?
/// On the cplane side updates are done per project (or per branch), so it's easier to invalidate the whole project cache.
pub struct ProjectInfoCacheImpl {
    cache: DashMap<EndpointIdInt, EndpointInfo>,
    cache: DashMap<EndpointId, EndpointInfo>,

    project2ep: DashMap<ProjectIdInt, HashSet<EndpointIdInt>>,
    project2ep: DashMap<ProjectId, HashSet<EndpointId>>,
    config: ProjectInfoCacheOptions,

    start_time: Instant,
@@ -119,11 +116,11 @@ pub struct ProjectInfoCacheImpl {
}

impl ProjectInfoCache for ProjectInfoCacheImpl {
    fn invalidate_allowed_ips_for_project(&self, project_id: ProjectIdInt) {
    fn invalidate_allowed_ips_for_project(&self, project_id: &ProjectId) {
        info!("invalidating allowed ips for project `{}`", project_id);
        let endpoints = self
            .project2ep
            .get(&project_id)
            .get(project_id)
            .map(|kv| kv.value().clone())
            .unwrap_or_default();
        for endpoint_id in endpoints {
@@ -132,14 +129,14 @@ impl ProjectInfoCache for ProjectInfoCacheImpl {
            }
        }
    }
    fn invalidate_role_secret_for_project(&self, project_id: ProjectIdInt, role_name: RoleNameInt) {
    fn invalidate_role_secret_for_project(&self, project_id: &ProjectId, role_name: &RoleName) {
        info!(
            "invalidating role secret for project_id `{}` and role_name `{}`",
            project_id, role_name,
            project_id, role_name
        );
        let endpoints = self
            .project2ep
            .get(&project_id)
            .get(project_id)
            .map(|kv| kv.value().clone())
            .unwrap_or_default();
        for endpoint_id in endpoints {
@@ -176,17 +173,15 @@ impl ProjectInfoCacheImpl {
        endpoint_id: &EndpointId,
        role_name: &RoleName,
    ) -> Option<Cached<&Self, Option<AuthSecret>>> {
        let endpoint_id = EndpointIdInt::get(endpoint_id)?;
        let role_name = RoleNameInt::get(role_name)?;
        let (valid_since, ignore_cache_since) = self.get_cache_times();
        let endpoint_info = self.cache.get(&endpoint_id)?;
        let endpoint_info = self.cache.get(endpoint_id)?;
        let (value, ignore_cache) =
            endpoint_info.get_role_secret(role_name, valid_since, ignore_cache_since)?;
        if !ignore_cache {
            let cached = Cached {
                token: Some((
                    self,
                    CachedLookupInfo::new_role_secret(endpoint_id, role_name),
                    CachedLookupInfo::new_role_secret(endpoint_id.clone(), role_name.clone()),
                )),
                value,
            };
@@ -198,14 +193,13 @@ impl ProjectInfoCacheImpl {
        &self,
        endpoint_id: &EndpointId,
    ) -> Option<Cached<&Self, Arc<Vec<IpPattern>>>> {
        let endpoint_id = EndpointIdInt::get(endpoint_id)?;
        let (valid_since, ignore_cache_since) = self.get_cache_times();
        let endpoint_info = self.cache.get(&endpoint_id)?;
        let endpoint_info = self.cache.get(endpoint_id)?;
        let value = endpoint_info.get_allowed_ips(valid_since, ignore_cache_since);
        let (value, ignore_cache) = value?;
        if !ignore_cache {
            let cached = Cached {
                token: Some((self, CachedLookupInfo::new_allowed_ips(endpoint_id))),
                token: Some((self, CachedLookupInfo::new_allowed_ips(endpoint_id.clone()))),
                value,
            };
            return Some(cached);
@@ -219,17 +213,14 @@ impl ProjectInfoCacheImpl {
        role_name: &RoleName,
        secret: Option<AuthSecret>,
    ) {
        let project_id = ProjectIdInt::from(project_id);
        let endpoint_id = EndpointIdInt::from(endpoint_id);
        let role_name = RoleNameInt::from(role_name);
        if self.cache.len() >= self.config.size {
            // If there are too many entries, wait until the next gc cycle.
            return;
        }
        self.insert_project2endpoint(project_id, endpoint_id);
        let mut entry = self.cache.entry(endpoint_id).or_default();
        self.inser_project2endpoint(project_id, endpoint_id);
        let mut entry = self.cache.entry(endpoint_id.clone()).or_default();
        if entry.secret.len() < self.config.max_roles {
            entry.secret.insert(role_name, secret.into());
            entry.secret.insert(role_name.clone(), secret.into());
        }
    }
    pub fn insert_allowed_ips(
@@ -238,21 +229,22 @@ impl ProjectInfoCacheImpl {
        endpoint_id: &EndpointId,
        allowed_ips: Arc<Vec<IpPattern>>,
    ) {
        let project_id = ProjectIdInt::from(project_id);
        let endpoint_id = EndpointIdInt::from(endpoint_id);
        if self.cache.len() >= self.config.size {
            // If there are too many entries, wait until the next gc cycle.
            return;
        }
        self.insert_project2endpoint(project_id, endpoint_id);
        self.cache.entry(endpoint_id).or_default().allowed_ips = Some(allowed_ips.into());
        self.inser_project2endpoint(project_id, endpoint_id);
        self.cache
            .entry(endpoint_id.clone())
            .or_default()
            .allowed_ips = Some(allowed_ips.into());
    }
    fn insert_project2endpoint(&self, project_id: ProjectIdInt, endpoint_id: EndpointIdInt) {
        if let Some(mut endpoints) = self.project2ep.get_mut(&project_id) {
            endpoints.insert(endpoint_id);
    fn inser_project2endpoint(&self, project_id: &ProjectId, endpoint_id: &EndpointId) {
        if let Some(mut endpoints) = self.project2ep.get_mut(project_id) {
            endpoints.insert(endpoint_id.clone());
        } else {
            self.project2ep
                .insert(project_id, HashSet::from([endpoint_id]));
                .insert(project_id.clone(), HashSet::from([endpoint_id.clone()]));
        }
    }
    fn get_cache_times(&self) -> (Instant, Option<Instant>) {
@@ -308,18 +300,18 @@ impl ProjectInfoCacheImpl {
/// This is used to invalidate cache entries.
pub struct CachedLookupInfo {
    /// Search by this key.
    endpoint_id: EndpointIdInt,
    endpoint_id: EndpointId,
    lookup_type: LookupType,
}

impl CachedLookupInfo {
    pub(self) fn new_role_secret(endpoint_id: EndpointIdInt, role_name: RoleNameInt) -> Self {
    pub(self) fn new_role_secret(endpoint_id: EndpointId, role_name: RoleName) -> Self {
        Self {
            endpoint_id,
            lookup_type: LookupType::RoleSecret(role_name),
        }
    }
    pub(self) fn new_allowed_ips(endpoint_id: EndpointIdInt) -> Self {
    pub(self) fn new_allowed_ips(endpoint_id: EndpointId) -> Self {
        Self {
            endpoint_id,
            lookup_type: LookupType::AllowedIps,
@@ -328,7 +320,7 @@ impl CachedLookupInfo {
}

enum LookupType {
    RoleSecret(RoleNameInt),
    RoleSecret(RoleName),
    AllowedIps,
}

@@ -343,7 +335,7 @@ impl Cache for ProjectInfoCacheImpl {
        match &key.lookup_type {
            LookupType::RoleSecret(role_name) => {
                if let Some(mut endpoint_info) = self.cache.get_mut(&key.endpoint_id) {
                    endpoint_info.invalidate_role_secret(*role_name);
                    endpoint_info.invalidate_role_secret(role_name);
                }
            }
            LookupType::AllowedIps => {
@@ -465,7 +457,7 @@ mod tests {
        assert_eq!(cached.value, secret2);

        // The only way to invalidate this value is to invalidate via the api.
        cache.invalidate_role_secret_for_project((&project_id).into(), (&user2).into());
        cache.invalidate_role_secret_for_project(&project_id, &user2);
        assert!(cache.get_role_secret(&endpoint_id, &user2).is_none());

        let cached = cache.get_allowed_ips(&endpoint_id).unwrap();
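The invalidation model in this diff (updates arrive per project, entries are keyed per endpoint) boils down to a fan-out through `project2ep`. A standalone sketch under simplified, hypothetical types (plain `HashMap` and `String` keys instead of `DashMap` and the interned/owned id types above):

use std::collections::{HashMap, HashSet};

struct FanOut {
    cache: HashMap<String, String>,               // endpoint_id -> cached info
    project2ep: HashMap<String, HashSet<String>>, // project_id -> endpoint ids
}

impl FanOut {
    fn invalidate_project(&mut self, project_id: &str) {
        // Invalidate every endpoint that belongs to the updated project.
        if let Some(endpoints) = self.project2ep.get(project_id) {
            for ep in endpoints {
                self.cache.remove(ep);
            }
        }
    }
}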

@@ -315,11 +315,9 @@ async fn upload_parquet(
        FAILED_UPLOAD_MAX_RETRIES,
        "request_data_upload",
        // we don't want cancellation to interrupt here, so we make a dummy cancel token
        &CancellationToken::new(),
        backoff::Cancel::new(CancellationToken::new(), || anyhow::anyhow!("Cancelled")),
    )
    .await
    .ok_or_else(|| anyhow::anyhow!("Cancelled"))
    .and_then(|x| x)
    .context("request_data_upload")?;

    Ok(buffer.writer())

@@ -1,237 +0,0 @@
use std::{
    hash::BuildHasherDefault, marker::PhantomData, num::NonZeroUsize, ops::Index, sync::OnceLock,
};

use lasso::{Capacity, MemoryLimits, Spur, ThreadedRodeo};
use rustc_hash::FxHasher;

use crate::{BranchId, EndpointId, ProjectId, RoleName};

pub trait InternId: Sized + 'static {
    fn get_interner() -> &'static StringInterner<Self>;
}

pub struct StringInterner<Id> {
    inner: ThreadedRodeo<Spur, BuildHasherDefault<FxHasher>>,
    _id: PhantomData<Id>,
}

#[derive(PartialEq, Debug, Clone, Copy, Eq, Hash)]
pub struct InternedString<Id> {
    inner: Spur,
    _id: PhantomData<Id>,
}

impl<Id: InternId> std::fmt::Display for InternedString<Id> {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        self.as_str().fmt(f)
    }
}

impl<Id: InternId> InternedString<Id> {
    pub fn as_str(&self) -> &'static str {
        Id::get_interner().inner.resolve(&self.inner)
    }
    pub fn get(s: &str) -> Option<Self> {
        Id::get_interner().get(s)
    }
}

impl<Id: InternId> AsRef<str> for InternedString<Id> {
    fn as_ref(&self) -> &str {
        self.as_str()
    }
}

impl<Id: InternId> std::ops::Deref for InternedString<Id> {
    type Target = str;
    fn deref(&self) -> &str {
        self.as_str()
    }
}

impl<'de, Id: InternId> serde::de::Deserialize<'de> for InternedString<Id> {
    fn deserialize<D: serde::de::Deserializer<'de>>(d: D) -> Result<Self, D::Error> {
        struct Visitor<Id>(PhantomData<Id>);
        impl<'de, Id: InternId> serde::de::Visitor<'de> for Visitor<Id> {
            type Value = InternedString<Id>;

            fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
                formatter.write_str("a string")
            }

            fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
            where
                E: serde::de::Error,
            {
                Ok(Id::get_interner().get_or_intern(v))
            }
        }
        d.deserialize_str(Visitor::<Id>(PhantomData))
    }
}

impl<Id: InternId> serde::Serialize for InternedString<Id> {
    fn serialize<S: serde::Serializer>(&self, s: S) -> Result<S::Ok, S::Error> {
        self.as_str().serialize(s)
    }
}

impl<Id: InternId> StringInterner<Id> {
    pub fn new() -> Self {
        StringInterner {
            inner: ThreadedRodeo::with_capacity_memory_limits_and_hasher(
                Capacity::new(2500, NonZeroUsize::new(1 << 16).unwrap()),
                // unbounded
                MemoryLimits::for_memory_usage(usize::MAX),
                BuildHasherDefault::<FxHasher>::default(),
            ),
            _id: PhantomData,
        }
    }

    pub fn is_empty(&self) -> bool {
        self.inner.is_empty()
    }

    pub fn len(&self) -> usize {
        self.inner.len()
    }

    pub fn current_memory_usage(&self) -> usize {
        self.inner.current_memory_usage()
    }

    pub fn get_or_intern(&self, s: &str) -> InternedString<Id> {
        InternedString {
            inner: self.inner.get_or_intern(s),
            _id: PhantomData,
        }
    }

    pub fn get(&self, s: &str) -> Option<InternedString<Id>> {
        Some(InternedString {
            inner: self.inner.get(s)?,
            _id: PhantomData,
        })
    }
}

impl<Id: InternId> Index<InternedString<Id>> for StringInterner<Id> {
    type Output = str;

    fn index(&self, index: InternedString<Id>) -> &Self::Output {
        self.inner.resolve(&index.inner)
    }
}

impl<Id: InternId> Default for StringInterner<Id> {
    fn default() -> Self {
        Self::new()
    }
}

#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
pub struct RoleNameTag;
impl InternId for RoleNameTag {
    fn get_interner() -> &'static StringInterner<Self> {
        pub static ROLE_NAMES: OnceLock<StringInterner<RoleNameTag>> = OnceLock::new();
        ROLE_NAMES.get_or_init(Default::default)
    }
}
pub type RoleNameInt = InternedString<RoleNameTag>;
impl From<&RoleName> for RoleNameInt {
    fn from(value: &RoleName) -> Self {
        RoleNameTag::get_interner().get_or_intern(value)
    }
}

#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
pub struct EndpointIdTag;
impl InternId for EndpointIdTag {
    fn get_interner() -> &'static StringInterner<Self> {
        pub static ROLE_NAMES: OnceLock<StringInterner<EndpointIdTag>> = OnceLock::new();
        ROLE_NAMES.get_or_init(Default::default)
    }
}
pub type EndpointIdInt = InternedString<EndpointIdTag>;
impl From<&EndpointId> for EndpointIdInt {
    fn from(value: &EndpointId) -> Self {
        EndpointIdTag::get_interner().get_or_intern(value)
    }
}

#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
pub struct BranchIdTag;
impl InternId for BranchIdTag {
    fn get_interner() -> &'static StringInterner<Self> {
        pub static ROLE_NAMES: OnceLock<StringInterner<BranchIdTag>> = OnceLock::new();
        ROLE_NAMES.get_or_init(Default::default)
    }
}
pub type BranchIdInt = InternedString<BranchIdTag>;
impl From<&BranchId> for BranchIdInt {
    fn from(value: &BranchId) -> Self {
        BranchIdTag::get_interner().get_or_intern(value)
    }
}

#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
pub struct ProjectIdTag;
impl InternId for ProjectIdTag {
    fn get_interner() -> &'static StringInterner<Self> {
        pub static ROLE_NAMES: OnceLock<StringInterner<ProjectIdTag>> = OnceLock::new();
        ROLE_NAMES.get_or_init(Default::default)
    }
}
pub type ProjectIdInt = InternedString<ProjectIdTag>;
impl From<&ProjectId> for ProjectIdInt {
    fn from(value: &ProjectId) -> Self {
        ProjectIdTag::get_interner().get_or_intern(value)
    }
}

#[cfg(test)]
mod tests {
    use std::sync::OnceLock;

    use crate::intern::StringInterner;

    use super::InternId;

    struct MyId;
    impl InternId for MyId {
        fn get_interner() -> &'static StringInterner<Self> {
            pub static ROLE_NAMES: OnceLock<StringInterner<MyId>> = OnceLock::new();
            ROLE_NAMES.get_or_init(Default::default)
        }
    }

    #[test]
    fn push_many_strings() {
        use rand::{rngs::StdRng, Rng, SeedableRng};
        use rand_distr::Zipf;

        let endpoint_dist = Zipf::new(500000, 0.8).unwrap();
        let endpoints = StdRng::seed_from_u64(272488357).sample_iter(endpoint_dist);

        let interner = MyId::get_interner();

        const N: usize = 100_000;
        let mut verify = Vec::with_capacity(N);
        for endpoint in endpoints.take(N) {
            let endpoint = format!("ep-string-interning-{endpoint}");
            let key = interner.get_or_intern(&endpoint);
            verify.push((endpoint, key));
        }

        for (s, key) in verify {
            assert_eq!(interner[key], s);
        }

        // 2031616/59861 = 34 bytes per string
        assert_eq!(interner.len(), 59_861);
        // will have other overhead for the internal hashmaps that are not accounted for.
        assert_eq!(interner.current_memory_usage(), 2_031_616);
    }
}
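A hedged usage sketch of the interner above (values are placeholders): interning the same string twice yields the same small `Copy` handle, resolution goes through the per-tag static interner, and `get` never interns.

fn intern_demo() {
    let a: RoleNameInt = RoleNameTag::get_interner().get_or_intern("reader");
    let b = RoleNameTag::get_interner().get_or_intern("reader");
    assert_eq!(a, b);                 // same handle for the same string
    assert_eq!(a.as_str(), "reader"); // resolves to a &'static str
    // Lookup without interning returns None for strings never seen before:
    assert!(RoleNameInt::get("never-interned").is_none());
}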

@@ -16,7 +16,6 @@ pub mod console;
pub mod context;
pub mod error;
pub mod http;
pub mod intern;
pub mod jemalloc;
pub mod logging;
pub mod metrics;

@@ -4,10 +4,7 @@ use futures::StreamExt;
use redis::aio::PubSub;
use serde::Deserialize;

use crate::{
    cache::project_info::ProjectInfoCache,
    intern::{ProjectIdInt, RoleNameInt},
};
use crate::{cache::project_info::ProjectInfoCache, ProjectId, RoleName};

const CHANNEL_NAME: &str = "neondb-proxy-ws-updates";
const RECONNECT_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(20);

@@ -48,12 +45,12 @@ enum Notification {
}
#[derive(Clone, Debug, Deserialize, Eq, PartialEq)]
struct AllowedIpsUpdate {
    project_id: ProjectIdInt,
    project_id: ProjectId,
}
#[derive(Clone, Debug, Deserialize, Eq, PartialEq)]
struct PasswordUpdate {
    project_id: ProjectIdInt,
    role_name: RoleNameInt,
    project_id: ProjectId,
    role_name: RoleName,
}
fn deserialize_json_string<'de, D, T>(deserializer: D) -> Result<T, D::Error>
where

@@ -68,11 +65,11 @@ fn invalidate_cache<C: ProjectInfoCache>(cache: Arc<C>, msg: Notification) {
    use Notification::*;
    match msg {
        AllowedIpsUpdate { allowed_ips_update } => {
            cache.invalidate_allowed_ips_for_project(allowed_ips_update.project_id)
            cache.invalidate_allowed_ips_for_project(&allowed_ips_update.project_id)
        }
        PasswordUpdate { password_update } => cache.invalidate_role_secret_for_project(
            password_update.project_id,
            password_update.role_name,
            &password_update.project_id,
            &password_update.role_name,
        ),
    }
}

@@ -144,14 +141,12 @@ where

#[cfg(test)]
mod tests {
    use crate::{ProjectId, RoleName};

    use super::*;
    use serde_json::json;

    #[test]
    fn parse_allowed_ips() -> anyhow::Result<()> {
        let project_id: ProjectId = "new_project".into();
        let project_id = "new_project".to_string();
        let data = format!("{{\"project_id\": \"{project_id}\"}}");
        let text = json!({
            "type": "message",
@@ -166,7 +161,7 @@ mod tests {
            result,
            Notification::AllowedIpsUpdate {
                allowed_ips_update: AllowedIpsUpdate {
                    project_id: (&project_id).into()
                    project_id: project_id.into()
                }
            }
        );
@@ -176,8 +171,8 @@ mod tests {

    #[test]
    fn parse_password_updated() -> anyhow::Result<()> {
        let project_id: ProjectId = "new_project".into();
        let role_name: RoleName = "new_role".into();
        let project_id = "new_project".to_string();
        let role_name = "new_role".to_string();
        let data = format!("{{\"project_id\": \"{project_id}\", \"role_name\": \"{role_name}\"}}");
        let text = json!({
            "type": "message",
@@ -192,8 +187,8 @@ mod tests {
            result,
            Notification::PasswordUpdate {
                password_update: PasswordUpdate {
                    project_id: (&project_id).into(),
                    role_name: (&role_name).into(),
                    project_id: project_id.into(),
                    role_name: role_name.into()
                }
            }
        );

@@ -558,17 +558,16 @@ pub async fn delete_timeline(ttid: &TenantTimelineId) -> Result<()> {
    backoff::retry(
        || async {
            let files = storage.list_files(Some(&remote_path)).await?;
            storage.delete_objects(&files).await
            storage.delete_objects(&files).await?;
            Ok(())
        },
        |_| false,
        3,
        10,
        "executing WAL segments deletion batch",
        &token,
        backoff::Cancel::new(token, || anyhow::anyhow!("canceled")),
    )
    .await
    .ok_or_else(|| anyhow::anyhow!("canceled"))
    .and_then(|x| x)?;
    .await?;

    Ok(())
}
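The shape of the new `backoff::retry` call, as it appears in both call sites changed in this diff, is roughly the following. This is a sketch inferred from those call sites, not an authoritative signature; the function name and argument meanings (warn threshold, max retries) are assumptions.

async fn delete_with_retry(
    storage: &GenericRemoteStorage,
    remote_path: &RemotePath,
    token: CancellationToken,
) -> anyhow::Result<()> {
    backoff::retry(
        || async {
            let files = storage.list_files(Some(remote_path)).await?;
            storage.delete_objects(&files).await?;
            Ok(())
        },
        |_| false, // no error is treated as permanent; keep retrying
        3,         // assumed: warn threshold
        10,        // assumed: max retries
        "deleting remote files",
        // Cancellation is now passed as a Cancel wrapper that maps
        // cancellation to an error, instead of a bare &CancellationToken.
        backoff::Cancel::new(token, || anyhow::anyhow!("canceled")),
    )
    .await
}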

@@ -20,7 +20,7 @@ BENCHMARKS_DURATION_QUERY = """
    FROM results
    WHERE
        started_at > CURRENT_DATE - INTERVAL '%s' day
        AND starts_with(parent_suite, 'test_runner.performance')
        AND parent_suite = 'test_runner.performance'
        AND status = 'passed'
    GROUP BY
        parent_suite, suite, name
@@ -31,75 +31,68 @@ BENCHMARKS_DURATION_QUERY = """
# the total duration varies from 8 to 40 minutes.
# We use some pre-collected durations as a fallback to have a better distribution.
FALLBACK_DURATION = {
    "test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py::test_pageserver_max_throughput_getpage_at_latest_lsn[1-13-30]": 400.15,
    "test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py::test_pageserver_max_throughput_getpage_at_latest_lsn[1-6-30]": 372.521,
    "test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py::test_pageserver_max_throughput_getpage_at_latest_lsn[10-13-30]": 420.017,
    "test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py::test_pageserver_max_throughput_getpage_at_latest_lsn[10-6-30]": 373.769,
    "test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py::test_pageserver_max_throughput_getpage_at_latest_lsn[100-13-30]": 678.742,
    "test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py::test_pageserver_max_throughput_getpage_at_latest_lsn[100-6-30]": 512.135,
    "test_runner/performance/test_branch_creation.py::test_branch_creation_heavy_write[20]": 58.036,
    "test_runner/performance/test_branch_creation.py::test_branch_creation_many_relations": 22.104,
    "test_runner/performance/test_branch_creation.py::test_branch_creation_many[1024]": 126.073,
    "test_runner/performance/test_branching.py::test_compare_child_and_root_pgbench_perf": 25.759,
    "test_runner/performance/test_branching.py::test_compare_child_and_root_read_perf": 6.885,
    "test_runner/performance/test_branching.py::test_compare_child_and_root_write_perf": 8.758,
    "test_runner/performance/test_bulk_insert.py::test_bulk_insert[neon]": 18.275,
    "test_runner/performance/test_bulk_insert.py::test_bulk_insert[vanilla]": 9.533,
    "test_runner/performance/test_bulk_tenant_create.py::test_bulk_tenant_create[1]": 12.09,
    "test_runner/performance/test_bulk_tenant_create.py::test_bulk_tenant_create[10]": 35.145,
    "test_runner/performance/test_bulk_tenant_create.py::test_bulk_tenant_create[5]": 22.28,
    "test_runner/performance/test_bulk_update.py::test_bulk_update[10]": 66.353,
    "test_runner/performance/test_bulk_update.py::test_bulk_update[100]": 75.487,
    "test_runner/performance/test_bulk_update.py::test_bulk_update[50]": 54.142,
    "test_runner/performance/test_compaction.py::test_compaction": 110.715,
    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_ro_with_pgbench_select_only[neon-5-10-100]": 11.68,
    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_ro_with_pgbench_select_only[vanilla-5-10-100]": 16.384,
    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_rw_with_pgbench_default[neon-5-10-100]": 11.315,
    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_rw_with_pgbench_default[vanilla-5-10-100]": 18.783,
    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wal_with_pgbench_default[neon-5-10-100]": 11.647,
    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wal_with_pgbench_default[vanilla-5-10-100]": 17.04,
    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[neon-10-1]": 11.01,
    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[neon-10-10]": 11.902,
    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[vanilla-10-1]": 10.077,
    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[vanilla-10-10]": 10.4,
    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_pgbench_simple_update[neon-5-10-100]": 11.33,
    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_pgbench_simple_update[vanilla-5-10-100]": 16.434,
    "test_runner/performance/test_copy.py::test_copy[neon]": 13.817,
    "test_runner/performance/test_copy.py::test_copy[vanilla]": 11.736,
    "test_runner/performance/test_gc_feedback.py::test_gc_feedback": 575.735,
    "test_runner/performance/test_gist_build.py::test_gist_buffering_build[neon]": 14.868,
    "test_runner/performance/test_gist_build.py::test_gist_buffering_build[vanilla]": 14.393,
    "test_runner/performance/test_latency.py::test_measure_read_latency_heavy_write_workload[neon-1]": 20.588,
    "test_runner/performance/test_latency.py::test_measure_read_latency_heavy_write_workload[vanilla-1]": 30.849,
    "test_runner/performance/test_layer_map.py::test_layer_map": 39.378,
    "test_runner/performance/test_lazy_startup.py::test_lazy_startup": 2848.938,
    "test_runner/performance/test_logical_replication.py::test_logical_replication": 120.952,
    "test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_different_tables[neon]": 35.552,
    "test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_different_tables[vanilla]": 66.762,
    "test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_same_table[neon]": 85.177,
    "test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_same_table[vanilla]": 92.12,
    "test_runner/performance/test_perf_pgbench.py::test_pgbench[neon-45-10]": 107.009,
    "test_runner/performance/test_perf_pgbench.py::test_pgbench[vanilla-45-10]": 99.582,
    "test_runner/performance/test_random_writes.py::test_random_writes[neon]": 4.737,
    "test_runner/performance/test_random_writes.py::test_random_writes[vanilla]": 2.686,
    "test_runner/performance/test_seqscans.py::test_seqscans[neon-100000-100-0]": 3.271,
    "test_runner/performance/test_seqscans.py::test_seqscans[neon-10000000-1-0]": 50.719,
    "test_runner/performance/test_seqscans.py::test_seqscans[neon-10000000-1-4]": 15.992,
    "test_runner/performance/test_seqscans.py::test_seqscans[vanilla-100000-100-0]": 0.566,
    "test_runner/performance/test_seqscans.py::test_seqscans[vanilla-10000000-1-0]": 13.542,
    "test_runner/performance/test_seqscans.py::test_seqscans[vanilla-10000000-1-4]": 13.35,
    "test_runner/performance/test_startup.py::test_startup_simple": 13.043,
    "test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[neon_off-10-5-5]": 194.841,
    "test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[neon_on-10-5-5]": 286.667,
    "test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[vanilla-10-5-5]": 85.577,
    "test_runner/performance/test_wal_backpressure.py::test_pgbench_intensive_init_workload[neon_off-1000]": 297.626,
    "test_runner/performance/test_wal_backpressure.py::test_pgbench_intensive_init_workload[neon_on-1000]": 646.187,
    "test_runner/performance/test_wal_backpressure.py::test_pgbench_intensive_init_workload[vanilla-1000]": 989.776,
    "test_runner/performance/test_wal_backpressure.py::test_pgbench_simple_update_workload[neon_off-45-100]": 125.638,
    "test_runner/performance/test_wal_backpressure.py::test_pgbench_simple_update_workload[neon_on-45-100]": 123.554,
    "test_runner/performance/test_wal_backpressure.py::test_pgbench_simple_update_workload[vanilla-45-100]": 190.083,
    "test_runner/performance/test_write_amplification.py::test_write_amplification[neon]": 21.016,
    "test_runner/performance/test_write_amplification.py::test_write_amplification[vanilla]": 23.028,
    "test_runner/performance/test_branch_creation.py::test_branch_creation_heavy_write[20]": 62.144,
    "test_runner/performance/test_branch_creation.py::test_branch_creation_many[1024]": 90.941,
    "test_runner/performance/test_branch_creation.py::test_branch_creation_many_relations": 26.053,
    "test_runner/performance/test_branching.py::test_compare_child_and_root_pgbench_perf": 25.67,
    "test_runner/performance/test_branching.py::test_compare_child_and_root_read_perf": 14.497,
    "test_runner/performance/test_branching.py::test_compare_child_and_root_write_perf": 18.852,
    "test_runner/performance/test_bulk_insert.py::test_bulk_insert[neon]": 26.572,
    "test_runner/performance/test_bulk_insert.py::test_bulk_insert[vanilla]": 6.259,
    "test_runner/performance/test_bulk_tenant_create.py::test_bulk_tenant_create[10]": 21.206,
    "test_runner/performance/test_bulk_tenant_create.py::test_bulk_tenant_create[1]": 3.474,
    "test_runner/performance/test_bulk_tenant_create.py::test_bulk_tenant_create[5]": 11.262,
    "test_runner/performance/test_bulk_update.py::test_bulk_update[100]": 94.225,
    "test_runner/performance/test_bulk_update.py::test_bulk_update[10]": 68.159,
    "test_runner/performance/test_bulk_update.py::test_bulk_update[50]": 76.719,
    "test_runner/performance/test_compaction.py::test_compaction": 110.222,
    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_ro_with_pgbench_select_only[neon-5-10-100]": 10.743,
    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_ro_with_pgbench_select_only[vanilla-5-10-100]": 16.541,
    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_rw_with_pgbench_default[neon-5-10-100]": 11.109,
    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_rw_with_pgbench_default[vanilla-5-10-100]": 18.121,
    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wal_with_pgbench_default[neon-5-10-100]": 11.3,
    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wal_with_pgbench_default[vanilla-5-10-100]": 16.086,
    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[neon-10-10]": 12.024,
    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[neon-10-1]": 11.14,
    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[vanilla-10-10]": 10.375,
    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_heavy_write[vanilla-10-1]": 10.075,
    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_pgbench_simple_update[neon-5-10-100]": 11.147,
    "test_runner/performance/test_compare_pg_stats.py::test_compare_pg_stats_wo_with_pgbench_simple_update[vanilla-5-10-100]": 16.321,
    "test_runner/performance/test_copy.py::test_copy[neon]": 16.579,
    "test_runner/performance/test_copy.py::test_copy[vanilla]": 10.094,
    "test_runner/performance/test_gc_feedback.py::test_gc_feedback": 590.157,
    "test_runner/performance/test_gist_build.py::test_gist_buffering_build[neon]": 14.102,
    "test_runner/performance/test_gist_build.py::test_gist_buffering_build[vanilla]": 8.677,
    "test_runner/performance/test_latency.py::test_measure_read_latency_heavy_write_workload[neon-1]": 31.079,
    "test_runner/performance/test_latency.py::test_measure_read_latency_heavy_write_workload[vanilla-1]": 38.119,
    "test_runner/performance/test_layer_map.py::test_layer_map": 24.784,
    "test_runner/performance/test_logical_replication.py::test_logical_replication": 117.707,
    "test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_different_tables[neon]": 21.194,
    "test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_different_tables[vanilla]": 59.068,
    "test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_same_table[neon]": 73.235,
    "test_runner/performance/test_parallel_copy_to.py::test_parallel_copy_same_table[vanilla]": 82.586,
    "test_runner/performance/test_perf_pgbench.py::test_pgbench[neon-45-10]": 106.536,
    "test_runner/performance/test_perf_pgbench.py::test_pgbench[vanilla-45-10]": 98.753,
    "test_runner/performance/test_random_writes.py::test_random_writes[neon]": 6.975,
    "test_runner/performance/test_random_writes.py::test_random_writes[vanilla]": 3.69,
    "test_runner/performance/test_seqscans.py::test_seqscans[neon-100000-100-0]": 3.529,
    "test_runner/performance/test_seqscans.py::test_seqscans[neon-10000000-1-0]": 64.522,
    "test_runner/performance/test_seqscans.py::test_seqscans[neon-10000000-1-4]": 40.964,
    "test_runner/performance/test_seqscans.py::test_seqscans[vanilla-100000-100-0]": 0.55,
    "test_runner/performance/test_seqscans.py::test_seqscans[vanilla-10000000-1-0]": 12.189,
    "test_runner/performance/test_seqscans.py::test_seqscans[vanilla-10000000-1-4]": 13.899,
    "test_runner/performance/test_startup.py::test_startup_simple": 2.51,
    "test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[neon_off-10-5-5]": 527.245,
    "test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[neon_on-10-5-5]": 583.46,
    "test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[vanilla-10-5-5]": 113.653,
    "test_runner/performance/test_wal_backpressure.py::test_pgbench_intensive_init_workload[neon_off-1000]": 233.728,
    "test_runner/performance/test_wal_backpressure.py::test_pgbench_intensive_init_workload[neon_on-1000]": 419.093,
    "test_runner/performance/test_wal_backpressure.py::test_pgbench_intensive_init_workload[vanilla-1000]": 982.461,
    "test_runner/performance/test_wal_backpressure.py::test_pgbench_simple_update_workload[neon_off-45-100]": 116.522,
    "test_runner/performance/test_wal_backpressure.py::test_pgbench_simple_update_workload[neon_on-45-100]": 115.583,
    "test_runner/performance/test_wal_backpressure.py::test_pgbench_simple_update_workload[vanilla-45-100]": 155.282,
    "test_runner/performance/test_write_amplification.py::test_write_amplification[neon]": 26.704,
    "test_runner/performance/test_write_amplification.py::test_write_amplification[vanilla]": 16.088,
}

@@ -8,3 +8,17 @@ SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
echo "Uploading perf report to neon pg"
# ingest per-test result data into the neon-backed postgres running in staging to build grafana reports on that data
DATABASE_URL="$PERF_TEST_RESULT_CONNSTR" poetry run python "$SCRIPT_DIR"/ingest_perf_test_result.py --ingest "$REPORT_FROM"

# Activate poetry's venv. Needed because git-upload does not run in the project dir (it uses a tmp dir to store the repository),
# so poetry cannot find pyproject.toml in the temp dir created by git-upload.
# shellcheck source=/dev/null
. "$(poetry env info --path)"/bin/activate

echo "Uploading perf result to zenith-perf-data"
scripts/git-upload \
    --repo=https://"$VIP_VAP_ACCESS_TOKEN"@github.com/neondatabase/zenith-perf-data.git \
    --message="add performance test result for $GITHUB_SHA neon revision" \
    --branch=master \
    copy "$REPORT_FROM" "data/$REPORT_TO" `# COPY FROM TO_RELATIVE`\
    --merge \
    --run-cmd "python $SCRIPT_DIR/generate_perf_report_page.py --input-dir data/$REPORT_TO --out reports/$REPORT_TO.html"

scripts/generate_perf_report_page.py (executable file, 219 lines added)
@@ -0,0 +1,219 @@
|
||||
#!/usr/bin/env python3
|
||||
import argparse
|
||||
import json
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional, Tuple, cast
|
||||
|
||||
from jinja2 import Template
|
||||
|
||||
# skip 'input' columns. They are included in the header and just blow the table
|
||||
EXCLUDE_COLUMNS = frozenset(
|
||||
{
|
||||
"scale",
|
||||
"duration",
|
||||
"number_of_clients",
|
||||
"number_of_threads",
|
||||
"init_start_timestamp",
|
||||
"init_end_timestamp",
|
||||
"run_start_timestamp",
|
||||
"run_end_timestamp",
|
||||
}
|
||||
)
|
||||
|
||||
KEY_EXCLUDE_FIELDS = frozenset(
|
||||
{
|
||||
"init_start_timestamp",
|
||||
"init_end_timestamp",
|
||||
"run_start_timestamp",
|
||||
"run_end_timestamp",
|
||||
}
|
||||
)
|
||||
NEGATIVE_COLOR = "negative"
|
||||
POSITIVE_COLOR = "positive"
|
||||
EPS = 1e-6
|
||||
|
||||
|
||||
@dataclass
class SuitRun:
    revision: str
    values: Dict[str, Any]


@dataclass
class SuitRuns:
    platform: str
    suit: str
    common_columns: List[Tuple[str, str]]
    value_columns: List[str]
    runs: List[SuitRun]


@dataclass
class RowValue:
    value: str
    color: str
    ratio: str


def get_columns(values: List[Dict[Any, Any]]) -> Tuple[List[Tuple[str, str]], List[str]]:
    value_columns = []
    common_columns = []
    for item in values:
        if item["name"] in KEY_EXCLUDE_FIELDS:
            continue
        if item["report"] != "test_param":
            value_columns.append(cast(str, item["name"]))
        else:
            common_columns.append((cast(str, item["name"]), cast(str, item["value"])))
    value_columns.sort()
    common_columns.sort(key=lambda x: x[0])  # sort by name
    return common_columns, value_columns


def format_ratio(ratio: float, report: str) -> Tuple[str, str]:
    color = ""
    sign = "+" if ratio > 0 else ""
    if abs(ratio) < 0.05:
        return f" ({sign}{ratio:.2f})", color

    if report not in {"test_param", "higher_is_better", "lower_is_better"}:
        raise ValueError(f"Unknown report type: {report}")

    if report == "test_param":
        return f"{ratio:.2f}", color

    if ratio > 0:
        if report == "higher_is_better":
            color = POSITIVE_COLOR
        elif report == "lower_is_better":
            color = NEGATIVE_COLOR
    elif ratio < 0:
        if report == "higher_is_better":
            color = NEGATIVE_COLOR
        elif report == "lower_is_better":
            color = POSITIVE_COLOR

    return f" ({sign}{ratio:.2f})", color


def extract_value(name: str, suit_run: SuitRun) -> Optional[Dict[str, Any]]:
    for item in suit_run.values["data"]:
        if item["name"] == name:
            return cast(Dict[str, Any], item)
    return None

def get_row_values(
    columns: List[str], run_result: SuitRun, prev_result: Optional[SuitRun]
) -> List[RowValue]:
    row_values = []
    for column in columns:
        current_value = extract_value(column, run_result)
        if current_value is None:
            # should never happen
            raise ValueError(f"{column} not found in {run_result.values}")

        value = current_value["value"]
        if isinstance(value, float):
            value = f"{value:.2f}"

        if prev_result is None:
            row_values.append(RowValue(value, "", ""))
            continue

        prev_value = extract_value(column, prev_result)
        if prev_value is None:
            # This can happen when a new metric is added and there is no value for it
            # in the previous run. Leave the error here for now; TODO: add proper handling
            # when this actually happens.
            raise ValueError(f"{column} not found in previous result")
        # add `EPS` to each term to avoid a ZeroDivisionError when the denominator is zero
        ratio = (float(value) + EPS) / (float(prev_value["value"]) + EPS) - 1
        ratio_display, color = format_ratio(ratio, current_value["report"])
        row_values.append(RowValue(value, color, ratio_display))
    return row_values


@dataclass
class SuiteRunTableRow:
    revision: str
    values: List[RowValue]


def prepare_rows_from_runs(value_columns: List[str], runs: List[SuitRun]) -> List[SuiteRunTableRow]:
    rows = []
    prev_run = None
    for run in runs:
        rows.append(
            SuiteRunTableRow(
                revision=run.revision, values=get_row_values(value_columns, run, prev_run)
            )
        )
        prev_run = run

    return rows

def main(args: argparse.Namespace) -> None:
    input_dir = Path(args.input_dir)
    grouped_runs: Dict[str, SuitRuns] = {}
    # Input files are named <ctr>_<rev>.json. Fill the hashmap so that runs of the same
    # configuration (scale, duration, etc.) are grouped together, ordered by counter.
    for item in sorted(input_dir.iterdir(), key=lambda x: int(x.name.split("_")[0])):
        run_data = json.loads(item.read_text())
        revision = run_data["revision"]

        for suit_result in run_data["result"]:
            key = "{}{}".format(run_data["platform"], suit_result["suit"])
            # pack total duration as a synthetic value
            total_duration = suit_result["total_duration"]
            suit_result["data"].append(
                {
                    "name": "total_duration",
                    "value": total_duration,
                    "unit": "s",
                    "report": "lower_is_better",
                }
            )
            common_columns, value_columns = get_columns(suit_result["data"])

            grouped_runs.setdefault(
                key,
                SuitRuns(
                    platform=run_data["platform"],
                    suit=suit_result["suit"],
                    common_columns=common_columns,
                    value_columns=value_columns,
                    runs=[],
                ),
            )

            grouped_runs[key].runs.append(SuitRun(revision=revision, values=suit_result))
    context = {}
    for result in grouped_runs.values():
        suit = result.suit
        context[suit] = {
            "common_columns": result.common_columns,
            "value_columns": result.value_columns,
            "platform": result.platform,
            # reverse the order so the newest results are at the top of the table
            "rows": reversed(prepare_rows_from_runs(result.value_columns, result.runs)),
        }

    template = Template((Path(__file__).parent / "perf_report_template.html").read_text())

    Path(args.out).write_text(template.render(context=context))


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--input-dir",
        dest="input_dir",
        required=True,
        help="Directory with jsons generated by the test suite",
    )
    parser.add_argument("--out", required=True, help="Output html file path")
    args = parser.parse_args()
    main(args)
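As a quick sanity check of the ratio logic above, here is how a lower_is_better metric that drops from 100.0 to 90.0 between runs comes out (a minimal sketch reusing the definitions from the script):

EPS = 1e-6

# Computed against the previous run, exactly as in get_row_values():
prev, curr = 100.0, 90.0
ratio = (curr + EPS) / (prev + EPS) - 1  # ~ -0.10, i.e. 10% faster

# format_ratio() renders this as " (-0.10)" with the "positive" CSS class,
# since for lower_is_better a negative ratio is an improvement.
sign = "+" if ratio > 0 else ""
print(f" ({sign}{ratio:.2f})")  # " (-0.10)"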
170
scripts/git-upload
Executable file
@@ -0,0 +1,170 @@
#!/usr/bin/env python3

import argparse
import os
import shlex
import shutil
import subprocess
import sys
import textwrap
from contextlib import contextmanager
from distutils.dir_util import copy_tree
from pathlib import Path
from tempfile import TemporaryDirectory
from typing import Optional


def absolute_path(path):
    return Path(path).resolve()


def relative_path(path):
    path = Path(path)
    if path.is_absolute():
        raise Exception(f'path `{path}` must be relative!')
    return path


@contextmanager
def chdir(cwd: Path):
    old = os.getcwd()
    os.chdir(cwd)
    try:
        yield cwd
    finally:
        os.chdir(old)

def run(cmd, *args, **kwargs):
    print('$', ' '.join(cmd))
    subprocess.check_call(cmd, *args, **kwargs)


class GitRepo:
    def __init__(self, url, branch: Optional[str] = None):
        self.url = url
        self.cwd = TemporaryDirectory()
        self.branch = branch

        args = [
            'git',
            'clone',
            '--single-branch',
        ]
        if self.branch:
            args.extend(['--branch', self.branch])

        subprocess.check_call([
            *args,
            str(url),
            self.cwd.name,
        ])

    def is_dirty(self):
        res = subprocess.check_output(['git', 'status', '--porcelain'], text=True).strip()
        return bool(res)

    def update(self, message, action, branch=None):
        with chdir(self.cwd.name):
            if not branch:
                cmd = ['git', 'branch', '--show-current']
                branch = subprocess.check_output(cmd, text=True).strip()

            # Run action in the repo's directory
            action()

            run(['git', 'add', '.'])

            if not self.is_dirty():
                print('No changes detected, quitting')
                return

            git_with_user = [
                'git',
                '-c',
                'user.name=vipvap',
                '-c',
                'user.email=vipvap@zenith.tech',
            ]
            run(git_with_user + [
                'commit',
                '--author="vipvap <vipvap@zenith.tech>"',
                f'--message={message}',
            ])

            for _ in range(5):
                try:
                    run(['git', 'fetch', 'origin', branch])
                    run(git_with_user + ['rebase', f'origin/{branch}'])
                    run(['git', 'push', 'origin', branch])
                    return

                except subprocess.CalledProcessError as e:
                    print(f'failed to update branch `{branch}`: {e}', file=sys.stderr)

            raise Exception(f'failed to update branch `{branch}`')

def do_copy(args):
    src = args.src
    dst = args.dst

    if args.forbid_overwrite and dst.exists():
        raise FileExistsError(f"File exists: '{dst}'")

    if src.is_dir():
        if not args.merge:
            shutil.rmtree(dst, ignore_errors=True)
        # distutils is deprecated, but this is a temporary workaround until the Python version
        # bump: we need dirs_exist_ok=True from shutil.copytree, which is only available on 3.8+.
        copy_tree(str(src), str(dst))
    else:
        shutil.copy(src, dst)

    if args.run_cmd:
        run(shlex.split(args.run_cmd))

def main():
    parser = argparse.ArgumentParser(description='Git upload tool')
    parser.add_argument('--repo', type=str, metavar='URL', required=True, help='git repo url')
    parser.add_argument('--message', type=str, metavar='TEXT', help='commit message')
    parser.add_argument('--branch', type=str, metavar='TEXT', help='target git repo branch')

    commands = parser.add_subparsers(title='commands', dest='subparser_name')

    p_copy = commands.add_parser(
        'copy',
        help='copy file into the repo',
        formatter_class=argparse.RawTextHelpFormatter,
    )
    p_copy.add_argument('src', type=absolute_path, help='source path')
    p_copy.add_argument('dst', type=relative_path, help='relative dest path')
    p_copy.add_argument('--forbid-overwrite', action='store_true', help='do not allow overwrites')
    p_copy.add_argument(
        '--merge',
        action='store_true',
        help='when copying a directory do not delete existing data, but add new files')
    p_copy.add_argument('--run-cmd',
                        help=textwrap.dedent('''\
                            run arbitrary cmd on top of copied files,
                            example usage is static content generation
                            based on current repository state\
                            '''))

    args = parser.parse_args()

    commands = {
        'copy': do_copy,
    }

    action = commands.get(args.subparser_name)
    if action:
        message = args.message or 'update'
        GitRepo(args.repo, args.branch).update(message, lambda: action(args))
    else:
        parser.print_usage()


if __name__ == '__main__':
    main()
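The copy_tree call in do_copy is, as its comment says, a stopgap: once Python >= 3.8 is guaranteed, the same merge-copy is available in the standard library. A minimal sketch of the eventual replacement (not part of this diff):

import shutil

# Equivalent of copy_tree(str(src), str(dst)): dirs_exist_ok=True merges into an
# existing destination directory instead of failing when it already exists.
shutil.copytree(src, dst, dirs_exist_ok=True)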
@@ -96,5 +96,5 @@ PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = (
    "pageserver_evictions_total",
    "pageserver_evictions_with_low_residence_duration_total",
    *PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS,
    # "pageserver_broken_tenants_count" -- used only for broken
    # pageserver_broken_tenants_count is a leaked "metric" which is "cleared" on restart or reload
)
@@ -1162,8 +1162,7 @@ class NeonEnv:
        to the attachment service.
        """
        meta = self.attachment_service.inspect(tenant_id)
        if meta is None:
            return None
        assert meta is not None, f"{tenant_id} attachment location not found"
        pageserver_id = meta[1]
        return self.get_pageserver(pageserver_id)

@@ -63,7 +63,8 @@ def test_lazy_startup(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchm
    $$ LANGUAGE plpgsql
    """
    )
    endpoint.safe_psql("SET statement_timeout=0; call updating()")
    endpoint.safe_psql("SET statement_timeout=0")
    endpoint.safe_psql("call updating()")

    endpoint.stop()

@@ -17,7 +17,7 @@ from fixtures.neon_fixtures import (
from fixtures.pageserver.http import PageserverHttpClient
from fixtures.pageserver.utils import wait_for_upload_queue_empty
from fixtures.remote_storage import RemoteStorageKind
from fixtures.types import Lsn, TenantId, TimelineId
from fixtures.types import Lsn, TenantId, TenantShardId, TimelineId
from fixtures.utils import wait_until

GLOBAL_LRU_LOG_LINE = "tenant_min_resident_size-respecting LRU would not relieve pressure, evicting more following global LRU policy"
@@ -194,10 +194,8 @@ class EvictionEnv:

        # We now do initial logical size calculation on startup, which on debug builds can fight with disk-usage-based eviction.
        for tenant_id, timeline_id in self.timelines:
            tenant_ps = self.neon_env.get_tenant_pageserver(tenant_id)
            # Pageserver may be None if we are currently not attached anywhere, e.g. during the secondary eviction test
            if tenant_ps is not None:
                tenant_ps.http_client().timeline_wait_logical_size(tenant_id, timeline_id)
            pageserver_http = self.neon_env.get_tenant_pageserver(tenant_id).http_client()
            pageserver_http.timeline_wait_logical_size(tenant_id, timeline_id)

        def statvfs_called():
            assert pageserver.log_contains(".*running mocked statvfs.*")
@@ -866,18 +864,18 @@ def test_secondary_mode_eviction(eviction_env_ha: EvictionEnv):

    # Set up a situation where one pageserver _only_ has secondary locations on it,
    # so that when we release space we are sure it is via secondary locations.
    log.info("Setting up secondary locations...")

    log.info("Setting up secondary location...")
    ps_attached = env.neon_env.pageservers[0]
    ps_secondary = env.neon_env.pageservers[1]
    for tenant_id in tenant_ids:
        # Find where it is attached
        pageserver = env.neon_env.get_tenant_pageserver(tenant_id)
        pageserver.http_client().tenant_heatmap_upload(tenant_id)
        # Migrate all attached tenants to the same pageserver, so that all the secondaries
        # will run on the other pageserver. This is necessary because when we create tenants,
        # they are spread over pageservers by default.
        env.neon_env.attachment_service.tenant_shard_migrate(
            TenantShardId(tenant_id, 0, 0), ps_attached.id
        )

        # Detach it
        pageserver.tenant_detach(tenant_id)

        # Create a secondary-mode location for each tenant, putting them all on the one
        # pageserver that holds only secondary locations: that is where we will exercise
        # disk-usage eviction.
        ps_secondary.tenant_location_configure(
            tenant_id,
            {
@@ -889,8 +887,8 @@ def test_secondary_mode_eviction(eviction_env_ha: EvictionEnv):
    readback_conf = ps_secondary.read_tenant_location_conf(tenant_id)
    log.info(f"Read back conf: {readback_conf}")

    # Request secondary location to download all layers that the attached location indicated
    # in its heatmap
    # Request secondary location to download all layers that the attached location has
    ps_attached.http_client().tenant_heatmap_upload(tenant_id)
    ps_secondary.http_client().tenant_secondary_download(tenant_id)

    # Configure the secondary pageserver to have a phony small disk size

@@ -35,11 +35,6 @@ def test_sharding_service_smoke(
    neon_env_builder.num_pageservers = 3
    env = neon_env_builder.init_configs()

    for pageserver in env.pageservers:
        # This test detaches tenants during migration, which can race with deletion queue
        # operations; during detach we only do an advisory flush, we don't wait for it.
        pageserver.allowed_errors.extend([".*Dropped remote consistent LSN updates.*"])

    # Start services by hand so that we can skip a pageserver (this will start + register later)
    env.broker.try_start()
    env.attachment_service.start()
@@ -145,13 +140,6 @@ def test_sharding_service_passthrough(
    timelines = client.timeline_list(tenant_id=env.initial_tenant)
    assert len(timelines) == 1

    status = client.tenant_status(env.initial_tenant)
    assert TenantId(status["id"]) == env.initial_tenant
    assert set(TimelineId(t) for t in status["timelines"]) == {
        env.initial_timeline,
    }
    assert status["state"]["slug"] == "Active"


def test_sharding_service_restart(neon_env_builder: NeonEnvBuilder):
    env = neon_env_builder.init_start()

@@ -742,6 +742,8 @@ def ensure_test_data(data_id: int, data: str, endpoint: Endpoint):
def test_metrics_while_ignoring_broken_tenant_and_reloading(
    neon_env_builder: NeonEnvBuilder,
):
    neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)

    env = neon_env_builder.init_start()

    client = env.pageserver.http_client()
@@ -759,37 +761,56 @@ def test_metrics_while_ignoring_broken_tenant_and_reloading(

    client.tenant_break(env.initial_tenant)

    def found_broken():
    found_broken = False
    active, broken, broken_set = ([], [], [])
    for _ in range(10):
        m = client.get_metrics()
        active = m.query_all("pageserver_tenant_states_count", {"state": "Active"})
        broken = m.query_all("pageserver_tenant_states_count", {"state": "Broken"})
        broken_set = m.query_all(
            "pageserver_broken_tenants_count", {"tenant_id": str(env.initial_tenant)}
        )
        assert only_int(active) == 0 and only_int(broken) == 1 and only_int(broken_set) == 1
        found_broken = only_int(active) == 0 and only_int(broken) == 1 and only_int(broken_set) == 1

    wait_until(10, 0.5, found_broken)
        if found_broken:
            break
        log.info(f"active: {active}, broken: {broken}, broken_set: {broken_set}")
        time.sleep(0.5)
    assert (
        found_broken
    ), f"tenant shows up as broken; active={active}, broken={broken}, broken_set={broken_set}"

    client.tenant_ignore(env.initial_tenant)

    def found_cleaned_up():
    found_broken = False
    broken, broken_set = ([], [])
    for _ in range(10):
        m = client.get_metrics()
        broken = m.query_all("pageserver_tenant_states_count", {"state": "Broken"})
        broken_set = m.query_all(
            "pageserver_broken_tenants_count", {"tenant_id": str(env.initial_tenant)}
        )
        assert only_int(broken) == 0 and len(broken_set) == 0
        found_broken = only_int(broken) == 0 and only_int(broken_set) == 1

    wait_until(10, 0.5, found_cleaned_up)
        if found_broken:
            break
        time.sleep(0.5)
    assert found_broken, f"broken should still be in set, but it is not in the tenant state count: broken={broken}, broken_set={broken_set}"

    env.pageserver.tenant_load(env.initial_tenant)

    def found_active():
    found_active = False
    active, broken_set = ([], [])
    for _ in range(10):
        m = client.get_metrics()
        active = m.query_all("pageserver_tenant_states_count", {"state": "Active"})
        broken_set = m.query_all(
            "pageserver_broken_tenants_count", {"tenant_id": str(env.initial_tenant)}
        )
        assert only_int(active) == 1 and len(broken_set) == 0
        found_active = only_int(active) == 1 and len(broken_set) == 0

    wait_until(10, 0.5, found_active)
        if found_active:
            break
        time.sleep(0.5)

    assert found_active, f"reloaded tenant should be active, and broken tenant set item removed: active={active}, broken_set={broken_set}"
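The rewrite above replaces wait_until callbacks with explicit poll loops so the last-seen metric values can be logged along the way and quoted in the failure message. The shared pattern, as a standalone sketch (the helper name and shape here are illustrative, not from the test suite):

import time

def poll_until(check, attempts=10, interval=0.5):
    # Illustrative helper: retry `check` until it reports success or attempts run out,
    # keeping the last observation around for a useful assertion message.
    ok, last = False, None
    for _ in range(attempts):
        ok, last = check()
        if ok:
            break
        time.sleep(interval)
    assert ok, f"condition not met; last observation: {last}"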
@@ -883,7 +883,7 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder):
    # Deletion itself won't complete due to our failpoint: Tenant::shutdown can't finish while
    # logical size calculation is paused in a failpoint. So instead we use a log observation to
    # check that on-demand activation was triggered by the tenant deletion.
    log_match = f".*attach{{tenant_id={delete_tenant_id} shard_id=0000 gen=[0-9a-f]+}}: Activating tenant \\(on-demand\\).*"
    log_match = f".*attach{{tenant_id={delete_tenant_id} shard_id=0000}}: Activating tenant \\(on-demand\\).*"

    def activated_on_demand():
        assert env.pageserver.log_contains(log_match) is not None

@@ -39,8 +39,7 @@ futures-io = { version = "0.3" }
futures-sink = { version = "0.3" }
futures-util = { version = "0.3", features = ["channel", "io", "sink"] }
getrandom = { version = "0.2", default-features = false, features = ["std"] }
hashbrown-582f2526e08bb6a0 = { package = "hashbrown", version = "0.14", default-features = false, features = ["raw"] }
hashbrown-594e8ee84c453af0 = { package = "hashbrown", version = "0.13", features = ["raw"] }
hashbrown = { version = "0.14", default-features = false, features = ["raw"] }
hex = { version = "0.4", features = ["serde"] }
hmac = { version = "0.12", default-features = false, features = ["reset"] }
hyper = { version = "0.14", features = ["full"] }
@@ -92,7 +91,7 @@ cc = { version = "1", default-features = false, features = ["parallel"] }
chrono = { version = "0.4", default-features = false, features = ["clock", "serde", "wasmbind"] }
either = { version = "1" }
getrandom = { version = "0.2", default-features = false, features = ["std"] }
hashbrown-582f2526e08bb6a0 = { package = "hashbrown", version = "0.14", default-features = false, features = ["raw"] }
hashbrown = { version = "0.14", default-features = false, features = ["raw"] }
indexmap = { version = "1", default-features = false, features = ["std"] }
itertools = { version = "0.10" }
libc = { version = "0.2", features = ["extra_traits", "use_std"] }