mirror of
https://github.com/neondatabase/neon.git
synced 2026-02-14 08:00:38 +00:00
Compare commits
48 Commits
getpage_ls
...
jcsp/storc
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
b0e3edda2e | ||
|
|
1117b0f429 | ||
|
|
c8379f0128 | ||
|
|
ee9ec26808 | ||
|
|
e22c072064 | ||
|
|
89f023e6b0 | ||
|
|
8426fb886b | ||
|
|
28e7fa98c4 | ||
|
|
a9fda8c832 | ||
|
|
fa12d60237 | ||
|
|
d551bfee09 | ||
|
|
e69ff3fc00 | ||
|
|
25d9dc6eaf | ||
|
|
139d1346d5 | ||
|
|
0bd16182f7 | ||
|
|
6a5650d40c | ||
|
|
47addc15f1 | ||
|
|
b91c58a8bf | ||
|
|
00d9c2d9a8 | ||
|
|
3a673dce67 | ||
|
|
35e9fb360b | ||
|
|
0d21187322 | ||
|
|
e8a98adcd0 | ||
|
|
98be8b9430 | ||
|
|
6eb946e2de | ||
|
|
681a04d287 | ||
|
|
3df67bf4d7 | ||
|
|
0d8e68003a | ||
|
|
637ad4a638 | ||
|
|
8d0f701767 | ||
|
|
5191f6ef0e | ||
|
|
a54ea8fb1c | ||
|
|
d5708e7435 | ||
|
|
fd49005cb3 | ||
|
|
3023de156e | ||
|
|
e49e931bc4 | ||
|
|
13b9135d4e | ||
|
|
41bb1e42b8 | ||
|
|
cb4b40f9c1 | ||
|
|
9e567d9814 | ||
|
|
1c012958c7 | ||
|
|
e5c50bb12b | ||
|
|
926662eb7c | ||
|
|
3366cd34ba | ||
|
|
2d5a8462c8 | ||
|
|
110282ee7e | ||
|
|
f752c40f58 | ||
|
|
83cdbbb89a |
8
.github/workflows/build_and_test.yml
vendored
8
.github/workflows/build_and_test.yml
vendored
@@ -735,7 +735,7 @@ jobs:
|
||||
run: |
|
||||
mkdir -p .docker-custom
|
||||
echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
|
||||
- uses: docker/setup-buildx-action@v3
|
||||
- uses: docker/setup-buildx-action@v2
|
||||
|
||||
- uses: docker/login-action@v3
|
||||
with:
|
||||
@@ -792,7 +792,7 @@ jobs:
|
||||
run: |
|
||||
mkdir -p .docker-custom
|
||||
echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
|
||||
- uses: docker/setup-buildx-action@v3
|
||||
- uses: docker/setup-buildx-action@v2
|
||||
with:
|
||||
# Disable parallelism for docker buildkit.
|
||||
# As we already build everything with `make -j$(nproc)`, running it with an additional level of parallelism blows up the Runner.
|
||||
@@ -865,7 +865,7 @@ jobs:
|
||||
run:
|
||||
shell: sh -eu {0}
|
||||
env:
|
||||
VM_BUILDER_VERSION: v0.23.2
|
||||
VM_BUILDER_VERSION: v0.28.1
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
@@ -1133,8 +1133,6 @@ jobs:
|
||||
-f deployPreprodRegion=true
|
||||
|
||||
gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main \
|
||||
-f deployPgSniRouter=false \
|
||||
-f deployProxy=false \
|
||||
-f deployStorage=true \
|
||||
-f deployStorageBroker=true \
|
||||
-f deployStorageController=true \
|
||||
|
||||
@@ -28,7 +28,9 @@ jobs:
|
||||
- name: Get build-tools image tag for the current commit
|
||||
id: get-build-tools-tag
|
||||
env:
|
||||
COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
|
||||
# Usually, for COMMIT_SHA, we use `github.event.pull_request.head.sha || github.sha`, but here, even for PRs,
|
||||
# we want to use `github.sha` i.e. point to a phantom merge commit to determine the image tag correctly.
|
||||
COMMIT_SHA: ${{ github.sha }}
|
||||
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
run: |
|
||||
LAST_BUILD_TOOLS_SHA=$(
|
||||
|
||||
37
Cargo.lock
generated
37
Cargo.lock
generated
@@ -599,7 +599,7 @@ dependencies = [
|
||||
"once_cell",
|
||||
"pin-project-lite",
|
||||
"pin-utils",
|
||||
"rustls 0.21.9",
|
||||
"rustls 0.21.11",
|
||||
"tokio",
|
||||
"tracing",
|
||||
]
|
||||
@@ -2519,7 +2519,7 @@ dependencies = [
|
||||
"http 0.2.9",
|
||||
"hyper 0.14.26",
|
||||
"log",
|
||||
"rustls 0.21.9",
|
||||
"rustls 0.21.11",
|
||||
"rustls-native-certs 0.6.2",
|
||||
"tokio",
|
||||
"tokio-rustls 0.24.0",
|
||||
@@ -3658,6 +3658,7 @@ dependencies = [
|
||||
"tokio-util",
|
||||
"toml_edit",
|
||||
"tracing",
|
||||
"twox-hash",
|
||||
"url",
|
||||
"utils",
|
||||
"walkdir",
|
||||
@@ -4059,7 +4060,7 @@ dependencies = [
|
||||
"futures",
|
||||
"once_cell",
|
||||
"pq_proto",
|
||||
"rustls 0.22.2",
|
||||
"rustls 0.22.4",
|
||||
"rustls-pemfile 2.1.1",
|
||||
"serde",
|
||||
"thiserror",
|
||||
@@ -4350,7 +4351,7 @@ dependencies = [
|
||||
"routerify",
|
||||
"rstest",
|
||||
"rustc-hash",
|
||||
"rustls 0.22.2",
|
||||
"rustls 0.22.4",
|
||||
"rustls-pemfile 2.1.1",
|
||||
"scopeguard",
|
||||
"serde",
|
||||
@@ -4542,7 +4543,7 @@ dependencies = [
|
||||
"itoa",
|
||||
"percent-encoding",
|
||||
"pin-project-lite",
|
||||
"rustls 0.22.2",
|
||||
"rustls 0.22.4",
|
||||
"rustls-native-certs 0.7.0",
|
||||
"rustls-pemfile 2.1.1",
|
||||
"rustls-pki-types",
|
||||
@@ -4696,7 +4697,7 @@ dependencies = [
|
||||
"once_cell",
|
||||
"percent-encoding",
|
||||
"pin-project-lite",
|
||||
"rustls 0.21.9",
|
||||
"rustls 0.21.11",
|
||||
"rustls-pemfile 1.0.2",
|
||||
"serde",
|
||||
"serde_json",
|
||||
@@ -4956,9 +4957,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "rustls"
|
||||
version = "0.21.9"
|
||||
version = "0.21.11"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "629648aced5775d558af50b2b4c7b02983a04b312126d45eeead26e7caa498b9"
|
||||
checksum = "7fecbfb7b1444f477b345853b1fce097a2c6fb637b2bfb87e6bc5db0f043fae4"
|
||||
dependencies = [
|
||||
"log",
|
||||
"ring 0.17.6",
|
||||
@@ -4968,9 +4969,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "rustls"
|
||||
version = "0.22.2"
|
||||
version = "0.22.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e87c9956bd9807afa1f77e0f7594af32566e830e088a5576d27c5b6f30f49d41"
|
||||
checksum = "bf4ef73721ac7bcd79b2b315da7779d8fc09718c6b3d2d1b2d94850eb8c18432"
|
||||
dependencies = [
|
||||
"log",
|
||||
"ring 0.17.6",
|
||||
@@ -5282,7 +5283,7 @@ checksum = "2e95efd0cefa32028cdb9766c96de71d96671072f9fb494dc9fb84c0ef93e52b"
|
||||
dependencies = [
|
||||
"httpdate",
|
||||
"reqwest",
|
||||
"rustls 0.21.9",
|
||||
"rustls 0.21.11",
|
||||
"sentry-backtrace",
|
||||
"sentry-contexts",
|
||||
"sentry-core",
|
||||
@@ -5773,6 +5774,7 @@ dependencies = [
|
||||
"anyhow",
|
||||
"clap",
|
||||
"comfy-table",
|
||||
"humantime",
|
||||
"hyper 0.14.26",
|
||||
"pageserver_api",
|
||||
"pageserver_client",
|
||||
@@ -5830,8 +5832,7 @@ checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc"
|
||||
[[package]]
|
||||
name = "svg_fmt"
|
||||
version = "0.4.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f83ba502a3265efb76efb89b0a2f7782ad6f2675015d4ce37e4b547dda42b499"
|
||||
source = "git+https://github.com/neondatabase/fork--nical--rust_debug?branch=neon#b9501105e746629004bc6d0473639320939dbe10"
|
||||
|
||||
[[package]]
|
||||
name = "syn"
|
||||
@@ -6193,7 +6194,7 @@ checksum = "0ea13f22eda7127c827983bdaf0d7fff9df21c8817bab02815ac277a21143677"
|
||||
dependencies = [
|
||||
"futures",
|
||||
"ring 0.17.6",
|
||||
"rustls 0.22.2",
|
||||
"rustls 0.22.4",
|
||||
"tokio",
|
||||
"tokio-postgres",
|
||||
"tokio-rustls 0.25.0",
|
||||
@@ -6206,7 +6207,7 @@ version = "0.24.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e0d409377ff5b1e3ca6437aa86c1eb7d40c134bfec254e44c830defa92669db5"
|
||||
dependencies = [
|
||||
"rustls 0.21.9",
|
||||
"rustls 0.21.11",
|
||||
"tokio",
|
||||
]
|
||||
|
||||
@@ -6216,7 +6217,7 @@ version = "0.25.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "775e0c0f0adb3a2f22a00c4745d728b479985fc15ee7ca6a2608388c5569860f"
|
||||
dependencies = [
|
||||
"rustls 0.22.2",
|
||||
"rustls 0.22.4",
|
||||
"rustls-pki-types",
|
||||
"tokio",
|
||||
]
|
||||
@@ -6677,7 +6678,7 @@ dependencies = [
|
||||
"base64 0.21.1",
|
||||
"log",
|
||||
"once_cell",
|
||||
"rustls 0.21.9",
|
||||
"rustls 0.21.11",
|
||||
"rustls-webpki 0.100.2",
|
||||
"url",
|
||||
"webpki-roots 0.23.1",
|
||||
@@ -7354,7 +7355,7 @@ dependencies = [
|
||||
"regex-automata 0.4.3",
|
||||
"regex-syntax 0.8.2",
|
||||
"reqwest",
|
||||
"rustls 0.21.9",
|
||||
"rustls 0.21.11",
|
||||
"scopeguard",
|
||||
"serde",
|
||||
"serde_json",
|
||||
|
||||
@@ -157,7 +157,8 @@ socket2 = "0.5"
|
||||
strum = "0.24"
|
||||
strum_macros = "0.24"
|
||||
"subtle" = "2.5.0"
|
||||
svg_fmt = "0.4.1"
|
||||
# https://github.com/nical/rust_debug/pull/4
|
||||
svg_fmt = { git = "https://github.com/neondatabase/fork--nical--rust_debug", branch = "neon" }
|
||||
sync_wrapper = "0.1.2"
|
||||
tar = "0.4"
|
||||
task-local-extensions = "0.1.4"
|
||||
|
||||
@@ -818,9 +818,15 @@ impl ComputeNode {
|
||||
Client::connect(zenith_admin_connstr.as_str(), NoTls)
|
||||
.context("broken cloud_admin credential: tried connecting with cloud_admin but could not authenticate, and zenith_admin does not work either")?;
|
||||
// Disable forwarding so that users don't get a cloud_admin role
|
||||
client.simple_query("SET neon.forward_ddl = false")?;
|
||||
client.simple_query("CREATE USER cloud_admin WITH SUPERUSER")?;
|
||||
client.simple_query("GRANT zenith_admin TO cloud_admin")?;
|
||||
|
||||
let mut func = || {
|
||||
client.simple_query("SET neon.forward_ddl = false")?;
|
||||
client.simple_query("CREATE USER cloud_admin WITH SUPERUSER")?;
|
||||
client.simple_query("GRANT zenith_admin TO cloud_admin")?;
|
||||
Ok::<_, anyhow::Error>(())
|
||||
};
|
||||
func().context("apply_config setup cloud_admin")?;
|
||||
|
||||
drop(client);
|
||||
|
||||
// reconnect with connstring with expected name
|
||||
@@ -832,24 +838,29 @@ impl ComputeNode {
|
||||
};
|
||||
|
||||
// Disable DDL forwarding because control plane already knows about these roles/databases.
|
||||
client.simple_query("SET neon.forward_ddl = false")?;
|
||||
client
|
||||
.simple_query("SET neon.forward_ddl = false")
|
||||
.context("apply_config SET neon.forward_ddl = false")?;
|
||||
|
||||
// Proceed with post-startup configuration. Note, that order of operations is important.
|
||||
let spec = &compute_state.pspec.as_ref().expect("spec must be set").spec;
|
||||
create_neon_superuser(spec, &mut client)?;
|
||||
cleanup_instance(&mut client)?;
|
||||
handle_roles(spec, &mut client)?;
|
||||
handle_databases(spec, &mut client)?;
|
||||
handle_role_deletions(spec, connstr.as_str(), &mut client)?;
|
||||
create_neon_superuser(spec, &mut client).context("apply_config create_neon_superuser")?;
|
||||
cleanup_instance(&mut client).context("apply_config cleanup_instance")?;
|
||||
handle_roles(spec, &mut client).context("apply_config handle_roles")?;
|
||||
handle_databases(spec, &mut client).context("apply_config handle_databases")?;
|
||||
handle_role_deletions(spec, connstr.as_str(), &mut client)
|
||||
.context("apply_config handle_role_deletions")?;
|
||||
handle_grants(
|
||||
spec,
|
||||
&mut client,
|
||||
connstr.as_str(),
|
||||
self.has_feature(ComputeFeature::AnonExtension),
|
||||
)?;
|
||||
handle_extensions(spec, &mut client)?;
|
||||
handle_extension_neon(&mut client)?;
|
||||
create_availability_check_data(&mut client)?;
|
||||
)
|
||||
.context("apply_config handle_grants")?;
|
||||
handle_extensions(spec, &mut client).context("apply_config handle_extensions")?;
|
||||
handle_extension_neon(&mut client).context("apply_config handle_extension_neon")?;
|
||||
create_availability_check_data(&mut client)
|
||||
.context("apply_config create_availability_check_data")?;
|
||||
|
||||
// 'Close' connection
|
||||
drop(client);
|
||||
@@ -857,7 +868,7 @@ impl ComputeNode {
|
||||
// Run migrations separately to not hold up cold starts
|
||||
thread::spawn(move || {
|
||||
let mut client = Client::connect(connstr.as_str(), NoTls)?;
|
||||
handle_migrations(&mut client)
|
||||
handle_migrations(&mut client).context("apply_config handle_migrations")
|
||||
});
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -2,7 +2,7 @@ use std::fs::File;
|
||||
use std::path::Path;
|
||||
use std::str::FromStr;
|
||||
|
||||
use anyhow::{anyhow, bail, Result};
|
||||
use anyhow::{anyhow, bail, Context, Result};
|
||||
use postgres::config::Config;
|
||||
use postgres::{Client, NoTls};
|
||||
use reqwest::StatusCode;
|
||||
@@ -698,7 +698,8 @@ pub fn handle_grants(
|
||||
|
||||
// it is important to run this after all grants
|
||||
if enable_anon_extension {
|
||||
handle_extension_anon(spec, &db.owner, &mut db_client, false)?;
|
||||
handle_extension_anon(spec, &db.owner, &mut db_client, false)
|
||||
.context("handle_grants handle_extension_anon")?;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -813,28 +814,36 @@ $$;"#,
|
||||
// Add new migrations below.
|
||||
];
|
||||
|
||||
let mut query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
|
||||
client.simple_query(query)?;
|
||||
let mut func = || {
|
||||
let query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
|
||||
client.simple_query(query)?;
|
||||
|
||||
query = "CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)";
|
||||
client.simple_query(query)?;
|
||||
let query = "CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)";
|
||||
client.simple_query(query)?;
|
||||
|
||||
query = "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING";
|
||||
client.simple_query(query)?;
|
||||
let query = "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING";
|
||||
client.simple_query(query)?;
|
||||
|
||||
query = "ALTER SCHEMA neon_migration OWNER TO cloud_admin";
|
||||
client.simple_query(query)?;
|
||||
let query = "ALTER SCHEMA neon_migration OWNER TO cloud_admin";
|
||||
client.simple_query(query)?;
|
||||
|
||||
query = "REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC";
|
||||
client.simple_query(query)?;
|
||||
let query = "REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC";
|
||||
client.simple_query(query)?;
|
||||
Ok::<_, anyhow::Error>(())
|
||||
};
|
||||
func().context("handle_migrations prepare")?;
|
||||
|
||||
query = "SELECT id FROM neon_migration.migration_id";
|
||||
let row = client.query_one(query, &[])?;
|
||||
let query = "SELECT id FROM neon_migration.migration_id";
|
||||
let row = client
|
||||
.query_one(query, &[])
|
||||
.context("handle_migrations get migration_id")?;
|
||||
let mut current_migration: usize = row.get::<&str, i64>("id") as usize;
|
||||
let starting_migration_id = current_migration;
|
||||
|
||||
query = "BEGIN";
|
||||
client.simple_query(query)?;
|
||||
let query = "BEGIN";
|
||||
client
|
||||
.simple_query(query)
|
||||
.context("handle_migrations begin")?;
|
||||
|
||||
while current_migration < migrations.len() {
|
||||
let migration = &migrations[current_migration];
|
||||
@@ -842,7 +851,9 @@ $$;"#,
|
||||
info!("Skip migration id={}", current_migration);
|
||||
} else {
|
||||
info!("Running migration:\n{}\n", migration);
|
||||
client.simple_query(migration)?;
|
||||
client.simple_query(migration).with_context(|| {
|
||||
format!("handle_migrations current_migration={}", current_migration)
|
||||
})?;
|
||||
}
|
||||
current_migration += 1;
|
||||
}
|
||||
@@ -850,10 +861,14 @@ $$;"#,
|
||||
"UPDATE neon_migration.migration_id SET id={}",
|
||||
migrations.len()
|
||||
);
|
||||
client.simple_query(&setval)?;
|
||||
client
|
||||
.simple_query(&setval)
|
||||
.context("handle_migrations update id")?;
|
||||
|
||||
query = "COMMIT";
|
||||
client.simple_query(query)?;
|
||||
let query = "COMMIT";
|
||||
client
|
||||
.simple_query(query)
|
||||
.context("handle_migrations commit")?;
|
||||
|
||||
info!(
|
||||
"Ran {} migrations",
|
||||
|
||||
@@ -1417,6 +1417,7 @@ fn cli() -> Command {
|
||||
.subcommand(
|
||||
Command::new("timeline")
|
||||
.about("Manage timelines")
|
||||
.arg_required_else_help(true)
|
||||
.subcommand(Command::new("list")
|
||||
.about("List all timelines, available to this pageserver")
|
||||
.arg(tenant_id_arg.clone()))
|
||||
|
||||
@@ -156,6 +156,7 @@ pub struct SafekeeperConf {
|
||||
pub remote_storage: Option<String>,
|
||||
pub backup_threads: Option<u32>,
|
||||
pub auth_enabled: bool,
|
||||
pub listen_addr: Option<String>,
|
||||
}
|
||||
|
||||
impl Default for SafekeeperConf {
|
||||
@@ -169,6 +170,7 @@ impl Default for SafekeeperConf {
|
||||
remote_storage: None,
|
||||
backup_threads: None,
|
||||
auth_enabled: false,
|
||||
listen_addr: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -70,24 +70,31 @@ pub struct SafekeeperNode {
|
||||
pub pg_connection_config: PgConnectionConfig,
|
||||
pub env: LocalEnv,
|
||||
pub http_client: reqwest::Client,
|
||||
pub listen_addr: String,
|
||||
pub http_base_url: String,
|
||||
}
|
||||
|
||||
impl SafekeeperNode {
|
||||
pub fn from_env(env: &LocalEnv, conf: &SafekeeperConf) -> SafekeeperNode {
|
||||
let listen_addr = if let Some(ref listen_addr) = conf.listen_addr {
|
||||
listen_addr.clone()
|
||||
} else {
|
||||
"127.0.0.1".to_string()
|
||||
};
|
||||
SafekeeperNode {
|
||||
id: conf.id,
|
||||
conf: conf.clone(),
|
||||
pg_connection_config: Self::safekeeper_connection_config(conf.pg_port),
|
||||
pg_connection_config: Self::safekeeper_connection_config(&listen_addr, conf.pg_port),
|
||||
env: env.clone(),
|
||||
http_client: reqwest::Client::new(),
|
||||
http_base_url: format!("http://127.0.0.1:{}/v1", conf.http_port),
|
||||
http_base_url: format!("http://{}:{}/v1", listen_addr, conf.http_port),
|
||||
listen_addr,
|
||||
}
|
||||
}
|
||||
|
||||
/// Construct libpq connection string for connecting to this safekeeper.
|
||||
fn safekeeper_connection_config(port: u16) -> PgConnectionConfig {
|
||||
PgConnectionConfig::new_host_port(url::Host::parse("127.0.0.1").unwrap(), port)
|
||||
fn safekeeper_connection_config(addr: &str, port: u16) -> PgConnectionConfig {
|
||||
PgConnectionConfig::new_host_port(url::Host::parse(addr).unwrap(), port)
|
||||
}
|
||||
|
||||
pub fn datadir_path_by_id(env: &LocalEnv, sk_id: NodeId) -> PathBuf {
|
||||
@@ -111,8 +118,8 @@ impl SafekeeperNode {
|
||||
);
|
||||
io::stdout().flush().unwrap();
|
||||
|
||||
let listen_pg = format!("127.0.0.1:{}", self.conf.pg_port);
|
||||
let listen_http = format!("127.0.0.1:{}", self.conf.http_port);
|
||||
let listen_pg = format!("{}:{}", self.listen_addr, self.conf.pg_port);
|
||||
let listen_http = format!("{}:{}", self.listen_addr, self.conf.http_port);
|
||||
let id = self.id;
|
||||
let datadir = self.datadir_path();
|
||||
|
||||
@@ -139,7 +146,7 @@ impl SafekeeperNode {
|
||||
availability_zone,
|
||||
];
|
||||
if let Some(pg_tenant_only_port) = self.conf.pg_tenant_only_port {
|
||||
let listen_pg_tenant_only = format!("127.0.0.1:{}", pg_tenant_only_port);
|
||||
let listen_pg_tenant_only = format!("{}:{}", self.listen_addr, pg_tenant_only_port);
|
||||
args.extend(["--listen-pg-tenant-only".to_owned(), listen_pg_tenant_only]);
|
||||
}
|
||||
if !self.conf.sync {
|
||||
|
||||
@@ -9,6 +9,7 @@ license.workspace = true
|
||||
anyhow.workspace = true
|
||||
clap.workspace = true
|
||||
comfy-table.workspace = true
|
||||
humantime.workspace = true
|
||||
hyper.workspace = true
|
||||
pageserver_api.workspace = true
|
||||
pageserver_client.workspace = true
|
||||
|
||||
@@ -1,13 +1,14 @@
|
||||
use std::{collections::HashMap, str::FromStr};
|
||||
use std::{collections::HashMap, str::FromStr, time::Duration};
|
||||
|
||||
use clap::{Parser, Subcommand};
|
||||
use hyper::Method;
|
||||
use hyper::{Method, StatusCode};
|
||||
use pageserver_api::{
|
||||
controller_api::{
|
||||
NodeAvailabilityWrapper, NodeDescribeResponse, ShardSchedulingPolicy,
|
||||
TenantDescribeResponse, TenantPolicyRequest,
|
||||
},
|
||||
models::{
|
||||
EvictionPolicy, EvictionPolicyLayerAccessThreshold, LocationConfigSecondary,
|
||||
ShardParameters, TenantConfig, TenantConfigRequest, TenantCreateRequest,
|
||||
TenantShardSplitRequest, TenantShardSplitResponse,
|
||||
},
|
||||
@@ -120,6 +121,30 @@ enum Command {
|
||||
#[arg(long)]
|
||||
tenant_id: TenantId,
|
||||
},
|
||||
/// For a tenant which hasn't been onboarded to the storage controller yet, add it in secondary
|
||||
/// mode so that it can warm up content on a pageserver.
|
||||
TenantWarmup {
|
||||
#[arg(long)]
|
||||
tenant_id: TenantId,
|
||||
},
|
||||
/// Uncleanly drop a tenant from the storage controller: this doesn't delete anything from pageservers. Appropriate
|
||||
/// if you e.g. used `tenant-warmup` by mistake on a tenant ID that doesn't really exist, or is in some other region.
|
||||
TenantDrop {
|
||||
#[arg(long)]
|
||||
tenant_id: TenantId,
|
||||
},
|
||||
PageserverEnableHeatmaps {
|
||||
#[arg(long)]
|
||||
tenant_id: TenantId,
|
||||
},
|
||||
TenantSetTimeBasedEviction {
|
||||
#[arg(long)]
|
||||
tenant_id: TenantId,
|
||||
#[arg(long)]
|
||||
period: humantime::Duration,
|
||||
#[arg(long)]
|
||||
threshold: humantime::Duration,
|
||||
},
|
||||
}
|
||||
|
||||
#[derive(Parser)]
|
||||
@@ -581,6 +606,172 @@ async fn main() -> anyhow::Result<()> {
|
||||
}
|
||||
println!("{table}");
|
||||
}
|
||||
Command::TenantWarmup { tenant_id } => {
|
||||
let describe_response = storcon_client
|
||||
.dispatch::<(), TenantDescribeResponse>(
|
||||
Method::GET,
|
||||
format!("control/v1/tenant/{tenant_id}"),
|
||||
None,
|
||||
)
|
||||
.await;
|
||||
match describe_response {
|
||||
Ok(describe) => {
|
||||
if matches!(describe.policy, PlacementPolicy::Secondary) {
|
||||
// Fine: it's already known to controller in secondary mode: calling
|
||||
// again to put it into secondary mode won't cause problems.
|
||||
} else {
|
||||
anyhow::bail!("Tenant already present with policy {:?}", describe.policy);
|
||||
}
|
||||
}
|
||||
Err(mgmt_api::Error::ApiError(StatusCode::NOT_FOUND, _)) => {
|
||||
// Fine: this tenant isn't known to the storage controller yet.
|
||||
}
|
||||
Err(e) => {
|
||||
// Unexpected API error
|
||||
return Err(e.into());
|
||||
}
|
||||
}
|
||||
|
||||
vps_client
|
||||
.location_config(
|
||||
TenantShardId::unsharded(tenant_id),
|
||||
pageserver_api::models::LocationConfig {
|
||||
mode: pageserver_api::models::LocationConfigMode::Secondary,
|
||||
generation: None,
|
||||
secondary_conf: Some(LocationConfigSecondary { warm: true }),
|
||||
shard_number: 0,
|
||||
shard_count: 0,
|
||||
shard_stripe_size: ShardParameters::DEFAULT_STRIPE_SIZE.0,
|
||||
tenant_conf: TenantConfig::default(),
|
||||
},
|
||||
None,
|
||||
true,
|
||||
)
|
||||
.await?;
|
||||
|
||||
let describe_response = storcon_client
|
||||
.dispatch::<(), TenantDescribeResponse>(
|
||||
Method::GET,
|
||||
format!("control/v1/tenant/{tenant_id}"),
|
||||
None,
|
||||
)
|
||||
.await?;
|
||||
|
||||
let secondary_ps_id = describe_response
|
||||
.shards
|
||||
.first()
|
||||
.unwrap()
|
||||
.node_secondary
|
||||
.first()
|
||||
.unwrap();
|
||||
|
||||
println!("Tenant {tenant_id} warming up on pageserver {secondary_ps_id}");
|
||||
loop {
|
||||
let (status, progress) = vps_client
|
||||
.tenant_secondary_download(
|
||||
TenantShardId::unsharded(tenant_id),
|
||||
Some(Duration::from_secs(10)),
|
||||
)
|
||||
.await?;
|
||||
println!(
|
||||
"Progress: {}/{} layers, {}/{} bytes",
|
||||
progress.layers_downloaded,
|
||||
progress.layers_total,
|
||||
progress.bytes_downloaded,
|
||||
progress.bytes_total
|
||||
);
|
||||
match status {
|
||||
StatusCode::OK => {
|
||||
println!("Download complete");
|
||||
break;
|
||||
}
|
||||
StatusCode::ACCEPTED => {
|
||||
// Loop
|
||||
}
|
||||
_ => {
|
||||
anyhow::bail!("Unexpected download status: {status}");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Command::TenantDrop { tenant_id } => {
|
||||
storcon_client
|
||||
.dispatch::<(), ()>(
|
||||
Method::POST,
|
||||
format!("debug/v1/tenant/{tenant_id}/drop"),
|
||||
None,
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
Command::PageserverEnableHeatmaps { tenant_id } => {
|
||||
vps_client
|
||||
.tenant_config(&TenantConfigRequest {
|
||||
tenant_id,
|
||||
config: TenantConfig {
|
||||
checkpoint_distance: None,
|
||||
checkpoint_timeout: None,
|
||||
compaction_target_size: None,
|
||||
compaction_period: None,
|
||||
compaction_threshold: None,
|
||||
compaction_algorithm: None,
|
||||
gc_horizon: None,
|
||||
gc_period: None,
|
||||
image_creation_threshold: None,
|
||||
pitr_interval: None,
|
||||
walreceiver_connect_timeout: None,
|
||||
lagging_wal_timeout: None,
|
||||
max_lsn_wal_lag: None,
|
||||
trace_read_requests: None,
|
||||
eviction_policy: None,
|
||||
min_resident_size_override: None,
|
||||
evictions_low_residence_duration_metric_threshold: None,
|
||||
heatmap_period: Some("60s".to_string()),
|
||||
lazy_slru_download: None,
|
||||
timeline_get_throttle: None,
|
||||
image_layer_creation_check_threshold: None,
|
||||
},
|
||||
})
|
||||
.await?;
|
||||
}
|
||||
Command::TenantSetTimeBasedEviction {
|
||||
tenant_id,
|
||||
period,
|
||||
threshold,
|
||||
} => {
|
||||
vps_client
|
||||
.tenant_config(&TenantConfigRequest {
|
||||
tenant_id,
|
||||
config: TenantConfig {
|
||||
checkpoint_distance: None,
|
||||
checkpoint_timeout: None,
|
||||
compaction_target_size: None,
|
||||
compaction_period: None,
|
||||
compaction_threshold: None,
|
||||
compaction_algorithm: None,
|
||||
gc_horizon: None,
|
||||
gc_period: None,
|
||||
image_creation_threshold: None,
|
||||
pitr_interval: None,
|
||||
walreceiver_connect_timeout: None,
|
||||
lagging_wal_timeout: None,
|
||||
max_lsn_wal_lag: None,
|
||||
trace_read_requests: None,
|
||||
eviction_policy: Some(EvictionPolicy::LayerAccessThreshold(
|
||||
EvictionPolicyLayerAccessThreshold {
|
||||
period: period.into(),
|
||||
threshold: threshold.into(),
|
||||
},
|
||||
)),
|
||||
min_resident_size_override: None,
|
||||
evictions_low_residence_duration_metric_threshold: None,
|
||||
heatmap_period: None,
|
||||
lazy_slru_download: None,
|
||||
timeline_get_throttle: None,
|
||||
image_layer_creation_check_threshold: None,
|
||||
},
|
||||
})
|
||||
.await?;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
|
||||
150
docs/storage_controller.md
Normal file
150
docs/storage_controller.md
Normal file
@@ -0,0 +1,150 @@
|
||||
# Storage Controller
|
||||
|
||||
## Concepts
|
||||
|
||||
The storage controller sits between administrative API clients and pageservers, and handles the details of mapping tenants to pageserver tenant shards. For example, creating a tenant is one API call to the storage controller,
|
||||
which is mapped into many API calls to many pageservers (for multiple shards, and for secondary locations).
|
||||
|
||||
It implements a pageserver-compatible API that may be used for CRUD operations on tenants and timelines, translating these requests into appropriate operations on the shards within a tenant, which may be on many different pageservers. Using this API, the storage controller may be used in the same way as the pageserver's administrative HTTP API, hiding
|
||||
the underlying details of how data is spread across multiple nodes.
|
||||
|
||||
The storage controller also manages generations, high availability (via secondary locations) and live migrations for tenants under its management. This is done with a reconciliation loop pattern, where tenants have an “intent” state and a “reconcile” task that tries to make the outside world match the intent.
|
||||
|
||||
## APIs
|
||||
|
||||
The storage controller’s HTTP server implements four logically separate APIs:
|
||||
|
||||
- `/v1/...` path is the pageserver-compatible API. This has to be at the path root because that’s where clients expect to find it on a pageserver.
|
||||
- `/control/v1/...` path is the storage controller’s API, which enables operations such as registering and managing pageservers, or executing shard splits.
|
||||
- `/debug/v1/...` path contains endpoints which are either exclusively used in tests, or are for use by engineers when supporting a deployed system.
|
||||
- `/upcall/v1/...` path contains endpoints that are called by pageservers. This includes the `/re-attach` and `/validate` APIs used by pageservers
|
||||
to ensure data safety with generation numbers.
|
||||
|
||||
The API is authenticated with a JWT token, and tokens must have scope `pageserverapi` (i.e. the same scope as pageservers’ APIs).
|
||||
|
||||
See the `http.rs` file in the source for where the HTTP APIs are implemented.
|
||||
|
||||
## Database
|
||||
|
||||
The storage controller uses a postgres database to persist a subset of its state. Note that the storage controller does _not_ keep all its state in the database: this is a design choice to enable most operations to be done efficiently in memory, rather than having to read from the database. See `persistence.rs` for a more comprehensive comment explaining what we do and do not persist: a useful metaphor is that we persist objects like tenants and nodes, but we do not
|
||||
persist the _relationships_ between them: the attachment state of a tenant's shards to nodes is kept in memory and
|
||||
rebuilt on startup.
|
||||
|
||||
The file `persistence.rs` contains all the code for accessing the database, and has a large doc comment that goes into more detail about exactly what we persist and why.
|
||||
|
||||
The `diesel` crate is used for defining models & migrations.
|
||||
|
||||
Running a local cluster with `cargo neon` automatically starts a vanilla postgres process to host the storage controller’s database.
|
||||
|
||||
### Diesel tip: migrations
|
||||
|
||||
If you need to modify the database schema, here’s how to create a migration:
|
||||
|
||||
- Install the diesel CLI with `cargo install diesel_cli`
|
||||
- Use `diesel migration generate <name>` to create a new migration
|
||||
- Populate the SQL files in the `migrations/` subdirectory
|
||||
- Use `DATABASE_URL=... diesel migration run` to apply the migration you just wrote: this will update the `schema.rs` file automatically.
|
||||
- This requires a running database: the easiest way to do that is to just run `cargo neon init ; cargo neon start`, which will leave a database available at `postgresql://localhost:1235/attachment_service`
|
||||
- Commit the migration files and the changes to schema.rs
|
||||
- If you need to iterate, you can rewind migrations with `diesel migration revert -a` and then `diesel migration run` again.
|
||||
- The migrations are built into the storage controller binary, and automatically run at startup after it is deployed, so once you’ve committed a migration no further steps are needed.
|
||||
|
||||
## storcon_cli
|
||||
|
||||
The `storcon_cli` tool enables interactive management of the storage controller. This is usually
|
||||
only necessary for debugging, but may also be used to manage nodes (e.g. marking a node as offline).
|
||||
|
||||
`storcon_cli --help` includes details on commands.
|
||||
|
||||
# Deploying
|
||||
|
||||
This section is aimed at engineers deploying the storage controller outside of Neon's cloud platform, as
|
||||
part of a self-hosted system.
|
||||
|
||||
_General note: since the default `neon_local` environment includes a storage controller, this is a useful
|
||||
reference when figuring out deployment._
|
||||
|
||||
## Database
|
||||
|
||||
It is **essential** that the database used by the storage controller is durable (**do not store it on ephemeral
|
||||
local disk**). This database contains pageserver generation numbers, which are essential to data safety on the pageserver.
|
||||
|
||||
The resource requirements for the database are very low: a single CPU core and 1GiB of memory should work well for most deployments. The physical size of the database is typically under a gigabyte.
|
||||
|
||||
Set the URL to the database using the `--database-url` CLI option.
|
||||
|
||||
There is no need to run migrations manually: the storage controller automatically applies migrations
|
||||
when it starts up.
|
||||
|
||||
## Configure pageservers to use the storage controller
|
||||
|
||||
1. The pageserver `control_plane_api` and `control_plane_api_token` should be set in the `pageserver.toml` file. The API setting should
|
||||
point to the "upcall" prefix, for example `http://127.0.0.1:1234/upcall/v1/` is used in neon_local clusters.
|
||||
2. Create a `metadata.json` file in the same directory as `pageserver.toml`: this enables the pageserver to automatically register itself
|
||||
with the storage controller when it starts up. See the example below for the format of this file.
|
||||
|
||||
### Example `metadata.json`
|
||||
|
||||
```
|
||||
{"host":"acmehost.localdomain","http_host":"acmehost.localdomain","http_port":9898,"port":64000}
|
||||
```
|
||||
|
||||
- `port` and `host` refer to the _postgres_ port and host, and these must be accessible from wherever
|
||||
postgres runs.
|
||||
- `http_port` and `http_host` refer to the pageserver's HTTP API, which must be accessible from where
|
||||
the storage controller runs.
|
||||
|
||||
## Handle compute notifications.
|
||||
|
||||
The storage controller independently moves tenant attachments between pageservers in response to
|
||||
changes such as a pageserver node becoming unavailable, or the tenant's shard count changing. To enable
|
||||
postgres clients to handle such changes, the storage controller calls an API hook when a tenant's pageserver
|
||||
location changes.
|
||||
|
||||
The hook is configured using the storage controller's `--compute-hook-url` CLI option. If the hook requires
|
||||
JWT auth, the token may be provided with `--control-plane-jwt-token`. The hook will be invoked with a `PUT` request.
|
||||
|
||||
In the Neon cloud service, this hook is implemented by Neon's internal cloud control plane. In `neon_local` systems
|
||||
the storage controller integrates directly with neon_local to reconfigure local postgres processes instead of calling
|
||||
the compute hook.
|
||||
|
||||
When implementing an on-premise Neon deployment, you must implement a service that handles the compute hook. This is not complicated:
|
||||
the request body has the format of the `ComputeHookNotifyRequest` structure, provided below for convenience.
|
||||
|
||||
```
|
||||
struct ComputeHookNotifyRequestShard {
|
||||
node_id: NodeId,
|
||||
shard_number: ShardNumber,
|
||||
}
|
||||
|
||||
struct ComputeHookNotifyRequest {
|
||||
tenant_id: TenantId,
|
||||
stripe_size: Option<ShardStripeSize>,
|
||||
shards: Vec<ComputeHookNotifyRequestShard>,
|
||||
}
|
||||
```
|
||||
|
||||
When a notification is received:
|
||||
|
||||
1. Modify postgres configuration for this tenant:
|
||||
|
||||
- set `neon.pageserver_connstr` to a comma-separated list of postgres connection strings to pageservers according to the `shards` list. The
|
||||
shards identified by `NodeId` must be converted to the address+port of the node.
|
||||
- if stripe_size is not None, set `neon.stripe_size` to this value
|
||||
|
||||
2. Send SIGHUP to postgres to reload configuration
|
||||
3. Respond with 200 to the notification request. Do not return success if postgres was not updated: if an error is returned, the controller
|
||||
will retry the notification until it succeeds.
|
||||
|
||||
### Example notification body
|
||||
|
||||
```
|
||||
{
|
||||
"tenant_id": "1f359dd625e519a1a4e8d7509690f6fc",
|
||||
"stripe_size": 32768,
|
||||
"shards": [
|
||||
{"node_id": 344, "shard_number": 0},
|
||||
{"node_id": 722, "shard_number": 1}
|
||||
]
|
||||
}
|
||||
```
|
||||
@@ -1,8 +1,10 @@
|
||||
use anyhow::{bail, Result};
|
||||
use byteorder::{ByteOrder, BE};
|
||||
use bytes::BufMut;
|
||||
use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
|
||||
use postgres_ffi::{Oid, TransactionId};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::ops::RangeInclusive;
|
||||
use std::{fmt, ops::Range};
|
||||
|
||||
use crate::reltag::{BlockNumber, RelTag, SlruKind};
|
||||
@@ -21,9 +23,81 @@ pub struct Key {
|
||||
pub field6: u32,
|
||||
}
|
||||
|
||||
/// The storage key size.
|
||||
pub const KEY_SIZE: usize = 18;
|
||||
|
||||
/// The metadata key size. 2B fewer than the storage key size because field2 is not fully utilized.
|
||||
/// See [`Key::to_i128`] for more information on the encoding.
|
||||
pub const METADATA_KEY_SIZE: usize = 16;
|
||||
|
||||
/// The key prefix start range for the metadata keys. All keys with the first byte >= 0x80 are metadata keys.
|
||||
pub const METADATA_KEY_BEGIN_PREFIX: u8 = 0x80;
|
||||
|
||||
/// The (reserved) key prefix of relation sizes.
|
||||
pub const RELATION_SIZE_PREFIX: u8 = 0x81;
|
||||
|
||||
/// The key prefix of AUX file keys.
|
||||
pub const AUX_KEY_PREFIX: u8 = 0x82;
|
||||
|
||||
/// Check if the key falls in the range of metadata keys.
|
||||
pub const fn is_metadata_key_slice(key: &[u8]) -> bool {
|
||||
key[0] >= METADATA_KEY_BEGIN_PREFIX
|
||||
}
|
||||
|
||||
impl Key {
|
||||
/// Check if the key falls in the range of metadata keys.
|
||||
pub const fn is_metadata_key(&self) -> bool {
|
||||
self.field1 >= METADATA_KEY_BEGIN_PREFIX
|
||||
}
|
||||
|
||||
/// Encode a metadata key to a storage key.
|
||||
pub fn from_metadata_key_fixed_size(key: &[u8; METADATA_KEY_SIZE]) -> Self {
|
||||
assert!(is_metadata_key_slice(key), "key not in metadata key range");
|
||||
Key {
|
||||
field1: key[0],
|
||||
field2: u16::from_be_bytes(key[1..3].try_into().unwrap()) as u32,
|
||||
field3: u32::from_be_bytes(key[3..7].try_into().unwrap()),
|
||||
field4: u32::from_be_bytes(key[7..11].try_into().unwrap()),
|
||||
field5: key[11],
|
||||
field6: u32::from_be_bytes(key[12..16].try_into().unwrap()),
|
||||
}
|
||||
}
|
||||
|
||||
/// Encode a metadata key to a storage key.
|
||||
pub fn from_metadata_key(key: &[u8]) -> Self {
|
||||
Self::from_metadata_key_fixed_size(key.try_into().expect("expect 16 byte metadata key"))
|
||||
}
|
||||
|
||||
/// Extract a metadata key to a writer. The result should always be 16 bytes.
|
||||
pub fn extract_metadata_key_to_writer(&self, mut writer: impl BufMut) {
|
||||
writer.put_u8(self.field1);
|
||||
assert!(self.field2 <= 0xFFFF);
|
||||
writer.put_u16(self.field2 as u16);
|
||||
writer.put_u32(self.field3);
|
||||
writer.put_u32(self.field4);
|
||||
writer.put_u8(self.field5);
|
||||
writer.put_u32(self.field6);
|
||||
}
|
||||
|
||||
/// Get the range of metadata keys.
|
||||
pub fn metadata_key_range() -> RangeInclusive<Self> {
|
||||
Key {
|
||||
field1: METADATA_KEY_BEGIN_PREFIX,
|
||||
field2: 0,
|
||||
field3: 0,
|
||||
field4: 0,
|
||||
field5: 0,
|
||||
field6: 0,
|
||||
}..=Key {
|
||||
field1: u8::MAX,
|
||||
field2: u16::MAX as u32,
|
||||
field3: u32::MAX,
|
||||
field4: u32::MAX,
|
||||
field5: u8::MAX,
|
||||
field6: u32::MAX,
|
||||
}
|
||||
}
|
||||
|
||||
/// 'field2' is used to store tablespaceid for relations and small enum numbers for other relish.
|
||||
/// As long as Neon does not support tablespace (because of lack of access to local file system),
|
||||
/// we can assume that only some predefined namespace OIDs are used which can fit in u16
|
||||
@@ -48,11 +122,11 @@ impl Key {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn next(&self) -> Key {
|
||||
pub const fn next(&self) -> Key {
|
||||
self.add(1)
|
||||
}
|
||||
|
||||
pub fn add(&self, x: u32) -> Key {
|
||||
pub const fn add(&self, x: u32) -> Key {
|
||||
let mut key = *self;
|
||||
|
||||
let r = key.field6.overflowing_add(x);
|
||||
@@ -81,6 +155,8 @@ impl Key {
|
||||
key
|
||||
}
|
||||
|
||||
/// Convert an 18-byte slice to a key. This function should not be used for metadata keys because field2 is handled differently.
|
||||
/// Use [`Key::from_metadata_key`] instead.
|
||||
pub fn from_slice(b: &[u8]) -> Self {
|
||||
Key {
|
||||
field1: b[0],
|
||||
@@ -92,6 +168,8 @@ impl Key {
|
||||
}
|
||||
}
|
||||
|
||||
/// Convert a key to an 18-byte slice. This function should not be used for metadata keys because field2 is handled differently.
|
||||
/// Use [`Key::extract_metadata_key_to_writer`] instead.
|
||||
pub fn write_to_byte_slice(&self, buf: &mut [u8]) {
|
||||
buf[0] = self.field1;
|
||||
BE::write_u32(&mut buf[1..5], self.field2);
|
||||
@@ -475,12 +553,14 @@ pub const AUX_FILES_KEY: Key = Key {
|
||||
// Reverse mappings for a few Keys.
|
||||
// These are needed by WAL redo manager.
|
||||
|
||||
pub const NON_INHERITED_RANGE: Range<Key> = AUX_FILES_KEY..AUX_FILES_KEY.next();
|
||||
|
||||
// AUX_FILES currently stores only data for logical replication (slots etc), and
|
||||
// we don't preserve these on a branch because safekeepers can't follow timeline
|
||||
// switch (and generally it likely should be optional), so ignore these.
|
||||
#[inline(always)]
|
||||
pub fn is_inherited_key(key: Key) -> bool {
|
||||
key != AUX_FILES_KEY
|
||||
!NON_INHERITED_RANGE.contains(&key)
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
@@ -556,11 +636,14 @@ impl std::str::FromStr for Key {
|
||||
mod tests {
|
||||
use std::str::FromStr;
|
||||
|
||||
use crate::key::is_metadata_key_slice;
|
||||
use crate::key::Key;
|
||||
|
||||
use rand::Rng;
|
||||
use rand::SeedableRng;
|
||||
|
||||
use super::AUX_KEY_PREFIX;
|
||||
|
||||
#[test]
|
||||
fn display_fromstr_bijection() {
|
||||
let mut rng = rand::rngs::StdRng::seed_from_u64(42);
|
||||
@@ -576,4 +659,16 @@ mod tests {
|
||||
|
||||
assert_eq!(key, Key::from_str(&format!("{key}")).unwrap());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_metadata_keys() {
|
||||
let mut metadata_key = vec![AUX_KEY_PREFIX];
|
||||
metadata_key.extend_from_slice(&[0xFF; 15]);
|
||||
let encoded_key = Key::from_metadata_key(&metadata_key);
|
||||
let mut output_key = Vec::new();
|
||||
encoded_key.extract_metadata_key_to_writer(&mut output_key);
|
||||
assert_eq!(metadata_key, output_key);
|
||||
assert!(encoded_key.is_metadata_key());
|
||||
assert!(is_metadata_key_slice(&metadata_key));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -94,12 +94,13 @@ impl KeySpace {
|
||||
|
||||
/// Remove all keys in `other` from `self`.
|
||||
/// This can involve splitting or removing of existing ranges.
|
||||
pub fn remove_overlapping_with(&mut self, other: &KeySpace) {
|
||||
/// Returns the removed keyspace
|
||||
pub fn remove_overlapping_with(&mut self, other: &KeySpace) -> KeySpace {
|
||||
let (self_start, self_end) = match (self.start(), self.end()) {
|
||||
(Some(start), Some(end)) => (start, end),
|
||||
_ => {
|
||||
// self is empty
|
||||
return;
|
||||
return KeySpace::default();
|
||||
}
|
||||
};
|
||||
|
||||
@@ -112,30 +113,37 @@ impl KeySpace {
|
||||
.skip_while(|range| self_start >= range.end)
|
||||
.take_while(|range| self_end > range.start);
|
||||
|
||||
let mut removed_accum = KeySpaceRandomAccum::new();
|
||||
for range in other_ranges {
|
||||
while let Some(overlap_at) = self.overlaps_at(range) {
|
||||
let overlapped = self.ranges[overlap_at].clone();
|
||||
|
||||
if overlapped.start < range.start && overlapped.end <= range.end {
|
||||
// Higher part of the range is completely overlapped.
|
||||
removed_accum.add_range(range.start..self.ranges[overlap_at].end);
|
||||
self.ranges[overlap_at].end = range.start;
|
||||
}
|
||||
if overlapped.start >= range.start && overlapped.end > range.end {
|
||||
// Lower part of the range is completely overlapped.
|
||||
removed_accum.add_range(self.ranges[overlap_at].start..range.end);
|
||||
self.ranges[overlap_at].start = range.end;
|
||||
}
|
||||
if overlapped.start < range.start && overlapped.end > range.end {
|
||||
// Middle part of the range is overlapped.
|
||||
removed_accum.add_range(range.clone());
|
||||
self.ranges[overlap_at].end = range.start;
|
||||
self.ranges
|
||||
.insert(overlap_at + 1, range.end..overlapped.end);
|
||||
}
|
||||
if overlapped.start >= range.start && overlapped.end <= range.end {
|
||||
// Whole range is overlapped
|
||||
removed_accum.add_range(self.ranges[overlap_at].clone());
|
||||
self.ranges.remove(overlap_at);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
removed_accum.to_keyspace()
|
||||
}
|
||||
|
||||
pub fn start(&self) -> Option<Key> {
|
||||
@@ -553,7 +561,16 @@ mod tests {
|
||||
Key::from_i128(11)..Key::from_i128(13),
|
||||
],
|
||||
};
|
||||
key_space1.remove_overlapping_with(&key_space2);
|
||||
let removed = key_space1.remove_overlapping_with(&key_space2);
|
||||
let removed_expected = KeySpace {
|
||||
ranges: vec![
|
||||
Key::from_i128(2)..Key::from_i128(3),
|
||||
Key::from_i128(6)..Key::from_i128(7),
|
||||
Key::from_i128(11)..Key::from_i128(12),
|
||||
],
|
||||
};
|
||||
assert_eq!(removed, removed_expected);
|
||||
|
||||
assert_eq!(
|
||||
key_space1.ranges,
|
||||
vec![
|
||||
@@ -583,7 +600,17 @@ mod tests {
|
||||
Key::from_i128(14)..Key::from_i128(17),
|
||||
],
|
||||
};
|
||||
key_space1.remove_overlapping_with(&key_space2);
|
||||
|
||||
let removed = key_space1.remove_overlapping_with(&key_space2);
|
||||
let removed_expected = KeySpace {
|
||||
ranges: vec![
|
||||
Key::from_i128(3)..Key::from_i128(5),
|
||||
Key::from_i128(8)..Key::from_i128(10),
|
||||
Key::from_i128(14)..Key::from_i128(15),
|
||||
],
|
||||
};
|
||||
assert_eq!(removed, removed_expected);
|
||||
|
||||
assert_eq!(
|
||||
key_space1.ranges,
|
||||
vec![
|
||||
@@ -610,7 +637,11 @@ mod tests {
|
||||
Key::from_i128(15)..Key::from_i128(17),
|
||||
],
|
||||
};
|
||||
key_space1.remove_overlapping_with(&key_space2);
|
||||
|
||||
let removed = key_space1.remove_overlapping_with(&key_space2);
|
||||
let removed_expected = KeySpace::default();
|
||||
assert_eq!(removed, removed_expected);
|
||||
|
||||
assert_eq!(
|
||||
key_space1.ranges,
|
||||
vec![
|
||||
@@ -637,7 +668,17 @@ mod tests {
|
||||
let key_space2 = KeySpace {
|
||||
ranges: vec![Key::from_i128(9)..Key::from_i128(19)],
|
||||
};
|
||||
key_space1.remove_overlapping_with(&key_space2);
|
||||
|
||||
let removed = key_space1.remove_overlapping_with(&key_space2);
|
||||
let removed_expected = KeySpace {
|
||||
ranges: vec![
|
||||
Key::from_i128(9)..Key::from_i128(10),
|
||||
Key::from_i128(12)..Key::from_i128(15),
|
||||
Key::from_i128(17)..Key::from_i128(19),
|
||||
],
|
||||
};
|
||||
assert_eq!(removed, removed_expected);
|
||||
|
||||
assert_eq!(
|
||||
key_space1.ranges,
|
||||
vec![
|
||||
|
||||
@@ -429,6 +429,7 @@ pub struct StatusResponse {
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
#[serde(deny_unknown_fields)]
|
||||
pub struct TenantLocationConfigRequest {
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub tenant_id: Option<TenantShardId>,
|
||||
#[serde(flatten)]
|
||||
pub config: LocationConfig, // as we have a flattened field, we should reject all unknown fields in it
|
||||
@@ -747,10 +748,18 @@ pub struct TimelineGcRequest {
|
||||
pub gc_horizon: Option<u64>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct WalRedoManagerProcessStatus {
|
||||
pub pid: u32,
|
||||
/// The strum-generated `into::<&'static str>()` for `pageserver::walredo::ProcessKind`.
|
||||
/// `ProcessKind` are a transitory thing, so, they have no enum representation in `pageserver_api`.
|
||||
pub kind: Cow<'static, str>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct WalRedoManagerStatus {
|
||||
pub last_redo_at: Option<chrono::DateTime<chrono::Utc>>,
|
||||
pub pid: Option<u32>,
|
||||
pub process: Option<WalRedoManagerProcessStatus>,
|
||||
}
|
||||
|
||||
/// The progress of a secondary tenant is mostly useful when doing a long running download: e.g. initiating
|
||||
|
||||
@@ -5,15 +5,93 @@ use crate::{
|
||||
models::ShardParameters,
|
||||
};
|
||||
use hex::FromHex;
|
||||
use postgres_ffi::relfile_utils::INIT_FORKNUM;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use utils::id::TenantId;
|
||||
|
||||
/// See docs/rfcs/031-sharding-static.md for an overview of sharding.
|
||||
///
|
||||
/// This module contains a variety of types used to represent the concept of sharding
|
||||
/// a Neon tenant across multiple physical shards. Since there are quite a few of these,
|
||||
/// we provide a summary here.
|
||||
///
|
||||
/// Types used to describe shards:
|
||||
/// - [`ShardCount`] describes how many shards make up a tenant, plus the magic `unsharded` value
|
||||
/// which identifies a tenant which is not shard-aware. This means its storage paths do not include
|
||||
/// a shard suffix.
|
||||
/// - [`ShardNumber`] is simply the zero-based index of a shard within a tenant.
|
||||
/// - [`ShardIndex`] is the 2-tuple of `ShardCount` and `ShardNumber`, it's just like a `TenantShardId`
|
||||
/// without the tenant ID. This is useful for things that are implicitly scoped to a particular
|
||||
/// tenant, such as layer files.
|
||||
/// - [`ShardIdentity`] is the full description of a particular shard's parameters, in sufficient
|
||||
/// detail to convert a [`Key`] to a [`ShardNumber`] when deciding where to write/read.
|
||||
/// - The [`ShardSlug`] is a terse formatter for ShardCount and ShardNumber, written as
|
||||
/// four hex digits. An unsharded tenant is `0000`.
|
||||
/// - [`TenantShardId`] is the unique ID of a particular shard within a particular tenant
|
||||
///
|
||||
/// Types used to describe the parameters for data distribution in a sharded tenant:
|
||||
/// - [`ShardStripeSize`] controls how long contiguous runs of [`Key`]s (stripes) are when distributed across
|
||||
/// multiple shards. Its value is given in 8kiB pages.
|
||||
/// - [`ShardLayout`] describes the data distribution scheme, and at time of writing is
|
||||
/// always zero: this is provided for future upgrades that might introduce different
|
||||
/// data distribution schemes.
|
||||
///
|
||||
/// Examples:
|
||||
/// - A legacy unsharded tenant has one shard with ShardCount(0), ShardNumber(0), and its slug is 0000
|
||||
/// - A single sharded tenant has one shard with ShardCount(1), ShardNumber(0), and its slug is 0001
|
||||
/// - In a tenant with 4 shards, each shard has ShardCount(N), ShardNumber(i) where i in 0..N-1 (inclusive),
|
||||
/// and their slugs are 0004, 0104, 0204, and 0304.
|
||||
|
||||
#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
|
||||
pub struct ShardNumber(pub u8);
|
||||
|
||||
#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
|
||||
pub struct ShardCount(u8);
|
||||
|
||||
/// Combination of ShardNumber and ShardCount. For use within the context of a particular tenant,
|
||||
/// when we need to know which shard we're dealing with, but do not need to know the full
|
||||
/// ShardIdentity (because we won't be doing any page->shard mapping), and do not need to know
|
||||
/// the fully qualified TenantShardId.
|
||||
#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
|
||||
pub struct ShardIndex {
|
||||
pub shard_number: ShardNumber,
|
||||
pub shard_count: ShardCount,
|
||||
}
|
||||
|
||||
/// The ShardIdentity contains enough information to map a [`Key`] to a [`ShardNumber`],
|
||||
/// and to check whether that [`ShardNumber`] is the same as the current shard.
|
||||
#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
|
||||
pub struct ShardIdentity {
|
||||
pub number: ShardNumber,
|
||||
pub count: ShardCount,
|
||||
pub stripe_size: ShardStripeSize,
|
||||
layout: ShardLayout,
|
||||
}
|
||||
|
||||
/// Formatting helper, for generating the `shard_id` label in traces.
|
||||
struct ShardSlug<'a>(&'a TenantShardId);
|
||||
|
||||
/// TenantShardId globally identifies a particular shard in a particular tenant.
|
||||
///
|
||||
/// These are written as `<TenantId>-<ShardSlug>`, for example:
|
||||
/// # The second shard in a two-shard tenant
|
||||
/// 072f1291a5310026820b2fe4b2968934-0102
|
||||
///
|
||||
/// If the `ShardCount` is _unsharded_, the `TenantShardId` is written without
|
||||
/// a shard suffix and is equivalent to the encoding of a `TenantId`: this enables
|
||||
/// an unsharded [`TenantShardId`] to be used interchangeably with a [`TenantId`].
|
||||
///
|
||||
/// The human-readable encoding of an unsharded TenantShardId, such as used in API URLs,
|
||||
/// is both forward and backward compatible with TenantId: a legacy TenantId can be
|
||||
/// decoded as a TenantShardId, and when re-encoded it will be parseable
|
||||
/// as a TenantId.
|
||||
#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
|
||||
pub struct TenantShardId {
|
||||
pub tenant_id: TenantId,
|
||||
pub shard_number: ShardNumber,
|
||||
pub shard_count: ShardCount,
|
||||
}
|
||||
|
||||
impl ShardCount {
|
||||
pub const MAX: Self = Self(u8::MAX);
|
||||
|
||||
@@ -38,6 +116,7 @@ impl ShardCount {
|
||||
self.0
|
||||
}
|
||||
|
||||
///
|
||||
pub fn is_unsharded(&self) -> bool {
|
||||
self.0 == 0
|
||||
}
|
||||
@@ -53,33 +132,6 @@ impl ShardNumber {
|
||||
pub const MAX: Self = Self(u8::MAX);
|
||||
}
|
||||
|
||||
/// TenantShardId identify the units of work for the Pageserver.
|
||||
///
|
||||
/// These are written as `<tenant_id>-<shard number><shard-count>`, for example:
|
||||
///
|
||||
/// # The second shard in a two-shard tenant
|
||||
/// 072f1291a5310026820b2fe4b2968934-0102
|
||||
///
|
||||
/// Historically, tenants could not have multiple shards, and were identified
|
||||
/// by TenantId. To support this, TenantShardId has a special legacy
|
||||
/// mode where `shard_count` is equal to zero: this represents a single-sharded
|
||||
/// tenant which should be written as a TenantId with no suffix.
|
||||
///
|
||||
/// The human-readable encoding of TenantShardId, such as used in API URLs,
|
||||
/// is both forward and backward compatible: a legacy TenantId can be
|
||||
/// decoded as a TenantShardId, and when re-encoded it will be parseable
|
||||
/// as a TenantId.
|
||||
///
|
||||
/// Note that the binary encoding is _not_ backward compatible, because
|
||||
/// at the time sharding is introduced, there are no existing binary structures
|
||||
/// containing TenantId that we need to handle.
|
||||
#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
|
||||
pub struct TenantShardId {
|
||||
pub tenant_id: TenantId,
|
||||
pub shard_number: ShardNumber,
|
||||
pub shard_count: ShardCount,
|
||||
}
|
||||
|
||||
impl TenantShardId {
|
||||
pub fn unsharded(tenant_id: TenantId) -> Self {
|
||||
Self {
|
||||
@@ -111,10 +163,13 @@ impl TenantShardId {
|
||||
}
|
||||
|
||||
/// Convenience for code that has special behavior on the 0th shard.
|
||||
pub fn is_zero(&self) -> bool {
|
||||
pub fn is_shard_zero(&self) -> bool {
|
||||
self.shard_number == ShardNumber(0)
|
||||
}
|
||||
|
||||
/// The "unsharded" value is distinct from simply having a single shard: it represents
|
||||
/// a tenant which is not shard-aware at all, and whose storage paths will not include
|
||||
/// a shard suffix.
|
||||
pub fn is_unsharded(&self) -> bool {
|
||||
self.shard_number == ShardNumber(0) && self.shard_count.is_unsharded()
|
||||
}
|
||||
@@ -150,9 +205,6 @@ impl TenantShardId {
|
||||
}
|
||||
}
|
||||
|
||||
/// Formatting helper
|
||||
struct ShardSlug<'a>(&'a TenantShardId);
|
||||
|
||||
impl<'a> std::fmt::Display for ShardSlug<'a> {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(
|
||||
@@ -222,16 +274,6 @@ impl From<[u8; 18]> for TenantShardId {
|
||||
}
|
||||
}
|
||||
|
||||
/// For use within the context of a particular tenant, when we need to know which
|
||||
/// shard we're dealing with, but do not need to know the full ShardIdentity (because
|
||||
/// we won't be doing any page->shard mapping), and do not need to know the fully qualified
|
||||
/// TenantShardId.
|
||||
#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
|
||||
pub struct ShardIndex {
|
||||
pub shard_number: ShardNumber,
|
||||
pub shard_count: ShardCount,
|
||||
}
|
||||
|
||||
impl ShardIndex {
|
||||
pub fn new(number: ShardNumber, count: ShardCount) -> Self {
|
||||
Self {
|
||||
@@ -246,6 +288,9 @@ impl ShardIndex {
|
||||
}
|
||||
}
|
||||
|
||||
/// The "unsharded" value is distinct from simply having a single shard: it represents
|
||||
/// a tenant which is not shard-aware at all, and whose storage paths will not include
|
||||
/// a shard suffix.
|
||||
pub fn is_unsharded(&self) -> bool {
|
||||
self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0)
|
||||
}
|
||||
@@ -313,6 +358,8 @@ impl Serialize for TenantShardId {
|
||||
if serializer.is_human_readable() {
|
||||
serializer.collect_str(self)
|
||||
} else {
|
||||
// Note: while human encoding of [`TenantShardId`] is backward and forward
|
||||
// compatible, this binary encoding is not.
|
||||
let mut packed: [u8; 18] = [0; 18];
|
||||
packed[0..16].clone_from_slice(&self.tenant_id.as_arr());
|
||||
packed[16] = self.shard_number.0;
|
||||
@@ -390,16 +437,6 @@ const LAYOUT_BROKEN: ShardLayout = ShardLayout(255);
|
||||
/// Default stripe size in pages: 256MiB divided by 8kiB page size.
|
||||
const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8);
|
||||
|
||||
/// The ShardIdentity contains the information needed for one member of map
|
||||
/// to resolve a key to a shard, and then check whether that shard is ==self.
|
||||
#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
|
||||
pub struct ShardIdentity {
|
||||
pub number: ShardNumber,
|
||||
pub count: ShardCount,
|
||||
pub stripe_size: ShardStripeSize,
|
||||
layout: ShardLayout,
|
||||
}
|
||||
|
||||
#[derive(thiserror::Error, Debug, PartialEq, Eq)]
|
||||
pub enum ShardConfigError {
|
||||
#[error("Invalid shard count")]
|
||||
@@ -439,6 +476,9 @@ impl ShardIdentity {
|
||||
}
|
||||
}
|
||||
|
||||
/// The "unsharded" value is distinct from simply having a single shard: it represents
|
||||
/// a tenant which is not shard-aware at all, and whose storage paths will not include
|
||||
/// a shard suffix.
|
||||
pub fn is_unsharded(&self) -> bool {
|
||||
self.number == ShardNumber(0) && self.count == ShardCount(0)
|
||||
}
|
||||
@@ -487,6 +527,8 @@ impl ShardIdentity {
|
||||
}
|
||||
|
||||
/// Return true if the key should be ingested by this shard
|
||||
///
|
||||
/// Shards must ingest _at least_ keys which return true from this check.
|
||||
pub fn is_key_local(&self, key: &Key) -> bool {
|
||||
assert!(!self.is_broken());
|
||||
if self.count < ShardCount(2) || (key_is_shard0(key) && self.number == ShardNumber(0)) {
|
||||
@@ -496,8 +538,28 @@ impl ShardIdentity {
|
||||
}
|
||||
}
|
||||
|
||||
/// Special case for issue `<https://github.com/neondatabase/neon/issues/7451>`
|
||||
///
|
||||
/// When we fail to read a forknum block, this function tells us whether we may ignore the error
|
||||
/// as a symptom of that issue.
|
||||
pub fn is_key_buggy_forknum(&self, key: &Key) -> bool {
|
||||
if !is_rel_block_key(key) || key.field5 != INIT_FORKNUM {
|
||||
return false;
|
||||
}
|
||||
|
||||
let mut hash = murmurhash32(key.field4);
|
||||
hash = hash_combine(hash, murmurhash32(key.field6 / self.stripe_size.0));
|
||||
let mapped_shard = ShardNumber((hash % self.count.0 as u32) as u8);
|
||||
|
||||
// The key may be affected by issue #7454: it is an initfork and it would not
|
||||
// have mapped to shard 0 until we fixed that issue.
|
||||
mapped_shard != ShardNumber(0)
|
||||
}
|
||||
|
||||
/// Return true if the key should be discarded if found in this shard's
|
||||
/// data store, e.g. during compaction after a split
|
||||
/// data store, e.g. during compaction after a split.
|
||||
///
|
||||
/// Shards _may_ drop keys which return false here, but are not obliged to.
|
||||
pub fn is_key_disposable(&self, key: &Key) -> bool {
|
||||
if key_is_shard0(key) {
|
||||
// Q: Why can't we dispose of shard0 content if we're not shard 0?
|
||||
@@ -523,7 +585,7 @@ impl ShardIdentity {
|
||||
|
||||
/// Convenience for checking if this identity is the 0th shard in a tenant,
|
||||
/// for special cases on shard 0 such as ingesting relation sizes.
|
||||
pub fn is_zero(&self) -> bool {
|
||||
pub fn is_shard_zero(&self) -> bool {
|
||||
self.number == ShardNumber(0)
|
||||
}
|
||||
}
|
||||
@@ -606,7 +668,13 @@ fn key_is_shard0(key: &Key) -> bool {
|
||||
// relation pages are distributed to shards other than shard zero. Everything else gets
|
||||
// stored on shard 0. This guarantees that shard 0 can independently serve basebackup
|
||||
// requests, and any request other than those for particular blocks in relations.
|
||||
!is_rel_block_key(key)
|
||||
//
|
||||
// The only exception to this rule is "initfork" data -- this relates to postgres's UNLOGGED table
|
||||
// type. These are special relations, usually with only 0 or 1 blocks, and we store them on shard 0
|
||||
// because they must be included in basebackups.
|
||||
let is_initfork = key.field5 == INIT_FORKNUM;
|
||||
|
||||
!is_rel_block_key(key) || is_initfork
|
||||
}
|
||||
|
||||
/// Provide the same result as the function in postgres `hashfn.h` with the same name
|
||||
|
||||
@@ -118,7 +118,9 @@ pub use v14::bindings::{TimeLineID, TimestampTz, XLogRecPtr, XLogSegNo};
|
||||
// Likewise for these, although the assumption that these don't change is a little more iffy.
|
||||
pub use v14::bindings::{MultiXactOffset, MultiXactStatus};
|
||||
pub use v14::bindings::{PageHeaderData, XLogRecord};
|
||||
pub use v14::xlog_utils::{XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD};
|
||||
pub use v14::xlog_utils::{
|
||||
XLOG_SIZE_OF_XLOG_LONG_PHD, XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD,
|
||||
};
|
||||
|
||||
pub use v14::bindings::{CheckPoint, ControlFileData};
|
||||
|
||||
|
||||
@@ -4,7 +4,9 @@ use log::*;
|
||||
use postgres::types::PgLsn;
|
||||
use postgres::Client;
|
||||
use postgres_ffi::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ};
|
||||
use postgres_ffi::{XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD};
|
||||
use postgres_ffi::{
|
||||
XLOG_SIZE_OF_XLOG_LONG_PHD, XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD,
|
||||
};
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::process::Command;
|
||||
use std::time::{Duration, Instant};
|
||||
@@ -262,11 +264,21 @@ fn craft_internal<C: postgres::GenericClient>(
|
||||
intermediate_lsns.insert(0, initial_lsn);
|
||||
}
|
||||
|
||||
// Some records may be not flushed, e.g. non-transactional logical messages.
|
||||
// Some records may be not flushed, e.g. non-transactional logical messages. Flush now.
|
||||
//
|
||||
// Note: this is broken if pg_current_wal_insert_lsn is at page boundary
|
||||
// because pg_current_wal_insert_lsn skips page headers.
|
||||
client.execute("select neon_xlogflush(pg_current_wal_insert_lsn())", &[])?;
|
||||
// If the previous WAL record ended exactly at page boundary, pg_current_wal_insert_lsn
|
||||
// returns the position just after the page header on the next page. That's where the next
|
||||
// record will be inserted. But the page header hasn't actually been written to the WAL
|
||||
// yet, and if you try to flush it, you get a "request to flush past end of generated WAL"
|
||||
// error. Because of that, if the insert location is just after a page header, back off to
|
||||
// previous page boundary.
|
||||
let mut lsn = u64::from(client.pg_current_wal_insert_lsn()?);
|
||||
if lsn % WAL_SEGMENT_SIZE as u64 == XLOG_SIZE_OF_XLOG_LONG_PHD as u64 {
|
||||
lsn -= XLOG_SIZE_OF_XLOG_LONG_PHD as u64;
|
||||
} else if lsn % XLOG_BLCKSZ as u64 == XLOG_SIZE_OF_XLOG_SHORT_PHD as u64 {
|
||||
lsn -= XLOG_SIZE_OF_XLOG_SHORT_PHD as u64;
|
||||
}
|
||||
client.execute("select neon_xlogflush($1)", &[&PgLsn::from(lsn)])?;
|
||||
Ok(intermediate_lsns)
|
||||
}
|
||||
|
||||
@@ -320,38 +332,49 @@ impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary {
|
||||
|
||||
client.execute("CREATE table t(x int)", &[])?;
|
||||
|
||||
// Add padding so the XLOG_SWITCH record ends exactly on XLOG_BLCKSZ boundary.
|
||||
// We will use logical message as the padding. We start with detecting how much WAL
|
||||
// it takes for one logical message, considering all alignments and headers.
|
||||
let base_wal_advance = {
|
||||
// Add padding so the XLOG_SWITCH record ends exactly on XLOG_BLCKSZ boundary. We
|
||||
// will use carefully-sized logical messages to advance WAL insert location such
|
||||
// that there is just enough space on the page for the XLOG_SWITCH record.
|
||||
loop {
|
||||
// We start with measuring how much WAL it takes for one logical message,
|
||||
// considering all alignments and headers.
|
||||
let before_lsn = client.pg_current_wal_insert_lsn()?;
|
||||
// Small non-empty message bigger than few bytes is more likely than an empty
|
||||
// message to have the same format as the big padding message.
|
||||
client.execute(
|
||||
"SELECT pg_logical_emit_message(false, 'swch', REPEAT('a', 10))",
|
||||
&[],
|
||||
)?;
|
||||
// The XLOG_SWITCH record has no data => its size is exactly XLOG_SIZE_OF_XLOG_RECORD.
|
||||
(u64::from(client.pg_current_wal_insert_lsn()?) - u64::from(before_lsn)) as usize
|
||||
+ XLOG_SIZE_OF_XLOG_RECORD
|
||||
};
|
||||
let mut remaining_lsn =
|
||||
XLOG_BLCKSZ - u64::from(client.pg_current_wal_insert_lsn()?) as usize % XLOG_BLCKSZ;
|
||||
if remaining_lsn < base_wal_advance {
|
||||
remaining_lsn += XLOG_BLCKSZ;
|
||||
let after_lsn = client.pg_current_wal_insert_lsn()?;
|
||||
|
||||
// Did the record cross a page boundary? If it did, start over. Crossing a
|
||||
// page boundary adds to the apparent size of the record because of the page
|
||||
// header, which throws off the calculation.
|
||||
if u64::from(before_lsn) / XLOG_BLCKSZ as u64
|
||||
!= u64::from(after_lsn) / XLOG_BLCKSZ as u64
|
||||
{
|
||||
continue;
|
||||
}
|
||||
// base_size is the size of a logical message without the payload
|
||||
let base_size = u64::from(after_lsn) - u64::from(before_lsn) - 10;
|
||||
|
||||
// Is there enough space on the page for another logical message and an
|
||||
// XLOG_SWITCH? If not, start over.
|
||||
let page_remain = XLOG_BLCKSZ as u64 - u64::from(after_lsn) % XLOG_BLCKSZ as u64;
|
||||
if page_remain < base_size - XLOG_SIZE_OF_XLOG_RECORD as u64 {
|
||||
continue;
|
||||
}
|
||||
|
||||
// We will write another logical message, such that after the logical message
|
||||
// record, there will be space for exactly one XLOG_SWITCH. How large should
|
||||
// the logical message's payload be? An XLOG_SWITCH record has no data => its
|
||||
// size is exactly XLOG_SIZE_OF_XLOG_RECORD.
|
||||
let repeats = page_remain - base_size - XLOG_SIZE_OF_XLOG_RECORD as u64;
|
||||
|
||||
client.execute(
|
||||
"SELECT pg_logical_emit_message(false, 'swch', REPEAT('a', $1))",
|
||||
&[&(repeats as i32)],
|
||||
)?;
|
||||
break;
|
||||
}
|
||||
let repeats = 10 + remaining_lsn - base_wal_advance;
|
||||
info!(
|
||||
"current_wal_insert_lsn={}, remaining_lsn={}, base_wal_advance={}, repeats={}",
|
||||
client.pg_current_wal_insert_lsn()?,
|
||||
remaining_lsn,
|
||||
base_wal_advance,
|
||||
repeats
|
||||
);
|
||||
client.execute(
|
||||
"SELECT pg_logical_emit_message(false, 'swch', REPEAT('a', $1))",
|
||||
&[&(repeats as i32)],
|
||||
)?;
|
||||
info!(
|
||||
"current_wal_insert_lsn={}, XLOG_SIZE_OF_XLOG_RECORD={}",
|
||||
client.pg_current_wal_insert_lsn()?,
|
||||
|
||||
@@ -134,6 +134,11 @@ impl RemotePath {
|
||||
pub fn strip_prefix(&self, p: &RemotePath) -> Result<&Utf8Path, std::path::StripPrefixError> {
|
||||
self.0.strip_prefix(&p.0)
|
||||
}
|
||||
|
||||
pub fn add_trailing_slash(&self) -> Self {
|
||||
// Unwrap safety inputs are guararnteed to be valid UTF-8
|
||||
Self(format!("{}/", self.0).try_into().unwrap())
|
||||
}
|
||||
}
|
||||
|
||||
/// We don't need callers to be able to pass arbitrary delimiters: just control
|
||||
@@ -157,47 +162,21 @@ pub struct Listing {
|
||||
/// providing basic CRUD operations for storage files.
|
||||
#[allow(async_fn_in_trait)]
|
||||
pub trait RemoteStorage: Send + Sync + 'static {
|
||||
/// Lists all top level subdirectories for a given prefix
|
||||
/// Note: here we assume that if the prefix is passed it was obtained via remote_object_id
|
||||
/// which already takes into account any kind of global prefix (prefix_in_bucket for S3 or storage_root for LocalFS)
|
||||
/// so this method doesnt need to.
|
||||
async fn list_prefixes(
|
||||
&self,
|
||||
prefix: Option<&RemotePath>,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<Vec<RemotePath>, DownloadError> {
|
||||
let result = self
|
||||
.list(prefix, ListingMode::WithDelimiter, None, cancel)
|
||||
.await?
|
||||
.prefixes;
|
||||
Ok(result)
|
||||
}
|
||||
/// Lists all files in directory "recursively"
|
||||
/// (not really recursively, because AWS has a flat namespace)
|
||||
/// Note: This is subtely different than list_prefixes,
|
||||
/// because it is for listing files instead of listing
|
||||
/// names sharing common prefixes.
|
||||
/// For example,
|
||||
/// list_files("foo/bar") = ["foo/bar/cat123.txt",
|
||||
/// "foo/bar/cat567.txt", "foo/bar/dog123.txt", "foo/bar/dog456.txt"]
|
||||
/// whereas,
|
||||
/// list_prefixes("foo/bar/") = ["cat", "dog"]
|
||||
/// See `test_real_s3.rs` for more details.
|
||||
/// List objects in remote storage, with semantics matching AWS S3's ListObjectsV2.
|
||||
/// (see `<https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html>`)
|
||||
///
|
||||
/// Note that the prefix is relative to any `prefix_in_bucket` configured for the client, not
|
||||
/// from the absolute root of the bucket.
|
||||
///
|
||||
/// `mode` configures whether to use a delimiter. Without a delimiter all keys
|
||||
/// within the prefix are listed in the `keys` of the result. With a delimiter, any "directories" at the top level of
|
||||
/// the prefix are returned in the `prefixes` of the result, and keys in the top level of the prefix are
|
||||
/// returned in `keys`.
|
||||
///
|
||||
/// `max_keys` controls the maximum number of keys that will be returned. If this is None, this function
|
||||
/// will iteratively call listobjects until it runs out of keys. Note that this is not safe to use on
|
||||
/// unlimited size buckets, as the full list of objects is allocated into a monolithic data structure.
|
||||
///
|
||||
/// max_keys limits max number of keys returned; None means unlimited.
|
||||
async fn list_files(
|
||||
&self,
|
||||
prefix: Option<&RemotePath>,
|
||||
max_keys: Option<NonZeroU32>,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<Vec<RemotePath>, DownloadError> {
|
||||
let result = self
|
||||
.list(prefix, ListingMode::NoDelimiter, max_keys, cancel)
|
||||
.await?
|
||||
.keys;
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
async fn list(
|
||||
&self,
|
||||
prefix: Option<&RemotePath>,
|
||||
@@ -336,41 +315,6 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
|
||||
}
|
||||
}
|
||||
|
||||
// A function for listing all the files in a "directory"
|
||||
// Example:
|
||||
// list_files("foo/bar") = ["foo/bar/a.txt", "foo/bar/b.txt"]
|
||||
//
|
||||
// max_keys limits max number of keys returned; None means unlimited.
|
||||
pub async fn list_files(
|
||||
&self,
|
||||
folder: Option<&RemotePath>,
|
||||
max_keys: Option<NonZeroU32>,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<Vec<RemotePath>, DownloadError> {
|
||||
match self {
|
||||
Self::LocalFs(s) => s.list_files(folder, max_keys, cancel).await,
|
||||
Self::AwsS3(s) => s.list_files(folder, max_keys, cancel).await,
|
||||
Self::AzureBlob(s) => s.list_files(folder, max_keys, cancel).await,
|
||||
Self::Unreliable(s) => s.list_files(folder, max_keys, cancel).await,
|
||||
}
|
||||
}
|
||||
|
||||
// lists common *prefixes*, if any of files
|
||||
// Example:
|
||||
// list_prefixes("foo123","foo567","bar123","bar432") = ["foo", "bar"]
|
||||
pub async fn list_prefixes(
|
||||
&self,
|
||||
prefix: Option<&RemotePath>,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<Vec<RemotePath>, DownloadError> {
|
||||
match self {
|
||||
Self::LocalFs(s) => s.list_prefixes(prefix, cancel).await,
|
||||
Self::AwsS3(s) => s.list_prefixes(prefix, cancel).await,
|
||||
Self::AzureBlob(s) => s.list_prefixes(prefix, cancel).await,
|
||||
Self::Unreliable(s) => s.list_prefixes(prefix, cancel).await,
|
||||
}
|
||||
}
|
||||
|
||||
/// See [`RemoteStorage::upload`]
|
||||
pub async fn upload(
|
||||
&self,
|
||||
|
||||
@@ -5,11 +5,9 @@
|
||||
//! volume is mounted to the local FS.
|
||||
|
||||
use std::{
|
||||
borrow::Cow,
|
||||
future::Future,
|
||||
collections::HashSet,
|
||||
io::ErrorKind,
|
||||
num::NonZeroU32,
|
||||
pin::Pin,
|
||||
time::{Duration, SystemTime, UNIX_EPOCH},
|
||||
};
|
||||
|
||||
@@ -22,11 +20,11 @@ use tokio::{
|
||||
io::{self, AsyncReadExt, AsyncSeekExt, AsyncWriteExt},
|
||||
};
|
||||
use tokio_util::{io::ReaderStream, sync::CancellationToken};
|
||||
use tracing::*;
|
||||
use utils::{crashsafe::path_with_suffix_extension, fs_ext::is_directory_empty};
|
||||
use utils::crashsafe::path_with_suffix_extension;
|
||||
|
||||
use crate::{
|
||||
Download, DownloadError, Listing, ListingMode, RemotePath, TimeTravelError, TimeoutOrCancel,
|
||||
REMOTE_STORAGE_PREFIX_SEPARATOR,
|
||||
};
|
||||
|
||||
use super::{RemoteStorage, StorageMetadata};
|
||||
@@ -93,7 +91,47 @@ impl LocalFs {
|
||||
|
||||
#[cfg(test)]
|
||||
async fn list_all(&self) -> anyhow::Result<Vec<RemotePath>> {
|
||||
Ok(get_all_files(&self.storage_root, true)
|
||||
use std::{future::Future, pin::Pin};
|
||||
fn get_all_files<'a, P>(
|
||||
directory_path: P,
|
||||
) -> Pin<Box<dyn Future<Output = anyhow::Result<Vec<Utf8PathBuf>>> + Send + Sync + 'a>>
|
||||
where
|
||||
P: AsRef<Utf8Path> + Send + Sync + 'a,
|
||||
{
|
||||
Box::pin(async move {
|
||||
let directory_path = directory_path.as_ref();
|
||||
if directory_path.exists() {
|
||||
if directory_path.is_dir() {
|
||||
let mut paths = Vec::new();
|
||||
let mut dir_contents = fs::read_dir(directory_path).await?;
|
||||
while let Some(dir_entry) = dir_contents.next_entry().await? {
|
||||
let file_type = dir_entry.file_type().await?;
|
||||
let entry_path =
|
||||
Utf8PathBuf::from_path_buf(dir_entry.path()).map_err(|pb| {
|
||||
anyhow::Error::msg(format!(
|
||||
"non-Unicode path: {}",
|
||||
pb.to_string_lossy()
|
||||
))
|
||||
})?;
|
||||
if file_type.is_symlink() {
|
||||
tracing::debug!("{entry_path:?} is a symlink, skipping")
|
||||
} else if file_type.is_dir() {
|
||||
paths.extend(get_all_files(&entry_path).await?.into_iter())
|
||||
} else {
|
||||
paths.push(entry_path);
|
||||
}
|
||||
}
|
||||
Ok(paths)
|
||||
} else {
|
||||
bail!("Path {directory_path:?} is not a directory")
|
||||
}
|
||||
} else {
|
||||
Ok(Vec::new())
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
Ok(get_all_files(&self.storage_root)
|
||||
.await?
|
||||
.into_iter()
|
||||
.map(|path| {
|
||||
@@ -120,6 +158,14 @@ impl LocalFs {
|
||||
// S3 object list prefixes can be arbitrary strings, but when reading
|
||||
// the local filesystem we need a directory to start calling read_dir on.
|
||||
let mut initial_dir = full_path.clone();
|
||||
|
||||
// If there's no trailing slash, we have to start looking from one above: even if
|
||||
// `initial_dir` is a directory, we should still list any prefixes in the parent
|
||||
// that start with the same string.
|
||||
if !full_path.to_string().ends_with('/') {
|
||||
initial_dir.pop();
|
||||
}
|
||||
|
||||
loop {
|
||||
// Did we make it to the root?
|
||||
if initial_dir.parent().is_none() {
|
||||
@@ -295,61 +341,66 @@ impl RemoteStorage for LocalFs {
|
||||
let op = async {
|
||||
let mut result = Listing::default();
|
||||
|
||||
if let ListingMode::NoDelimiter = mode {
|
||||
let keys = self
|
||||
.list_recursive(prefix)
|
||||
.await
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
result.keys = keys
|
||||
.into_iter()
|
||||
.filter(|k| {
|
||||
let path = k.with_base(&self.storage_root);
|
||||
!path.is_dir()
|
||||
})
|
||||
.collect();
|
||||
|
||||
if let Some(max_keys) = max_keys {
|
||||
result.keys.truncate(max_keys.get() as usize);
|
||||
}
|
||||
|
||||
return Ok(result);
|
||||
}
|
||||
|
||||
let path = match prefix {
|
||||
Some(prefix) => Cow::Owned(prefix.with_base(&self.storage_root)),
|
||||
None => Cow::Borrowed(&self.storage_root),
|
||||
};
|
||||
|
||||
let prefixes_to_filter = get_all_files(path.as_ref(), false)
|
||||
// Filter out directories: in S3 directories don't exist, only the keys within them do.
|
||||
let keys = self
|
||||
.list_recursive(prefix)
|
||||
.await
|
||||
.map_err(DownloadError::Other)?;
|
||||
let keys = keys
|
||||
.into_iter()
|
||||
.filter(|k| {
|
||||
let path = k.with_base(&self.storage_root);
|
||||
!path.is_dir()
|
||||
})
|
||||
.collect();
|
||||
|
||||
// filter out empty directories to mirror s3 behavior.
|
||||
for prefix in prefixes_to_filter {
|
||||
if prefix.is_dir()
|
||||
&& is_directory_empty(&prefix)
|
||||
.await
|
||||
.map_err(DownloadError::Other)?
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
let stripped = prefix
|
||||
.strip_prefix(&self.storage_root)
|
||||
.context("Failed to strip prefix")
|
||||
.and_then(RemotePath::new)
|
||||
.expect(
|
||||
"We list files for storage root, hence should be able to remote the prefix",
|
||||
);
|
||||
|
||||
if prefix.is_dir() {
|
||||
result.prefixes.push(stripped);
|
||||
} else {
|
||||
result.keys.push(stripped);
|
||||
if let ListingMode::NoDelimiter = mode {
|
||||
result.keys = keys;
|
||||
} else {
|
||||
let mut prefixes = HashSet::new();
|
||||
for key in keys {
|
||||
// If the part after the prefix includes a "/", take only the first part and put it in `prefixes`.
|
||||
let relative_key = if let Some(prefix) = prefix {
|
||||
let mut prefix = prefix.clone();
|
||||
// We only strip the dirname of the prefix, so that when we strip it from the start of keys we
|
||||
// end up with full file/dir names.
|
||||
let prefix_full_local_path = prefix.with_base(&self.storage_root);
|
||||
let has_slash = prefix.0.to_string().ends_with('/');
|
||||
let strip_prefix = if prefix_full_local_path.is_dir() && has_slash {
|
||||
prefix
|
||||
} else {
|
||||
prefix.0.pop();
|
||||
prefix
|
||||
};
|
||||
|
||||
RemotePath::new(key.strip_prefix(&strip_prefix).unwrap()).unwrap()
|
||||
} else {
|
||||
key
|
||||
};
|
||||
|
||||
let relative_key = format!("{}", relative_key);
|
||||
if relative_key.contains(REMOTE_STORAGE_PREFIX_SEPARATOR) {
|
||||
let first_part = relative_key
|
||||
.split(REMOTE_STORAGE_PREFIX_SEPARATOR)
|
||||
.next()
|
||||
.unwrap()
|
||||
.to_owned();
|
||||
prefixes.insert(first_part);
|
||||
} else {
|
||||
result
|
||||
.keys
|
||||
.push(RemotePath::from_string(&relative_key).unwrap());
|
||||
}
|
||||
}
|
||||
result.prefixes = prefixes
|
||||
.into_iter()
|
||||
.map(|s| RemotePath::from_string(&s).unwrap())
|
||||
.collect();
|
||||
}
|
||||
|
||||
if let Some(max_keys) = max_keys {
|
||||
result.keys.truncate(max_keys.get() as usize);
|
||||
}
|
||||
Ok(result)
|
||||
};
|
||||
|
||||
@@ -560,50 +611,6 @@ fn storage_metadata_path(original_path: &Utf8Path) -> Utf8PathBuf {
|
||||
path_with_suffix_extension(original_path, "metadata")
|
||||
}
|
||||
|
||||
fn get_all_files<'a, P>(
|
||||
directory_path: P,
|
||||
recursive: bool,
|
||||
) -> Pin<Box<dyn Future<Output = anyhow::Result<Vec<Utf8PathBuf>>> + Send + Sync + 'a>>
|
||||
where
|
||||
P: AsRef<Utf8Path> + Send + Sync + 'a,
|
||||
{
|
||||
Box::pin(async move {
|
||||
let directory_path = directory_path.as_ref();
|
||||
if directory_path.exists() {
|
||||
if directory_path.is_dir() {
|
||||
let mut paths = Vec::new();
|
||||
let mut dir_contents = fs::read_dir(directory_path).await?;
|
||||
while let Some(dir_entry) = dir_contents.next_entry().await? {
|
||||
let file_type = dir_entry.file_type().await?;
|
||||
let entry_path =
|
||||
Utf8PathBuf::from_path_buf(dir_entry.path()).map_err(|pb| {
|
||||
anyhow::Error::msg(format!(
|
||||
"non-Unicode path: {}",
|
||||
pb.to_string_lossy()
|
||||
))
|
||||
})?;
|
||||
if file_type.is_symlink() {
|
||||
debug!("{entry_path:?} is a symlink, skipping")
|
||||
} else if file_type.is_dir() {
|
||||
if recursive {
|
||||
paths.extend(get_all_files(&entry_path, true).await?.into_iter())
|
||||
} else {
|
||||
paths.push(entry_path)
|
||||
}
|
||||
} else {
|
||||
paths.push(entry_path);
|
||||
}
|
||||
}
|
||||
Ok(paths)
|
||||
} else {
|
||||
bail!("Path {directory_path:?} is not a directory")
|
||||
}
|
||||
} else {
|
||||
Ok(Vec::new())
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
async fn create_target_directory(target_file_path: &Utf8Path) -> anyhow::Result<()> {
|
||||
let target_dir = match target_file_path.parent() {
|
||||
Some(parent_dir) => parent_dir,
|
||||
@@ -923,13 +930,18 @@ mod fs_tests {
|
||||
// No delimiter: should recursively list everything
|
||||
let (storage, cancel) = create_storage()?;
|
||||
let child = upload_dummy_file(&storage, "grandparent/parent/child", None, &cancel).await?;
|
||||
let child_sibling =
|
||||
upload_dummy_file(&storage, "grandparent/parent/child_sibling", None, &cancel).await?;
|
||||
let uncle = upload_dummy_file(&storage, "grandparent/uncle", None, &cancel).await?;
|
||||
|
||||
let listing = storage
|
||||
.list(None, ListingMode::NoDelimiter, None, &cancel)
|
||||
.await?;
|
||||
assert!(listing.prefixes.is_empty());
|
||||
assert_eq!(listing.keys, [uncle.clone(), child.clone()].to_vec());
|
||||
assert_eq!(
|
||||
listing.keys.into_iter().collect::<HashSet<_>>(),
|
||||
HashSet::from([uncle.clone(), child.clone(), child_sibling.clone()])
|
||||
);
|
||||
|
||||
// Delimiter: should only go one deep
|
||||
let listing = storage
|
||||
@@ -942,7 +954,25 @@ mod fs_tests {
|
||||
);
|
||||
assert!(listing.keys.is_empty());
|
||||
|
||||
// Delimiter & prefix
|
||||
// Delimiter & prefix with a trailing slash
|
||||
let listing = storage
|
||||
.list(
|
||||
Some(&RemotePath::from_string("timelines/some_timeline/grandparent/").unwrap()),
|
||||
ListingMode::WithDelimiter,
|
||||
None,
|
||||
&cancel,
|
||||
)
|
||||
.await?;
|
||||
assert_eq!(
|
||||
listing.keys,
|
||||
[RemotePath::from_string("uncle").unwrap()].to_vec()
|
||||
);
|
||||
assert_eq!(
|
||||
listing.prefixes,
|
||||
[RemotePath::from_string("parent").unwrap()].to_vec()
|
||||
);
|
||||
|
||||
// Delimiter and prefix without a trailing slash
|
||||
let listing = storage
|
||||
.list(
|
||||
Some(&RemotePath::from_string("timelines/some_timeline/grandparent").unwrap()),
|
||||
@@ -951,12 +981,66 @@ mod fs_tests {
|
||||
&cancel,
|
||||
)
|
||||
.await?;
|
||||
assert_eq!(listing.keys, [].to_vec());
|
||||
assert_eq!(
|
||||
listing.prefixes,
|
||||
[RemotePath::from_string("timelines/some_timeline/grandparent/parent").unwrap()]
|
||||
.to_vec()
|
||||
[RemotePath::from_string("grandparent").unwrap()].to_vec()
|
||||
);
|
||||
|
||||
// Delimiter and prefix that's partway through a path component
|
||||
let listing = storage
|
||||
.list(
|
||||
Some(&RemotePath::from_string("timelines/some_timeline/grandp").unwrap()),
|
||||
ListingMode::WithDelimiter,
|
||||
None,
|
||||
&cancel,
|
||||
)
|
||||
.await?;
|
||||
assert_eq!(listing.keys, [].to_vec());
|
||||
assert_eq!(
|
||||
listing.prefixes,
|
||||
[RemotePath::from_string("grandparent").unwrap()].to_vec()
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn list_part_component() -> anyhow::Result<()> {
|
||||
// No delimiter: should recursively list everything
|
||||
let (storage, cancel) = create_storage()?;
|
||||
|
||||
// Imitates what happens in a tenant path when we have an unsharded path and a sharded path, and do a listing
|
||||
// of the unsharded path: although there is a "directory" at the unsharded path, it should be handled as
|
||||
// a freeform prefix.
|
||||
let _child_a =
|
||||
upload_dummy_file(&storage, "grandparent/tenant-01/child", None, &cancel).await?;
|
||||
let _child_b =
|
||||
upload_dummy_file(&storage, "grandparent/tenant/child", None, &cancel).await?;
|
||||
|
||||
// Delimiter and prefix that's partway through a path component
|
||||
let listing = storage
|
||||
.list(
|
||||
Some(
|
||||
&RemotePath::from_string("timelines/some_timeline/grandparent/tenant").unwrap(),
|
||||
),
|
||||
ListingMode::WithDelimiter,
|
||||
None,
|
||||
&cancel,
|
||||
)
|
||||
.await?;
|
||||
assert_eq!(listing.keys, [].to_vec());
|
||||
|
||||
let mut found_prefixes = listing.prefixes.clone();
|
||||
found_prefixes.sort();
|
||||
assert_eq!(
|
||||
found_prefixes,
|
||||
[
|
||||
RemotePath::from_string("tenant").unwrap(),
|
||||
RemotePath::from_string("tenant-01").unwrap(),
|
||||
]
|
||||
.to_vec()
|
||||
);
|
||||
assert_eq!(listing.keys, [uncle.clone()].to_vec());
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -178,10 +178,7 @@ impl S3Bucket {
|
||||
|
||||
pub fn relative_path_to_s3_object(&self, path: &RemotePath) -> String {
|
||||
assert_eq!(std::path::MAIN_SEPARATOR, REMOTE_STORAGE_PREFIX_SEPARATOR);
|
||||
let path_string = path
|
||||
.get_path()
|
||||
.as_str()
|
||||
.trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR);
|
||||
let path_string = path.get_path().as_str();
|
||||
match &self.prefix_in_bucket {
|
||||
Some(prefix) => prefix.clone() + "/" + path_string,
|
||||
None => path_string.to_string(),
|
||||
@@ -471,16 +468,11 @@ impl RemoteStorage for S3Bucket {
|
||||
// get the passed prefix or if it is not set use prefix_in_bucket value
|
||||
let list_prefix = prefix
|
||||
.map(|p| self.relative_path_to_s3_object(p))
|
||||
.or_else(|| self.prefix_in_bucket.clone())
|
||||
.map(|mut p| {
|
||||
// required to end with a separator
|
||||
// otherwise request will return only the entry of a prefix
|
||||
if matches!(mode, ListingMode::WithDelimiter)
|
||||
&& !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR)
|
||||
{
|
||||
p.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
|
||||
}
|
||||
p
|
||||
.or_else(|| {
|
||||
self.prefix_in_bucket.clone().map(|mut s| {
|
||||
s.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
|
||||
s
|
||||
})
|
||||
});
|
||||
|
||||
let _permit = self.permit(kind, cancel).await?;
|
||||
@@ -549,11 +541,15 @@ impl RemoteStorage for S3Bucket {
|
||||
}
|
||||
}
|
||||
|
||||
result.prefixes.extend(
|
||||
prefixes
|
||||
.iter()
|
||||
.filter_map(|o| Some(self.s3_object_to_relative_path(o.prefix()?))),
|
||||
);
|
||||
// S3 gives us prefixes like "foo/", we return them like "foo"
|
||||
result.prefixes.extend(prefixes.iter().filter_map(|o| {
|
||||
Some(
|
||||
self.s3_object_to_relative_path(
|
||||
o.prefix()?
|
||||
.trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR),
|
||||
),
|
||||
)
|
||||
}));
|
||||
|
||||
continuation_token = match response.next_continuation_token {
|
||||
Some(new_token) => Some(new_token),
|
||||
@@ -1050,22 +1046,22 @@ mod tests {
|
||||
Some("/test/prefix/"),
|
||||
];
|
||||
let expected_outputs = [
|
||||
vec!["", "some/path", "some/path"],
|
||||
vec!["/", "/some/path", "/some/path"],
|
||||
vec!["", "some/path", "some/path/"],
|
||||
vec!["/", "/some/path", "/some/path/"],
|
||||
vec![
|
||||
"test/prefix/",
|
||||
"test/prefix/some/path",
|
||||
"test/prefix/some/path",
|
||||
"test/prefix/some/path/",
|
||||
],
|
||||
vec![
|
||||
"test/prefix/",
|
||||
"test/prefix/some/path",
|
||||
"test/prefix/some/path",
|
||||
"test/prefix/some/path/",
|
||||
],
|
||||
vec![
|
||||
"test/prefix/",
|
||||
"test/prefix/some/path",
|
||||
"test/prefix/some/path",
|
||||
"test/prefix/some/path/",
|
||||
],
|
||||
];
|
||||
|
||||
|
||||
@@ -107,27 +107,6 @@ impl UnreliableWrapper {
|
||||
type VoidStorage = crate::LocalFs;
|
||||
|
||||
impl RemoteStorage for UnreliableWrapper {
|
||||
async fn list_prefixes(
|
||||
&self,
|
||||
prefix: Option<&RemotePath>,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<Vec<RemotePath>, DownloadError> {
|
||||
self.attempt(RemoteOp::ListPrefixes(prefix.cloned()))
|
||||
.map_err(DownloadError::Other)?;
|
||||
self.inner.list_prefixes(prefix, cancel).await
|
||||
}
|
||||
|
||||
async fn list_files(
|
||||
&self,
|
||||
folder: Option<&RemotePath>,
|
||||
max_keys: Option<NonZeroU32>,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<Vec<RemotePath>, DownloadError> {
|
||||
self.attempt(RemoteOp::ListPrefixes(folder.cloned()))
|
||||
.map_err(DownloadError::Other)?;
|
||||
self.inner.list_files(folder, max_keys, cancel).await
|
||||
}
|
||||
|
||||
async fn list(
|
||||
&self,
|
||||
prefix: Option<&RemotePath>,
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
use anyhow::Context;
|
||||
use camino::Utf8Path;
|
||||
use remote_storage::ListingMode;
|
||||
use remote_storage::RemotePath;
|
||||
use std::sync::Arc;
|
||||
use std::{collections::HashSet, num::NonZeroU32};
|
||||
@@ -54,9 +55,9 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a
|
||||
let base_prefix = RemotePath::new(Utf8Path::new(ctx.enabled.base_prefix))
|
||||
.context("common_prefix construction")?;
|
||||
let root_remote_prefixes = test_client
|
||||
.list_prefixes(None, &cancel)
|
||||
.await
|
||||
.context("client list root prefixes failure")?
|
||||
.list(None, ListingMode::WithDelimiter, None, &cancel)
|
||||
.await?
|
||||
.prefixes
|
||||
.into_iter()
|
||||
.collect::<HashSet<_>>();
|
||||
assert_eq!(
|
||||
@@ -65,9 +66,14 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a
|
||||
);
|
||||
|
||||
let nested_remote_prefixes = test_client
|
||||
.list_prefixes(Some(&base_prefix), &cancel)
|
||||
.await
|
||||
.context("client list nested prefixes failure")?
|
||||
.list(
|
||||
Some(&base_prefix.add_trailing_slash()),
|
||||
ListingMode::WithDelimiter,
|
||||
None,
|
||||
&cancel,
|
||||
)
|
||||
.await?
|
||||
.prefixes
|
||||
.into_iter()
|
||||
.collect::<HashSet<_>>();
|
||||
let remote_only_prefixes = nested_remote_prefixes
|
||||
@@ -90,11 +96,13 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a
|
||||
///
|
||||
/// First, create a set of S3 objects with keys `random_prefix/folder{j}/blob_{i}.txt` in [`upload_remote_data`]
|
||||
/// Then performs the following queries:
|
||||
/// 1. `list_files(None)`. This should return all files `random_prefix/folder{j}/blob_{i}.txt`
|
||||
/// 2. `list_files("folder1")`. This should return all files `random_prefix/folder1/blob_{i}.txt`
|
||||
/// 1. `list(None)`. This should return all files `random_prefix/folder{j}/blob_{i}.txt`
|
||||
/// 2. `list("folder1")`. This should return all files `random_prefix/folder1/blob_{i}.txt`
|
||||
#[test_context(MaybeEnabledStorageWithSimpleTestBlobs)]
|
||||
#[tokio::test]
|
||||
async fn list_files_works(ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs) -> anyhow::Result<()> {
|
||||
async fn list_no_delimiter_works(
|
||||
ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs,
|
||||
) -> anyhow::Result<()> {
|
||||
let ctx = match ctx {
|
||||
MaybeEnabledStorageWithSimpleTestBlobs::Enabled(ctx) => ctx,
|
||||
MaybeEnabledStorageWithSimpleTestBlobs::Disabled => return Ok(()),
|
||||
@@ -107,29 +115,36 @@ async fn list_files_works(ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs) -> a
|
||||
let base_prefix =
|
||||
RemotePath::new(Utf8Path::new("folder1")).context("common_prefix construction")?;
|
||||
let root_files = test_client
|
||||
.list_files(None, None, &cancel)
|
||||
.list(None, ListingMode::NoDelimiter, None, &cancel)
|
||||
.await
|
||||
.context("client list root files failure")?
|
||||
.keys
|
||||
.into_iter()
|
||||
.collect::<HashSet<_>>();
|
||||
assert_eq!(
|
||||
root_files,
|
||||
ctx.remote_blobs.clone(),
|
||||
"remote storage list_files on root mismatches with the uploads."
|
||||
"remote storage list on root mismatches with the uploads."
|
||||
);
|
||||
|
||||
// Test that max_keys limit works. In total there are about 21 files (see
|
||||
// upload_simple_remote_data call in test_real_s3.rs).
|
||||
let limited_root_files = test_client
|
||||
.list_files(None, Some(NonZeroU32::new(2).unwrap()), &cancel)
|
||||
.list(
|
||||
None,
|
||||
ListingMode::NoDelimiter,
|
||||
Some(NonZeroU32::new(2).unwrap()),
|
||||
&cancel,
|
||||
)
|
||||
.await
|
||||
.context("client list root files failure")?;
|
||||
assert_eq!(limited_root_files.len(), 2);
|
||||
assert_eq!(limited_root_files.keys.len(), 2);
|
||||
|
||||
let nested_remote_files = test_client
|
||||
.list_files(Some(&base_prefix), None, &cancel)
|
||||
.list(Some(&base_prefix), ListingMode::NoDelimiter, None, &cancel)
|
||||
.await
|
||||
.context("client list nested files failure")?
|
||||
.keys
|
||||
.into_iter()
|
||||
.collect::<HashSet<_>>();
|
||||
let trim_remote_blobs: HashSet<_> = ctx
|
||||
@@ -141,7 +156,7 @@ async fn list_files_works(ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs) -> a
|
||||
.collect();
|
||||
assert_eq!(
|
||||
nested_remote_files, trim_remote_blobs,
|
||||
"remote storage list_files on subdirrectory mismatches with the uploads."
|
||||
"remote storage list on subdirrectory mismatches with the uploads."
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
@@ -199,7 +214,11 @@ async fn delete_objects_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<(
|
||||
|
||||
ctx.client.delete_objects(&[path1, path2], &cancel).await?;
|
||||
|
||||
let prefixes = ctx.client.list_prefixes(None, &cancel).await?;
|
||||
let prefixes = ctx
|
||||
.client
|
||||
.list(None, ListingMode::WithDelimiter, None, &cancel)
|
||||
.await?
|
||||
.prefixes;
|
||||
|
||||
assert_eq!(prefixes.len(), 1);
|
||||
|
||||
|
||||
@@ -132,10 +132,6 @@ impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs {
|
||||
}
|
||||
}
|
||||
|
||||
// NOTE: the setups for the list_prefixes test and the list_files test are very similar
|
||||
// However, they are not idential. The list_prefixes function is concerned with listing prefixes,
|
||||
// whereas the list_files function is concerned with listing files.
|
||||
// See `RemoteStorage::list_files` documentation for more details
|
||||
enum MaybeEnabledStorageWithSimpleTestBlobs {
|
||||
Enabled(AzureWithSimpleTestBlobs),
|
||||
Disabled,
|
||||
|
||||
@@ -12,8 +12,8 @@ use anyhow::Context;
|
||||
use camino::Utf8Path;
|
||||
use futures_util::StreamExt;
|
||||
use remote_storage::{
|
||||
DownloadError, GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind,
|
||||
S3Config,
|
||||
DownloadError, GenericRemoteStorage, ListingMode, RemotePath, RemoteStorageConfig,
|
||||
RemoteStorageKind, S3Config,
|
||||
};
|
||||
use test_context::test_context;
|
||||
use test_context::AsyncTestContext;
|
||||
@@ -75,11 +75,14 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
|
||||
client: &Arc<GenericRemoteStorage>,
|
||||
cancel: &CancellationToken,
|
||||
) -> anyhow::Result<HashSet<RemotePath>> {
|
||||
Ok(retry(|| client.list_files(None, None, cancel))
|
||||
.await
|
||||
.context("list root files failure")?
|
||||
.into_iter()
|
||||
.collect::<HashSet<_>>())
|
||||
Ok(
|
||||
retry(|| client.list(None, ListingMode::NoDelimiter, None, cancel))
|
||||
.await
|
||||
.context("list root files failure")?
|
||||
.keys
|
||||
.into_iter()
|
||||
.collect::<HashSet<_>>(),
|
||||
)
|
||||
}
|
||||
|
||||
let cancel = CancellationToken::new();
|
||||
@@ -294,10 +297,6 @@ impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs {
|
||||
}
|
||||
}
|
||||
|
||||
// NOTE: the setups for the list_prefixes test and the list_files test are very similar
|
||||
// However, they are not idential. The list_prefixes function is concerned with listing prefixes,
|
||||
// whereas the list_files function is concerned with listing files.
|
||||
// See `RemoteStorage::list_files` documentation for more details
|
||||
enum MaybeEnabledStorageWithSimpleTestBlobs {
|
||||
Enabled(S3WithSimpleTestBlobs),
|
||||
Disabled,
|
||||
|
||||
@@ -92,6 +92,8 @@ pub mod zstd;
|
||||
|
||||
pub mod env;
|
||||
|
||||
pub mod poison;
|
||||
|
||||
/// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
|
||||
///
|
||||
/// we have several cases:
|
||||
|
||||
121
libs/utils/src/poison.rs
Normal file
121
libs/utils/src/poison.rs
Normal file
@@ -0,0 +1,121 @@
|
||||
//! Protect a piece of state from reuse after it is left in an inconsistent state.
|
||||
//!
|
||||
//! # Example
|
||||
//!
|
||||
//! ```
|
||||
//! # tokio_test::block_on(async {
|
||||
//! use utils::poison::Poison;
|
||||
//! use std::time::Duration;
|
||||
//!
|
||||
//! struct State {
|
||||
//! clean: bool,
|
||||
//! }
|
||||
//! let state = tokio::sync::Mutex::new(Poison::new("mystate", State { clean: true }));
|
||||
//!
|
||||
//! let mut mutex_guard = state.lock().await;
|
||||
//! let mut poison_guard = mutex_guard.check_and_arm()?;
|
||||
//! let state = poison_guard.data_mut();
|
||||
//! state.clean = false;
|
||||
//! // If we get cancelled at this await point, subsequent check_and_arm() calls will fail.
|
||||
//! tokio::time::sleep(Duration::from_secs(10)).await;
|
||||
//! state.clean = true;
|
||||
//! poison_guard.disarm();
|
||||
//! # Ok::<(), utils::poison::Error>(())
|
||||
//! # });
|
||||
//! ```
|
||||
|
||||
use tracing::warn;
|
||||
|
||||
pub struct Poison<T> {
|
||||
what: &'static str,
|
||||
state: State,
|
||||
data: T,
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy)]
|
||||
enum State {
|
||||
Clean,
|
||||
Armed,
|
||||
Poisoned { at: chrono::DateTime<chrono::Utc> },
|
||||
}
|
||||
|
||||
impl<T> Poison<T> {
|
||||
/// We log `what` `warning!` level if the [`Guard`] gets dropped without being [`Guard::disarm`]ed.
|
||||
pub fn new(what: &'static str, data: T) -> Self {
|
||||
Self {
|
||||
what,
|
||||
state: State::Clean,
|
||||
data,
|
||||
}
|
||||
}
|
||||
|
||||
/// Check for poisoning and return a [`Guard`] that provides access to the wrapped state.
|
||||
pub fn check_and_arm(&mut self) -> Result<Guard<T>, Error> {
|
||||
match self.state {
|
||||
State::Clean => {
|
||||
self.state = State::Armed;
|
||||
Ok(Guard(self))
|
||||
}
|
||||
State::Armed => unreachable!("transient state"),
|
||||
State::Poisoned { at } => Err(Error::Poisoned {
|
||||
what: self.what,
|
||||
at,
|
||||
}),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Use [`Self::data`] and [`Self::data_mut`] to access the wrapped state.
|
||||
/// Once modifications are done, use [`Self::disarm`].
|
||||
/// If [`Guard`] gets dropped instead of calling [`Self::disarm`], the state is poisoned
|
||||
/// and subsequent calls to [`Poison::check_and_arm`] will fail with an error.
|
||||
pub struct Guard<'a, T>(&'a mut Poison<T>);
|
||||
|
||||
impl<'a, T> Guard<'a, T> {
|
||||
pub fn data(&self) -> &T {
|
||||
&self.0.data
|
||||
}
|
||||
pub fn data_mut(&mut self) -> &mut T {
|
||||
&mut self.0.data
|
||||
}
|
||||
|
||||
pub fn disarm(self) {
|
||||
match self.0.state {
|
||||
State::Clean => unreachable!("we set it to Armed in check_and_arm()"),
|
||||
State::Armed => {
|
||||
self.0.state = State::Clean;
|
||||
}
|
||||
State::Poisoned { at } => {
|
||||
unreachable!("we fail check_and_arm() if it's in that state: {at}")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, T> Drop for Guard<'a, T> {
|
||||
fn drop(&mut self) {
|
||||
match self.0.state {
|
||||
State::Clean => {
|
||||
// set by disarm()
|
||||
}
|
||||
State::Armed => {
|
||||
// still armed => poison it
|
||||
let at = chrono::Utc::now();
|
||||
self.0.state = State::Poisoned { at };
|
||||
warn!(at=?at, "poisoning {}", self.0.what);
|
||||
}
|
||||
State::Poisoned { at } => {
|
||||
unreachable!("we fail check_and_arm() if it's in that state: {at}")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
pub enum Error {
|
||||
#[error("poisoned at {at}: {what}")]
|
||||
Poisoned {
|
||||
what: &'static str,
|
||||
at: chrono::DateTime<chrono::Utc>,
|
||||
},
|
||||
}
|
||||
@@ -192,6 +192,14 @@ impl<T> OnceCell<T> {
|
||||
}
|
||||
}
|
||||
|
||||
/// Like [`Guard::take_and_deinit`], but will return `None` if this OnceCell was never
|
||||
/// initialized.
|
||||
pub fn take_and_deinit(&mut self) -> Option<(T, InitPermit)> {
|
||||
let inner = self.inner.get_mut().unwrap();
|
||||
|
||||
inner.take_and_deinit()
|
||||
}
|
||||
|
||||
/// Return the number of [`Self::get_or_init`] calls waiting for initialization to complete.
|
||||
pub fn initializer_count(&self) -> usize {
|
||||
self.initializers.load(Ordering::Relaxed)
|
||||
@@ -246,15 +254,23 @@ impl<'a, T> Guard<'a, T> {
|
||||
/// The permit will be on a semaphore part of the new internal value, and any following
|
||||
/// [`OnceCell::get_or_init`] will wait on it to complete.
|
||||
pub fn take_and_deinit(mut self) -> (T, InitPermit) {
    // Delegate to the inner implementation; an empty result here is a broken invariant.
    let taken = self.0.take_and_deinit();
    taken.expect("guard is not created unless value has been initialized")
}
|
||||
}
|
||||
|
||||
impl<T> Inner<T> {
|
||||
pub fn take_and_deinit(&mut self) -> Option<(T, InitPermit)> {
|
||||
let value = self.value.take()?;
|
||||
|
||||
let mut swapped = Inner::default();
|
||||
let sem = swapped.init_semaphore.clone();
|
||||
// acquire and forget right away, moving the control over to InitPermit
|
||||
sem.try_acquire().expect("we just created this").forget();
|
||||
std::mem::swap(&mut *self.0, &mut swapped);
|
||||
swapped
|
||||
.value
|
||||
.map(|v| (v, InitPermit(sem)))
|
||||
.expect("guard is not created unless value has been initialized")
|
||||
let permit = InitPermit(sem);
|
||||
std::mem::swap(self, &mut swapped);
|
||||
Some((value, permit))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -263,6 +279,13 @@ impl<'a, T> Guard<'a, T> {
|
||||
/// On drop, this type will return the permit.
|
||||
pub struct InitPermit(Arc<tokio::sync::Semaphore>);
|
||||
|
||||
impl std::fmt::Debug for InitPermit {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
let ptr = Arc::as_ptr(&self.0) as *const ();
|
||||
f.debug_tuple("InitPermit").field(&ptr).finish()
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for InitPermit {
|
||||
fn drop(&mut self) {
|
||||
assert_eq!(
|
||||
@@ -559,4 +582,22 @@ mod tests {
|
||||
|
||||
assert_eq!(*target.get().unwrap(), 11);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn take_and_deinit_on_mut() {
|
||||
use std::convert::Infallible;
|
||||
|
||||
let mut target = OnceCell::<u32>::default();
|
||||
assert!(target.take_and_deinit().is_none());
|
||||
|
||||
target
|
||||
.get_or_init(|permit| async move { Ok::<_, Infallible>((42, permit)) })
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let again = target.take_and_deinit();
|
||||
assert!(matches!(again, Some((42, _))), "{again:?}");
|
||||
|
||||
assert!(target.take_and_deinit().is_none());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -70,6 +70,7 @@ tokio-stream.workspace = true
|
||||
tokio-util.workspace = true
|
||||
toml_edit = { workspace = true, features = [ "serde" ] }
|
||||
tracing.workspace = true
|
||||
twox-hash.workspace = true
|
||||
url.workspace = true
|
||||
walkdir.workspace = true
|
||||
metrics.workspace = true
|
||||
|
||||
@@ -27,30 +27,50 @@
|
||||
//!
|
||||
//! # Reference Numbers
|
||||
//!
|
||||
//! 2024-04-04 on i3en.3xlarge
|
||||
//! 2024-04-15 on i3en.3xlarge
|
||||
//!
|
||||
//! ```text
|
||||
//! short/1 time: [25.925 µs 26.060 µs 26.209 µs]
|
||||
//! short/2 time: [31.277 µs 31.483 µs 31.722 µs]
|
||||
//! short/4 time: [45.496 µs 45.831 µs 46.182 µs]
|
||||
//! short/8 time: [84.298 µs 84.920 µs 85.566 µs]
|
||||
//! short/16 time: [185.04 µs 186.41 µs 187.88 µs]
|
||||
//! short/32 time: [385.01 µs 386.77 µs 388.70 µs]
|
||||
//! short/64 time: [770.24 µs 773.04 µs 776.04 µs]
|
||||
//! short/128 time: [1.5017 ms 1.5064 ms 1.5113 ms]
|
||||
//! medium/1 time: [106.65 µs 107.20 µs 107.85 µs]
|
||||
//! medium/2 time: [153.28 µs 154.24 µs 155.56 µs]
|
||||
//! medium/4 time: [325.67 µs 327.01 µs 328.71 µs]
|
||||
//! medium/8 time: [646.82 µs 650.17 µs 653.91 µs]
|
||||
//! medium/16 time: [1.2645 ms 1.2701 ms 1.2762 ms]
|
||||
//! medium/32 time: [2.4409 ms 2.4550 ms 2.4692 ms]
|
||||
//! medium/64 time: [4.6814 ms 4.7114 ms 4.7408 ms]
|
||||
//! medium/128 time: [8.7790 ms 8.9037 ms 9.0282 ms]
|
||||
//! async-short/1 time: [24.584 µs 24.737 µs 24.922 µs]
|
||||
//! async-short/2 time: [33.479 µs 33.660 µs 33.888 µs]
|
||||
//! async-short/4 time: [42.713 µs 43.046 µs 43.440 µs]
|
||||
//! async-short/8 time: [71.814 µs 72.478 µs 73.240 µs]
|
||||
//! async-short/16 time: [132.73 µs 134.45 µs 136.22 µs]
|
||||
//! async-short/32 time: [258.31 µs 260.73 µs 263.27 µs]
|
||||
//! async-short/64 time: [511.61 µs 514.44 µs 517.51 µs]
|
||||
//! async-short/128 time: [992.64 µs 998.23 µs 1.0042 ms]
|
||||
//! async-medium/1 time: [110.11 µs 110.50 µs 110.96 µs]
|
||||
//! async-medium/2 time: [153.06 µs 153.85 µs 154.99 µs]
|
||||
//! async-medium/4 time: [317.51 µs 319.92 µs 322.85 µs]
|
||||
//! async-medium/8 time: [638.30 µs 644.68 µs 652.12 µs]
|
||||
//! async-medium/16 time: [1.2651 ms 1.2773 ms 1.2914 ms]
|
||||
//! async-medium/32 time: [2.5117 ms 2.5410 ms 2.5720 ms]
|
||||
//! async-medium/64 time: [4.8088 ms 4.8555 ms 4.9047 ms]
|
||||
//! async-medium/128 time: [8.8311 ms 8.9849 ms 9.1263 ms]
|
||||
//! sync-short/1 time: [25.503 µs 25.626 µs 25.771 µs]
|
||||
//! sync-short/2 time: [30.850 µs 31.013 µs 31.208 µs]
|
||||
//! sync-short/4 time: [45.543 µs 45.856 µs 46.193 µs]
|
||||
//! sync-short/8 time: [84.114 µs 84.639 µs 85.220 µs]
|
||||
//! sync-short/16 time: [185.22 µs 186.15 µs 187.13 µs]
|
||||
//! sync-short/32 time: [377.43 µs 378.87 µs 380.46 µs]
|
||||
//! sync-short/64 time: [756.49 µs 759.04 µs 761.70 µs]
|
||||
//! sync-short/128 time: [1.4825 ms 1.4874 ms 1.4923 ms]
|
||||
//! sync-medium/1 time: [105.66 µs 106.01 µs 106.43 µs]
|
||||
//! sync-medium/2 time: [153.10 µs 153.84 µs 154.72 µs]
|
||||
//! sync-medium/4 time: [327.13 µs 329.44 µs 332.27 µs]
|
||||
//! sync-medium/8 time: [654.26 µs 658.73 µs 663.63 µs]
|
||||
//! sync-medium/16 time: [1.2682 ms 1.2748 ms 1.2816 ms]
|
||||
//! sync-medium/32 time: [2.4456 ms 2.4595 ms 2.4731 ms]
|
||||
//! sync-medium/64 time: [4.6523 ms 4.6890 ms 4.7256 ms]
|
||||
//! sync-medium/128 time: [8.7215 ms 8.8323 ms 8.9344 ms]
|
||||
//! ```
|
||||
|
||||
use bytes::{Buf, Bytes};
|
||||
use criterion::{BenchmarkId, Criterion};
|
||||
use pageserver::{config::PageServerConf, walrecord::NeonWalRecord, walredo::PostgresRedoManager};
|
||||
use pageserver::{
|
||||
config::PageServerConf,
|
||||
walrecord::NeonWalRecord,
|
||||
walredo::{PostgresRedoManager, ProcessKind},
|
||||
};
|
||||
use pageserver_api::{key::Key, shard::TenantShardId};
|
||||
use std::{
|
||||
sync::Arc,
|
||||
@@ -60,33 +80,39 @@ use tokio::{sync::Barrier, task::JoinSet};
|
||||
use utils::{id::TenantId, lsn::Lsn};
|
||||
|
||||
fn bench(c: &mut Criterion) {
|
||||
{
|
||||
let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
|
||||
for nclients in nclients {
|
||||
let mut group = c.benchmark_group("short");
|
||||
group.bench_with_input(
|
||||
BenchmarkId::from_parameter(nclients),
|
||||
&nclients,
|
||||
|b, nclients| {
|
||||
let redo_work = Arc::new(Request::short_input());
|
||||
b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients));
|
||||
},
|
||||
);
|
||||
for process_kind in &[ProcessKind::Async, ProcessKind::Sync] {
|
||||
{
|
||||
let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
|
||||
for nclients in nclients {
|
||||
let mut group = c.benchmark_group(format!("{process_kind}-short"));
|
||||
group.bench_with_input(
|
||||
BenchmarkId::from_parameter(nclients),
|
||||
&nclients,
|
||||
|b, nclients| {
|
||||
let redo_work = Arc::new(Request::short_input());
|
||||
b.iter_custom(|iters| {
|
||||
bench_impl(*process_kind, Arc::clone(&redo_work), iters, *nclients)
|
||||
});
|
||||
},
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
|
||||
for nclients in nclients {
|
||||
let mut group = c.benchmark_group("medium");
|
||||
group.bench_with_input(
|
||||
BenchmarkId::from_parameter(nclients),
|
||||
&nclients,
|
||||
|b, nclients| {
|
||||
let redo_work = Arc::new(Request::medium_input());
|
||||
b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients));
|
||||
},
|
||||
);
|
||||
{
|
||||
let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
|
||||
for nclients in nclients {
|
||||
let mut group = c.benchmark_group(format!("{process_kind}-medium"));
|
||||
group.bench_with_input(
|
||||
BenchmarkId::from_parameter(nclients),
|
||||
&nclients,
|
||||
|b, nclients| {
|
||||
let redo_work = Arc::new(Request::medium_input());
|
||||
b.iter_custom(|iters| {
|
||||
bench_impl(*process_kind, Arc::clone(&redo_work), iters, *nclients)
|
||||
});
|
||||
},
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -94,10 +120,16 @@ criterion::criterion_group!(benches, bench);
|
||||
criterion::criterion_main!(benches);
|
||||
|
||||
// Returns the sum of each client's wall-clock time spent executing their share of the n_redos.
|
||||
fn bench_impl(redo_work: Arc<Request>, n_redos: u64, nclients: u64) -> Duration {
|
||||
fn bench_impl(
|
||||
process_kind: ProcessKind,
|
||||
redo_work: Arc<Request>,
|
||||
n_redos: u64,
|
||||
nclients: u64,
|
||||
) -> Duration {
|
||||
let repo_dir = camino_tempfile::tempdir_in(env!("CARGO_TARGET_TMPDIR")).unwrap();
|
||||
|
||||
let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf());
|
||||
let mut conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf());
|
||||
conf.walredo_process_kind = process_kind;
|
||||
let conf = Box::leak(Box::new(conf));
|
||||
let tenant_shard_id = TenantShardId::unsharded(TenantId::generate());
|
||||
|
||||
@@ -113,25 +145,40 @@ fn bench_impl(redo_work: Arc<Request>, n_redos: u64, nclients: u64) -> Duration
|
||||
let manager = PostgresRedoManager::new(conf, tenant_shard_id);
|
||||
let manager = Arc::new(manager);
|
||||
|
||||
// divide the amount of work equally among the clients.
|
||||
let nredos_per_client = n_redos / nclients;
|
||||
for _ in 0..nclients {
|
||||
rt.block_on(async {
|
||||
tasks.spawn(client(
|
||||
Arc::clone(&manager),
|
||||
Arc::clone(&start),
|
||||
Arc::clone(&redo_work),
|
||||
// divide the amount of work equally among the clients
|
||||
n_redos / nclients,
|
||||
nredos_per_client,
|
||||
))
|
||||
});
|
||||
}
|
||||
|
||||
rt.block_on(async move {
|
||||
let mut total_wallclock_time = std::time::Duration::from_millis(0);
|
||||
let elapsed = rt.block_on(async move {
|
||||
let mut total_wallclock_time = Duration::ZERO;
|
||||
while let Some(res) = tasks.join_next().await {
|
||||
total_wallclock_time += res.unwrap();
|
||||
}
|
||||
total_wallclock_time
|
||||
})
|
||||
});
|
||||
|
||||
// consistency check to ensure process kind setting worked
|
||||
if nredos_per_client > 0 {
|
||||
assert_eq!(
|
||||
manager
|
||||
.status()
|
||||
.process
|
||||
.map(|p| p.kind)
|
||||
.expect("the benchmark work causes a walredo process to be spawned"),
|
||||
std::borrow::Cow::Borrowed(process_kind.into())
|
||||
);
|
||||
}
|
||||
|
||||
elapsed
|
||||
}
|
||||
|
||||
async fn client(
|
||||
|
||||
@@ -279,7 +279,7 @@ impl Client {
|
||||
lazy: bool,
|
||||
) -> Result<()> {
|
||||
let req_body = TenantLocationConfigRequest {
|
||||
tenant_id: Some(tenant_shard_id),
|
||||
tenant_id: None,
|
||||
config,
|
||||
};
|
||||
|
||||
|
||||
@@ -9,18 +9,45 @@
|
||||
//! Coordinates in both axes are compressed for better readability.
|
||||
//! (see <https://medium.com/algorithms-digest/coordinate-compression-2fff95326fb>)
|
||||
//!
|
||||
//! Example use:
|
||||
//! The plain text API was chosen so that we can easily work with filenames from various
|
||||
//! sources; see the Usage section below for examples.
|
||||
//!
|
||||
//! # Usage
|
||||
//!
|
||||
//! ## Producing the SVG
|
||||
//!
|
||||
//! ```bash
|
||||
//! $ ls test_output/test_pgbench\[neon-45-684\]/repo/tenants/$TENANT/timelines/$TIMELINE | \
|
||||
//! $ grep "__" | cargo run --release --bin pagectl draw-timeline-dir > out.svg
|
||||
//! $ firefox out.svg
|
||||
//!
|
||||
//! # local timeline dir
|
||||
//! ls test_output/test_pgbench\[neon-45-684\]/repo/tenants/$TENANT/timelines/$TIMELINE | \
|
||||
//! grep "__" | cargo run --release --bin pagectl draw-timeline-dir > out.svg
|
||||
//!
|
||||
//! # Layer map dump from `/v1/tenant/$TENANT/timeline/$TIMELINE/layer`
|
||||
//! (jq -r '.historic_layers[] | .layer_file_name' | cargo run -p pagectl draw-timeline) < layer-map.json > out.svg
|
||||
//!
|
||||
//! # From an `index_part.json` in S3
|
||||
//! (jq -r '.layer_metadata | keys[]' | cargo run -p pagectl draw-timeline ) < index_part.json-00000016 > out.svg
|
||||
//!
|
||||
//! ```
|
||||
//!
|
||||
//! This API was chosen so that we can easily work with filenames extracted from ssh,
|
||||
//! or from pageserver log files.
|
||||
//! ## Viewing
|
||||
//!
|
||||
//! TODO Consider shipping this as a grafana panel plugin:
|
||||
//! <https://grafana.com/tutorials/build-a-panel-plugin/>
|
||||
//! **Inkscape** is better than the built-in viewers in browsers.
|
||||
//!
|
||||
//! After selecting a layer file rectangle, use "Open XML Editor" (Ctrl|Cmd + Shift + X)
|
||||
//! to see the layer file name in the comment field.
|
||||
//!
|
||||
//! ```bash
|
||||
//!
|
||||
//! # Linux
|
||||
//! inkscape out.svg
|
||||
//!
|
||||
//! # macOS
|
||||
//! /Applications/Inkscape.app/Contents/MacOS/inkscape out.svg
|
||||
//!
|
||||
//! ```
|
||||
//!
|
||||
|
||||
use anyhow::Result;
|
||||
use pageserver::repository::Key;
|
||||
use pageserver::METADATA_FILE_NAME;
|
||||
@@ -65,7 +92,12 @@ fn parse_filename(name: &str) -> (Range<Key>, Range<Lsn>) {
|
||||
|
||||
pub fn main() -> Result<()> {
|
||||
// Parse layer filenames from stdin
|
||||
let mut ranges: Vec<(Range<Key>, Range<Lsn>)> = vec![];
|
||||
struct Layer {
|
||||
filename: String,
|
||||
key_range: Range<Key>,
|
||||
lsn_range: Range<Lsn>,
|
||||
}
|
||||
let mut files: Vec<Layer> = vec![];
|
||||
let stdin = io::stdin();
|
||||
for line in stdin.lock().lines() {
|
||||
let line = line.unwrap();
|
||||
@@ -76,14 +108,23 @@ pub fn main() -> Result<()> {
|
||||
// Don't try and parse "metadata" like a key-lsn range
|
||||
continue;
|
||||
}
|
||||
let range = parse_filename(filename);
|
||||
ranges.push(range);
|
||||
let (key_range, lsn_range) = parse_filename(filename);
|
||||
files.push(Layer {
|
||||
filename: filename.to_owned(),
|
||||
key_range,
|
||||
lsn_range,
|
||||
});
|
||||
}
|
||||
|
||||
// Collect all coordinates
|
||||
let mut keys: Vec<Key> = vec![];
|
||||
let mut lsns: Vec<Lsn> = vec![];
|
||||
for (keyr, lsnr) in &ranges {
|
||||
for Layer {
|
||||
key_range: keyr,
|
||||
lsn_range: lsnr,
|
||||
..
|
||||
} in &files
|
||||
{
|
||||
keys.push(keyr.start);
|
||||
keys.push(keyr.end);
|
||||
lsns.push(lsnr.start);
|
||||
@@ -107,7 +148,12 @@ pub fn main() -> Result<()> {
|
||||
h: stretch * lsn_map.len() as f32
|
||||
}
|
||||
);
|
||||
for (keyr, lsnr) in &ranges {
|
||||
for Layer {
|
||||
filename,
|
||||
key_range: keyr,
|
||||
lsn_range: lsnr,
|
||||
} in &files
|
||||
{
|
||||
let key_start = *key_map.get(&keyr.start).unwrap();
|
||||
let key_end = *key_map.get(&keyr.end).unwrap();
|
||||
let key_diff = key_end - key_start;
|
||||
@@ -151,6 +197,7 @@ pub fn main() -> Result<()> {
|
||||
.fill(fill)
|
||||
.stroke(Stroke::Color(rgb(0, 0, 0), 0.1))
|
||||
.border_radius(0.4)
|
||||
.comment(filename)
|
||||
);
|
||||
}
|
||||
println!("{}", EndSvg);
|
||||
|
||||
112
pageserver/src/aux_file.rs
Normal file
112
pageserver/src/aux_file.rs
Normal file
@@ -0,0 +1,112 @@
|
||||
use pageserver_api::key::{Key, AUX_KEY_PREFIX, METADATA_KEY_SIZE};
|
||||
use tracing::warn;
|
||||
|
||||
/// Create a metadata key from a hash, encoded as [AUX_KEY_PREFIX, 2B directory prefix, first 13B of 128b xxhash].
|
||||
fn aux_hash_to_metadata_key(dir_level1: u8, dir_level2: u8, data: &[u8]) -> Key {
|
||||
let mut key = [0; METADATA_KEY_SIZE];
|
||||
let hash = twox_hash::xxh3::hash128(data).to_be_bytes();
|
||||
key[0] = AUX_KEY_PREFIX;
|
||||
key[1] = dir_level1;
|
||||
key[2] = dir_level2;
|
||||
key[3..16].copy_from_slice(&hash[0..13]);
|
||||
Key::from_metadata_key_fixed_size(&key)
|
||||
}
|
||||
|
||||
/// First directory-prefix byte for aux files under `pg_logical/`.
const AUX_DIR_PG_LOGICAL: u8 = 0x01;
/// First directory-prefix byte for aux files under `pg_replslot/`.
const AUX_DIR_PG_REPLSLOT: u8 = 0x02;
/// First directory-prefix byte for aux files that match no supported directory.
const AUX_DIR_PG_UNKNOWN: u8 = 0xFF;
|
||||
|
||||
/// Encode the aux file into a fixed-size key.
|
||||
///
|
||||
/// The first byte is the AUX key prefix. We use the next 2 bytes of the key for the directory / aux file type.
|
||||
/// We have one-to-one mapping for each of the aux file that we support. We hash the remaining part of the path
|
||||
/// (usually a single file name, or several components) into 13-byte hash. The way we determine the 2-byte prefix
|
||||
/// is roughly based on the first two components of the path, one unique number for one component.
|
||||
///
|
||||
/// * pg_logical/mappings -> 0x0101
|
||||
/// * pg_logical/snapshots -> 0x0102
|
||||
/// * pg_logical/replorigin_checkpoint -> 0x0103
|
||||
/// * pg_logical/others -> 0x01FF
|
||||
/// * pg_replslot/ -> 0x0201
|
||||
/// * others -> 0xFFFF
|
||||
///
|
||||
/// If you add new AUX files to this function, please also add a test case to `test_encoding_portable`.
|
||||
/// The new file type must have never been written to the storage before. Otherwise, there could be data
|
||||
/// corruptions as the new file belongs to a new prefix but it might have been stored under the `others` prefix.
|
||||
pub fn encode_aux_file_key(path: &str) -> Key {
|
||||
if let Some(fname) = path.strip_prefix("pg_logical/mappings/") {
|
||||
aux_hash_to_metadata_key(AUX_DIR_PG_LOGICAL, 0x01, fname.as_bytes())
|
||||
} else if let Some(fname) = path.strip_prefix("pg_logical/snapshots/") {
|
||||
aux_hash_to_metadata_key(AUX_DIR_PG_LOGICAL, 0x02, fname.as_bytes())
|
||||
} else if path == "pg_logical/replorigin_checkpoint" {
|
||||
aux_hash_to_metadata_key(AUX_DIR_PG_LOGICAL, 0x03, b"")
|
||||
} else if let Some(fname) = path.strip_prefix("pg_logical/") {
|
||||
if cfg!(debug_assertions) {
|
||||
warn!(
|
||||
"unsupported pg_logical aux file type: {}, putting to 0x01FF, would affect path scanning",
|
||||
path
|
||||
);
|
||||
}
|
||||
aux_hash_to_metadata_key(AUX_DIR_PG_LOGICAL, 0xFF, fname.as_bytes())
|
||||
} else if let Some(fname) = path.strip_prefix("pg_replslot/") {
|
||||
aux_hash_to_metadata_key(AUX_DIR_PG_REPLSLOT, 0x01, fname.as_bytes())
|
||||
} else {
|
||||
if cfg!(debug_assertions) {
|
||||
warn!(
|
||||
"unsupported aux file type: {}, putting to 0xFFFF, would affect path scanning",
|
||||
path
|
||||
);
|
||||
}
|
||||
aux_hash_to_metadata_key(AUX_DIR_PG_UNKNOWN, 0xFF, path.as_bytes())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_hash_portable() {
        // AUX file encoding requires the hash to be portable across all platforms. This test case checks
        // if the algorithm produces the same hash across different environments.
        let hash = |input: &str| twox_hash::xxh3::hash128(input.as_bytes());

        assert_eq!(305317690835051308206966631765527126151, hash("test1"));
        assert_eq!(85104974691013376326742244813280798847, hash("test/test2"));
        assert_eq!(0, hash(""));
    }

    #[test]
    fn test_encoding_portable() {
        // To correctly retrieve AUX files, the generated keys for the same file must be the same for all
        // versions of the page server.
        let cases = [
            (
                "8200000101E5B20C5F8DD5AA3289D6D9EAFA",
                "pg_logical/mappings/test1",
            ),
            (
                "820000010239AAC544893139B26F501B97E6",
                "pg_logical/snapshots/test2",
            ),
            (
                "820000010300000000000000000000000000",
                "pg_logical/replorigin_checkpoint",
            ),
            (
                "82000001FF8635AF2134B7266EC5B4189FD6",
                "pg_logical/unsupported",
            ),
            (
                "8200000201772D0E5D71DE14DA86142A1619",
                "pg_replslot/test3",
            ),
            (
                "820000FFFF1866EBEB53B807B26A2416F317",
                "other_file_not_supported",
            ),
        ];

        for (expected, path) in cases {
            assert_eq!(expected, encode_aux_file_key(path).to_string());
        }
    }
}
|
||||
@@ -13,7 +13,7 @@
|
||||
use anyhow::{anyhow, bail, ensure, Context};
|
||||
use bytes::{BufMut, Bytes, BytesMut};
|
||||
use fail::fail_point;
|
||||
use pageserver_api::key::{key_to_slru_block, Key};
|
||||
use pageserver_api::key::{key_to_slru_block, rel_block_to_key, Key};
|
||||
use postgres_ffi::pg_constants;
|
||||
use std::fmt::Write as FmtWrite;
|
||||
use std::time::SystemTime;
|
||||
@@ -297,7 +297,20 @@ where
|
||||
if rel.forknum == INIT_FORKNUM {
|
||||
// I doubt we need _init fork itself, but having it at least
|
||||
// serves as a marker that the relation is unlogged.
|
||||
self.add_rel(rel, rel).await?;
|
||||
if let Err(_e) = self.add_rel(rel, rel).await {
|
||||
if self
|
||||
.timeline
|
||||
.get_shard_identity()
|
||||
.is_key_buggy_forknum(&rel_block_to_key(rel, 0x0))
|
||||
{
|
||||
// Workaround https://github.com/neondatabase/neon/issues/7451 -- if we have an unlogged relation
|
||||
// whose INIT_FORKNUM is not correctly on shard zero, then omit it in the basebackup. This allows
|
||||
// postgres to start up. The relation won't work, but it will be possible to DROP TABLE on it and
|
||||
// recreate.
|
||||
tracing::warn!("Omitting relation {rel} for issue #7451: drop and recreate this unlogged relation");
|
||||
continue;
|
||||
}
|
||||
};
|
||||
self.add_rel(rel, rel.with_forknum(MAIN_FORKNUM)).await?;
|
||||
continue;
|
||||
}
|
||||
|
||||
@@ -285,6 +285,7 @@ fn start_pageserver(
|
||||
))
|
||||
.unwrap();
|
||||
pageserver::preinitialize_metrics();
|
||||
pageserver::metrics::wal_redo::set_process_kind_metric(conf.walredo_process_kind);
|
||||
|
||||
// If any failpoints were set from FAILPOINTS environment variable,
|
||||
// print them to the log for debugging purposes
|
||||
|
||||
@@ -97,6 +97,8 @@ pub mod defaults {
|
||||
|
||||
pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0;
|
||||
|
||||
pub const DEFAULT_WALREDO_PROCESS_KIND: &str = "sync";
|
||||
|
||||
///
|
||||
/// Default built-in configuration file.
|
||||
///
|
||||
@@ -140,6 +142,8 @@ pub mod defaults {
|
||||
|
||||
#validate_vectored_get = '{DEFAULT_VALIDATE_VECTORED_GET}'
|
||||
|
||||
#walredo_process_kind = '{DEFAULT_WALREDO_PROCESS_KIND}'
|
||||
|
||||
[tenant_config]
|
||||
#checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
|
||||
#checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
|
||||
@@ -290,6 +294,8 @@ pub struct PageServerConf {
|
||||
///
|
||||
/// Setting this to zero disables limits on total ephemeral layer size.
|
||||
pub ephemeral_bytes_per_memory_kb: usize,
|
||||
|
||||
pub walredo_process_kind: crate::walredo::ProcessKind,
|
||||
}
|
||||
|
||||
/// We do not want to store this in a PageServerConf because the latter may be logged
|
||||
@@ -413,6 +419,8 @@ struct PageServerConfigBuilder {
|
||||
validate_vectored_get: BuilderValue<bool>,
|
||||
|
||||
ephemeral_bytes_per_memory_kb: BuilderValue<usize>,
|
||||
|
||||
walredo_process_kind: BuilderValue<crate::walredo::ProcessKind>,
|
||||
}
|
||||
|
||||
impl PageServerConfigBuilder {
|
||||
@@ -500,6 +508,8 @@ impl PageServerConfigBuilder {
|
||||
)),
|
||||
validate_vectored_get: Set(DEFAULT_VALIDATE_VECTORED_GET),
|
||||
ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),
|
||||
|
||||
walredo_process_kind: Set(DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap()),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -683,6 +693,10 @@ impl PageServerConfigBuilder {
|
||||
self.ephemeral_bytes_per_memory_kb = BuilderValue::Set(value);
|
||||
}
|
||||
|
||||
pub fn get_walredo_process_kind(&mut self, value: crate::walredo::ProcessKind) {
|
||||
self.walredo_process_kind = BuilderValue::Set(value);
|
||||
}
|
||||
|
||||
pub fn build(self) -> anyhow::Result<PageServerConf> {
|
||||
let default = Self::default_values();
|
||||
|
||||
@@ -739,6 +753,7 @@ impl PageServerConfigBuilder {
|
||||
max_vectored_read_bytes,
|
||||
validate_vectored_get,
|
||||
ephemeral_bytes_per_memory_kb,
|
||||
walredo_process_kind,
|
||||
}
|
||||
CUSTOM LOGIC
|
||||
{
|
||||
@@ -1032,6 +1047,9 @@ impl PageServerConf {
|
||||
"ephemeral_bytes_per_memory_kb" => {
|
||||
builder.get_ephemeral_bytes_per_memory_kb(parse_toml_u64("ephemeral_bytes_per_memory_kb", item)? as usize)
|
||||
}
|
||||
"walredo_process_kind" => {
|
||||
builder.get_walredo_process_kind(parse_toml_from_str("walredo_process_kind", item)?)
|
||||
}
|
||||
_ => bail!("unrecognized pageserver option '{key}'"),
|
||||
}
|
||||
}
|
||||
@@ -1114,6 +1132,7 @@ impl PageServerConf {
|
||||
),
|
||||
validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
|
||||
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
|
||||
walredo_process_kind: defaults::DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap(),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1351,7 +1370,8 @@ background_task_maximum_delay = '334 s'
|
||||
.expect("Invalid default constant")
|
||||
),
|
||||
validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
|
||||
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB
|
||||
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
|
||||
walredo_process_kind: defaults::DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap(),
|
||||
},
|
||||
"Correct defaults should be used when no config values are provided"
|
||||
);
|
||||
@@ -1423,7 +1443,8 @@ background_task_maximum_delay = '334 s'
|
||||
.expect("Invalid default constant")
|
||||
),
|
||||
validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
|
||||
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB
|
||||
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
|
||||
walredo_process_kind: defaults::DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap(),
|
||||
},
|
||||
"Should be able to parse all basic config values correctly"
|
||||
);
|
||||
|
||||
@@ -304,7 +304,7 @@ async fn calculate_synthetic_size_worker(
|
||||
continue;
|
||||
}
|
||||
|
||||
if !tenant_shard_id.is_zero() {
|
||||
if !tenant_shard_id.is_shard_zero() {
|
||||
// We only send consumption metrics from shard 0, so don't waste time calculating
|
||||
// synthetic size on other shards.
|
||||
continue;
|
||||
|
||||
@@ -199,7 +199,7 @@ pub(super) async fn collect_all_metrics(
|
||||
};
|
||||
|
||||
let tenants = futures::stream::iter(tenants).filter_map(|(id, state, _)| async move {
|
||||
if state != TenantState::Active || !id.is_zero() {
|
||||
if state != TenantState::Active || !id.is_shard_zero() {
|
||||
None
|
||||
} else {
|
||||
tenant_manager
|
||||
|
||||
@@ -58,24 +58,6 @@ paths:
|
||||
responses:
|
||||
"200":
|
||||
description: The reload completed successfully.
|
||||
"401":
|
||||
description: Unauthorized Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/UnauthorizedError"
|
||||
"403":
|
||||
description: Forbidden Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
"500":
|
||||
description: Generic operation error (also hits if no keys were found)
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
|
||||
/v1/tenant/{tenant_id}:
|
||||
parameters:
|
||||
@@ -93,62 +75,14 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/TenantInfo"
|
||||
"400":
|
||||
description: Error when no tenant id found in path or no timeline id
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"401":
|
||||
description: Unauthorized Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/UnauthorizedError"
|
||||
"403":
|
||||
description: Forbidden Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"503":
|
||||
description: Temporarily unavailable, please retry.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
delete:
|
||||
description: |
|
||||
Attempts to delete specified tenant. 500, 503 and 409 errors should be retried until 404 is retrieved.
|
||||
404 means that deletion successfully finished.
|
||||
responses:
|
||||
"400":
|
||||
description: Error when no tenant id found in path
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"401":
|
||||
description: Unauthorized Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/UnauthorizedError"
|
||||
"403":
|
||||
description: Forbidden Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
"404":
|
||||
description: Tenant not found
|
||||
description: Tenant not found. This is the success path.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
@@ -165,18 +99,6 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/PreconditionFailedError"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"503":
|
||||
description: Temporarily unavailable, please retry.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
/v1/tenant/{tenant_id}/time_travel_remote_storage:
|
||||
parameters:
|
||||
@@ -206,36 +128,6 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
type: string
|
||||
"400":
|
||||
description: Error when no tenant id found in path or invalid timestamp
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"401":
|
||||
description: Unauthorized Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/UnauthorizedError"
|
||||
"403":
|
||||
description: Forbidden Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"503":
|
||||
description: Temporarily unavailable, please retry.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
/v1/tenant/{tenant_id}/timeline:
|
||||
parameters:
|
||||
@@ -255,36 +147,6 @@ paths:
|
||||
type: array
|
||||
items:
|
||||
$ref: "#/components/schemas/TimelineInfo"
|
||||
"400":
|
||||
description: Error when no tenant id found in path
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"401":
|
||||
description: Unauthorized Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/UnauthorizedError"
|
||||
"403":
|
||||
description: Forbidden Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"503":
|
||||
description: Temporarily unavailable, please retry.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
|
||||
/v1/tenant/{tenant_id}/timeline/{timeline_id}:
|
||||
@@ -309,60 +171,12 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/TimelineInfo"
|
||||
"400":
|
||||
description: Error when no tenant id found in path or no timeline id
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"401":
|
||||
description: Unauthorized Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/UnauthorizedError"
|
||||
"403":
|
||||
description: Forbidden Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"503":
|
||||
description: Temporarily unavailable, please retry.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
delete:
|
||||
description: "Attempts to delete specified timeline. 500 and 409 errors should be retried"
|
||||
responses:
|
||||
"400":
|
||||
description: Error when no tenant id found in path or no timeline id
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"401":
|
||||
description: Unauthorized Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/UnauthorizedError"
|
||||
"403":
|
||||
description: Forbidden Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
"404":
|
||||
description: Timeline not found
|
||||
description: Timeline not found. This is the success path.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
@@ -379,18 +193,6 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/PreconditionFailedError"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"503":
|
||||
description: Temporarily unavailable, please retry.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
/v1/tenant/{tenant_id}/timeline/{timeline_id}/get_timestamp_of_lsn:
|
||||
parameters:
|
||||
@@ -423,36 +225,6 @@ paths:
|
||||
schema:
|
||||
type: string
|
||||
format: date-time
|
||||
"400":
|
||||
description: Error when no tenant id found in path, no timeline id or invalid timestamp
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"401":
|
||||
description: Unauthorized Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/UnauthorizedError"
|
||||
"403":
|
||||
description: Forbidden Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
"404":
|
||||
description: Timeline not found, or there is no timestamp information for the given lsn
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/NotFoundError"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
|
||||
/v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp:
|
||||
parameters:
|
||||
@@ -484,36 +256,6 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/LsnByTimestampResponse"
|
||||
"400":
|
||||
description: Error when no tenant id found in path, no timeline id or invalid timestamp
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"401":
|
||||
description: Unauthorized Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/UnauthorizedError"
|
||||
"403":
|
||||
description: Forbidden Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"503":
|
||||
description: Temporarily unavailable, please retry.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
/v1/tenant/{tenant_id}/timeline/{timeline_id}/do_gc:
|
||||
parameters:
|
||||
@@ -537,36 +279,6 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
type: string
|
||||
"400":
|
||||
description: Error when no tenant id found in path, no timeline id or invalid timestamp
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"401":
|
||||
description: Unauthorized Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/UnauthorizedError"
|
||||
"403":
|
||||
description: Forbidden Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"503":
|
||||
description: Temporarily unavailable, please retry.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
/v1/tenant/{tenant_shard_id}/location_config:
|
||||
parameters:
|
||||
- name: tenant_shard_id
|
||||
@@ -628,24 +340,6 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/TenantLocationConfigResponse"
|
||||
"503":
|
||||
description: Tenant's state cannot be changed right now. Wait a few seconds and retry.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"401":
|
||||
description: Unauthorized Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/UnauthorizedError"
|
||||
"403":
|
||||
description: Forbidden Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
"409":
|
||||
description: |
|
||||
The tenant is already known to Pageserver in some way,
|
||||
@@ -662,12 +356,6 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ConflictError"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
/v1/tenant/{tenant_id}/ignore:
|
||||
parameters:
|
||||
- name: tenant_id
|
||||
@@ -684,36 +372,6 @@ paths:
|
||||
responses:
|
||||
"200":
|
||||
description: Tenant ignored
|
||||
"400":
|
||||
description: Error when no tenant id found in path parameters
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"401":
|
||||
description: Unauthorized Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/UnauthorizedError"
|
||||
"403":
|
||||
description: Forbidden Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"503":
|
||||
description: Temporarily unavailable, please retry.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
|
||||
/v1/tenant/{tenant_id}/load:
|
||||
@@ -740,36 +398,6 @@ paths:
|
||||
responses:
|
||||
"202":
|
||||
description: Tenant scheduled to load successfully
|
||||
"400":
|
||||
description: Error when no tenant id found in path parameters
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"401":
|
||||
description: Unauthorized Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/UnauthorizedError"
|
||||
"403":
|
||||
description: Forbidden Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"503":
|
||||
description: Temporarily unavailable, please retry.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
/v1/tenant/{tenant_id}/{timeline_id}/preserve_initdb_archive:
|
||||
parameters:
|
||||
@@ -790,37 +418,6 @@ paths:
|
||||
responses:
|
||||
"202":
|
||||
description: Tenant scheduled to load successfully
|
||||
"404":
|
||||
description: No tenant or timeline found for the specified ids
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"401":
|
||||
description: Unauthorized Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/UnauthorizedError"
|
||||
"403":
|
||||
description: Forbidden Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"503":
|
||||
description: Temporarily unavailable, please retry.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
|
||||
/v1/tenant/{tenant_id}/synthetic_size:
|
||||
parameters:
|
||||
@@ -839,31 +436,8 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/SyntheticSizeResponse"
|
||||
"401":
|
||||
description: Unauthorized Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/UnauthorizedError"
|
||||
"403":
|
||||
description: Forbidden Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"503":
|
||||
description: Temporarily unavailable, please retry.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
# This route has no handler. TODO: remove?
|
||||
/v1/tenant/{tenant_id}/size:
|
||||
parameters:
|
||||
- name: tenant_id
|
||||
@@ -945,18 +519,6 @@ paths:
|
||||
responses:
|
||||
"200":
|
||||
description: Success
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"503":
|
||||
description: Temporarily unavailable, please retry.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
/v1/tenant/{tenant_shard_id}/secondary/download:
|
||||
parameters:
|
||||
@@ -987,20 +549,6 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/SecondaryProgress"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"503":
|
||||
description: Temporarily unavailable, please retry.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
|
||||
|
||||
/v1/tenant/{tenant_id}/timeline/:
|
||||
parameters:
|
||||
@@ -1043,24 +591,6 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/TimelineInfo"
|
||||
"400":
|
||||
description: Malformed timeline create request
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"401":
|
||||
description: Unauthorized Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/UnauthorizedError"
|
||||
"403":
|
||||
description: Forbidden Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
"406":
|
||||
description: Permanently unsatisfiable request, don't retry.
|
||||
content:
|
||||
@@ -1079,18 +609,6 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"503":
|
||||
description: Temporarily unavailable, please retry.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
/v1/tenant/:
|
||||
get:
|
||||
@@ -1104,30 +622,6 @@ paths:
|
||||
type: array
|
||||
items:
|
||||
$ref: "#/components/schemas/TenantInfo"
|
||||
"401":
|
||||
description: Unauthorized Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/UnauthorizedError"
|
||||
"403":
|
||||
description: Forbidden Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"503":
|
||||
description: Temporarily unavailable, please retry.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
post:
|
||||
description: |
|
||||
@@ -1148,43 +642,12 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
type: string
|
||||
"400":
|
||||
description: Malformed tenant create request
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"401":
|
||||
description: Unauthorized Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/UnauthorizedError"
|
||||
"403":
|
||||
description: Forbidden Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
"409":
|
||||
description: Tenant already exists, creation skipped
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ConflictError"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"503":
|
||||
description: Temporarily unavailable, please retry.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
|
||||
/v1/tenant/config:
|
||||
put:
|
||||
@@ -1206,36 +669,6 @@ paths:
|
||||
type: array
|
||||
items:
|
||||
$ref: "#/components/schemas/TenantInfo"
|
||||
"400":
|
||||
description: Malformed tenant config request
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"401":
|
||||
description: Unauthorized Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/UnauthorizedError"
|
||||
"403":
|
||||
description: Forbidden Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"503":
|
||||
description: Temporarily unavailable, please retry.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
/v1/tenant/{tenant_id}/config/:
|
||||
parameters:
|
||||
@@ -1255,42 +688,6 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/TenantConfigResponse"
|
||||
"400":
|
||||
description: Malformed get tenant config request
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"401":
|
||||
description: Unauthorized Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/UnauthorizedError"
|
||||
"403":
|
||||
description: Forbidden Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
"404":
|
||||
description: Tenant or timeline was not found
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/NotFoundError"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"503":
|
||||
description: Temporarily unavailable, please retry.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
/v1/utilization:
|
||||
get:
|
||||
@@ -1304,12 +701,6 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/PageserverUtilization"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
|
||||
components:
|
||||
securitySchemes:
|
||||
|
||||
@@ -160,6 +160,9 @@ impl From<PageReconstructError> for ApiError {
|
||||
fn from(pre: PageReconstructError) -> ApiError {
|
||||
match pre {
|
||||
PageReconstructError::Other(pre) => ApiError::InternalServerError(pre),
|
||||
PageReconstructError::MissingKey(e) => {
|
||||
ApiError::InternalServerError(anyhow::anyhow!("{e}"))
|
||||
}
|
||||
PageReconstructError::Cancelled => {
|
||||
ApiError::InternalServerError(anyhow::anyhow!("request was cancelled"))
|
||||
}
|
||||
@@ -457,8 +460,12 @@ async fn reload_auth_validation_keys_handler(
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
Err(e) => {
|
||||
let err_msg = "Error reloading public keys";
|
||||
warn!("Error reloading public keys from {key_path:?}: {e:}");
|
||||
json_response(StatusCode::INTERNAL_SERVER_ERROR, ())
|
||||
json_response(
|
||||
StatusCode::INTERNAL_SERVER_ERROR,
|
||||
HttpErrorBody::from_msg(err_msg.to_string()),
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -696,7 +703,7 @@ async fn get_lsn_by_timestamp_handler(
|
||||
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||
let state = get_state(&request);
|
||||
|
||||
if !tenant_shard_id.is_zero() {
|
||||
if !tenant_shard_id.is_shard_zero() {
|
||||
// Requires SLRU contents, which are only stored on shard zero
|
||||
return Err(ApiError::BadRequest(anyhow!(
|
||||
"Size calculations are only available on shard zero"
|
||||
@@ -747,7 +754,7 @@ async fn get_timestamp_of_lsn_handler(
|
||||
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||
let state = get_state(&request);
|
||||
|
||||
if !tenant_shard_id.is_zero() {
|
||||
if !tenant_shard_id.is_shard_zero() {
|
||||
// Requires SLRU contents, which are only stored on shard zero
|
||||
return Err(ApiError::BadRequest(anyhow!(
|
||||
"Size calculations are only available on shard zero"
|
||||
@@ -772,7 +779,9 @@ async fn get_timestamp_of_lsn_handler(
|
||||
let time = format_rfc3339(postgres_ffi::from_pg_timestamp(time)).to_string();
|
||||
json_response(StatusCode::OK, time)
|
||||
}
|
||||
None => json_response(StatusCode::NOT_FOUND, ()),
|
||||
None => Err(ApiError::NotFound(
|
||||
anyhow::anyhow!("Timestamp for lsn {} not found", lsn).into(),
|
||||
)),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1086,7 +1095,7 @@ async fn tenant_size_handler(
|
||||
let headers = request.headers();
|
||||
let state = get_state(&request);
|
||||
|
||||
if !tenant_shard_id.is_zero() {
|
||||
if !tenant_shard_id.is_shard_zero() {
|
||||
return Err(ApiError::BadRequest(anyhow!(
|
||||
"Size calculations are only available on shard zero"
|
||||
)));
|
||||
|
||||
@@ -12,6 +12,7 @@ pub mod disk_usage_eviction_task;
|
||||
pub mod http;
|
||||
pub mod import_datadir;
|
||||
pub use pageserver_api::keyspace;
|
||||
pub mod aux_file;
|
||||
pub mod metrics;
|
||||
pub mod page_cache;
|
||||
pub mod page_service;
|
||||
|
||||
@@ -86,11 +86,20 @@ pub(crate) static STORAGE_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static READ_NUM_FS_LAYERS: Lazy<Histogram> = Lazy::new(|| {
|
||||
pub(crate) static READ_NUM_LAYERS_VISITED: Lazy<Histogram> = Lazy::new(|| {
|
||||
register_histogram!(
|
||||
"pageserver_read_num_fs_layers",
|
||||
"Number of persistent layers accessed for processing a read request, including those in the cache",
|
||||
vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 10.0, 20.0, 50.0, 100.0],
|
||||
"pageserver_layers_visited_per_read_global",
|
||||
"Number of layers visited to reconstruct one key",
|
||||
vec![1.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0],
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) static VEC_READ_NUM_LAYERS_VISITED: Lazy<Histogram> = Lazy::new(|| {
|
||||
register_histogram!(
|
||||
"pageserver_layers_visited_per_vectored_read_global",
|
||||
"Average number of layers visited to reconstruct one key",
|
||||
vec![1.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0],
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
@@ -1518,7 +1527,8 @@ pub(crate) struct SecondaryModeMetrics {
|
||||
pub(crate) download_heatmap: IntCounter,
|
||||
pub(crate) download_layer: IntCounter,
|
||||
}
|
||||
pub(crate) static SECONDARY_MODE: Lazy<SecondaryModeMetrics> = Lazy::new(|| SecondaryModeMetrics {
|
||||
pub(crate) static SECONDARY_MODE: Lazy<SecondaryModeMetrics> = Lazy::new(|| {
|
||||
SecondaryModeMetrics {
|
||||
upload_heatmap: register_int_counter!(
|
||||
"pageserver_secondary_upload_heatmap",
|
||||
"Number of heatmaps written to remote storage by attached tenants"
|
||||
@@ -1536,7 +1546,7 @@ pub(crate) static SECONDARY_MODE: Lazy<SecondaryModeMetrics> = Lazy::new(|| Seco
|
||||
.expect("failed to define a metric"),
|
||||
download_heatmap: register_int_counter!(
|
||||
"pageserver_secondary_download_heatmap",
|
||||
"Number of downloads of heatmaps by secondary mode locations"
|
||||
"Number of downloads of heatmaps by secondary mode locations, including when it hasn't changed"
|
||||
)
|
||||
.expect("failed to define a metric"),
|
||||
download_layer: register_int_counter!(
|
||||
@@ -1544,6 +1554,7 @@ pub(crate) static SECONDARY_MODE: Lazy<SecondaryModeMetrics> = Lazy::new(|| Seco
|
||||
"Number of downloads of layers by secondary mode locations"
|
||||
)
|
||||
.expect("failed to define a metric"),
|
||||
}
|
||||
});
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
|
||||
@@ -1819,6 +1830,29 @@ impl Default for WalRedoProcessCounters {
|
||||
pub(crate) static WAL_REDO_PROCESS_COUNTERS: Lazy<WalRedoProcessCounters> =
|
||||
Lazy::new(WalRedoProcessCounters::default);
|
||||
|
||||
#[cfg(not(test))]
|
||||
pub mod wal_redo {
|
||||
use super::*;
|
||||
|
||||
static PROCESS_KIND: Lazy<std::sync::Mutex<UIntGaugeVec>> = Lazy::new(|| {
|
||||
std::sync::Mutex::new(
|
||||
register_uint_gauge_vec!(
|
||||
"pageserver_wal_redo_process_kind",
|
||||
"The configured process kind for walredo",
|
||||
&["kind"],
|
||||
)
|
||||
.unwrap(),
|
||||
)
|
||||
});
|
||||
|
||||
pub fn set_process_kind_metric(kind: crate::walredo::ProcessKind) {
|
||||
// use guard to avoid races around the next two steps
|
||||
let guard = PROCESS_KIND.lock().unwrap();
|
||||
guard.reset();
|
||||
guard.with_label_values(&[&format!("{kind}")]).set(1);
|
||||
}
|
||||
}
|
||||
|
||||
/// Similar to `prometheus::HistogramTimer` but does not record on drop.
|
||||
pub(crate) struct StorageTimeMetricsTimer {
|
||||
metrics: StorageTimeMetrics,
|
||||
@@ -2089,7 +2123,7 @@ impl TimelineMetrics {
|
||||
|
||||
pub(crate) fn remove_tenant_metrics(tenant_shard_id: &TenantShardId) {
|
||||
// Only shard zero deals in synthetic sizes
|
||||
if tenant_shard_id.is_zero() {
|
||||
if tenant_shard_id.is_shard_zero() {
|
||||
let tid = tenant_shard_id.tenant_id.to_string();
|
||||
let _ = TENANT_SYNTHETIC_SIZE_METRIC.remove_label_values(&[&tid]);
|
||||
}
|
||||
@@ -2746,7 +2780,8 @@ pub fn preinitialize_metrics() {
|
||||
|
||||
// histograms
|
||||
[
|
||||
&READ_NUM_FS_LAYERS,
|
||||
&READ_NUM_LAYERS_VISITED,
|
||||
&VEC_READ_NUM_LAYERS_VISITED,
|
||||
&WAIT_LSN_TIME,
|
||||
&WAL_REDO_TIME,
|
||||
&WAL_REDO_RECORDS_HISTOGRAM,
|
||||
|
||||
@@ -874,6 +874,11 @@ impl PageServerHandler {
|
||||
// walsender completes the authentication and starts streaming the
|
||||
// WAL.
|
||||
if lsn <= last_record_lsn {
|
||||
// It might be better to use max(lsn, latest_gc_cutoff_lsn) instead of
|
||||
// last_record_lsn. That would give the same result, since we know
|
||||
// that there haven't been modifications since 'lsn'. Using an older
|
||||
// LSN might be faster, because that could allow skipping recent
|
||||
// layers when finding the page.
|
||||
lsn = last_record_lsn;
|
||||
} else {
|
||||
timeline
|
||||
@@ -1201,6 +1206,10 @@ impl PageServerHandler {
|
||||
))
|
||||
}
|
||||
|
||||
/// Note on "fullbackup":
|
||||
/// Full basebackups should only be used for debugging purposes.
|
||||
/// Originally, it was introduced to enable breaking storage format changes,
|
||||
/// but that is not applicable anymore.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
#[instrument(skip_all, fields(shard_id, ?lsn, ?prev_lsn, %full_backup))]
|
||||
async fn handle_basebackup_request<IO>(
|
||||
|
||||
@@ -252,16 +252,8 @@ impl Timeline {
|
||||
let mut buf = version.get(self, key, ctx).await?;
|
||||
let nblocks = buf.get_u32_le();
|
||||
|
||||
if latest {
|
||||
// Update relation size cache only if "latest" flag is set.
|
||||
// This flag is set by compute when it is working with most recent version of relation.
|
||||
// Typically master compute node always set latest=true.
|
||||
// Please notice, that even if compute node "by mistake" specifies old LSN but set
|
||||
// latest=true, then it can not cause cache corruption, because with latest=true
|
||||
// pageserver choose max(request_lsn, last_written_lsn) and so cached value will be
|
||||
// associated with most recent value of LSN.
|
||||
self.update_cached_rel_size(tag, version.get_lsn(), nblocks);
|
||||
}
|
||||
self.update_cached_rel_size(tag, version.get_lsn(), nblocks);
|
||||
|
||||
Ok(nblocks)
|
||||
}
|
||||
|
||||
@@ -817,7 +809,7 @@ impl Timeline {
|
||||
/// Get the cached size of a relation if it was not updated after the specified LSN
|
||||
pub fn get_cached_rel_size(&self, tag: &RelTag, lsn: Lsn) -> Option<BlockNumber> {
|
||||
let rel_size_cache = self.rel_size_cache.read().unwrap();
|
||||
if let Some((cached_lsn, nblocks)) = rel_size_cache.get(tag) {
|
||||
if let Some((cached_lsn, nblocks)) = rel_size_cache.map.get(tag) {
|
||||
if lsn >= *cached_lsn {
|
||||
return Some(*nblocks);
|
||||
}
|
||||
@@ -828,7 +820,16 @@ impl Timeline {
|
||||
/// Update cached relation size if there is no more recent update
|
||||
pub fn update_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber) {
|
||||
let mut rel_size_cache = self.rel_size_cache.write().unwrap();
|
||||
match rel_size_cache.entry(tag) {
|
||||
|
||||
if lsn < rel_size_cache.complete_as_of {
|
||||
// Do not cache old values. It's safe to cache the size on read, as long as
|
||||
// the read was at an LSN since we started the WAL ingestion. Reasoning: we
|
||||
// never evict values from the cache, so if the relation size changed after
|
||||
// 'lsn', the new value is already in the cache.
|
||||
return;
|
||||
}
|
||||
|
||||
match rel_size_cache.map.entry(tag) {
|
||||
hash_map::Entry::Occupied(mut entry) => {
|
||||
let cached_lsn = entry.get_mut();
|
||||
if lsn >= cached_lsn.0 {
|
||||
@@ -844,13 +845,13 @@ impl Timeline {
|
||||
/// Store cached relation size
|
||||
pub fn set_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber) {
|
||||
let mut rel_size_cache = self.rel_size_cache.write().unwrap();
|
||||
rel_size_cache.insert(tag, (lsn, nblocks));
|
||||
rel_size_cache.map.insert(tag, (lsn, nblocks));
|
||||
}
|
||||
|
||||
/// Remove cached relation size
|
||||
pub fn remove_cached_rel_size(&self, tag: &RelTag) {
|
||||
let mut rel_size_cache = self.rel_size_cache.write().unwrap();
|
||||
rel_size_cache.remove(tag);
|
||||
rel_size_cache.map.remove(tag);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1401,7 +1402,7 @@ impl<'a> DatadirModification<'a> {
|
||||
let n_files;
|
||||
let mut aux_files = self.tline.aux_files.lock().await;
|
||||
if let Some(mut dir) = aux_files.dir.take() {
|
||||
// We already updated aux files in `self`: emit a delta and update our latest value
|
||||
// We already updated aux files in `self`: emit a delta and update our latest value.
|
||||
dir.upsert(file_path.clone(), content.clone());
|
||||
n_files = dir.files.len();
|
||||
if aux_files.n_deltas == MAX_AUX_FILE_DELTAS {
|
||||
@@ -1446,10 +1447,14 @@ impl<'a> DatadirModification<'a> {
|
||||
// reset the map.
|
||||
return Err(e.into());
|
||||
}
|
||||
// FIXME: PageReconstructError doesn't have an explicit variant for key-not-found, so
|
||||
// we are assuming that all _other_ possible errors represent a missing key. If some
|
||||
// other error occurs, we may incorrectly reset the map of aux files.
|
||||
Err(PageReconstructError::Other(_) | PageReconstructError::WalRedo(_)) => {
|
||||
// Note: we added missing key error variant in https://github.com/neondatabase/neon/pull/7393 but
|
||||
// the original code assumes all other errors are missing keys. Therefore, we keep the code path
|
||||
// the same for now, though in theory, we should only match the `MissingKey` variant.
|
||||
Err(
|
||||
PageReconstructError::Other(_)
|
||||
| PageReconstructError::WalRedo(_)
|
||||
| PageReconstructError::MissingKey { .. },
|
||||
) => {
|
||||
// Key is missing, we must insert an image as the basis for subsequent deltas.
|
||||
|
||||
let mut dir = AuxFilesDirectory {
|
||||
|
||||
@@ -33,6 +33,52 @@ impl Value {
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
#[derive(Debug, PartialEq)]
|
||||
pub(crate) enum InvalidInput {
|
||||
TooShortValue,
|
||||
TooShortPostgresRecord,
|
||||
}
|
||||
|
||||
/// We could have a ValueRef where everything is `serde(borrow)`. Before implementing that, let's
|
||||
/// use this type for querying if a slice looks some particular way.
|
||||
#[cfg(test)]
|
||||
pub(crate) struct ValueBytes;
|
||||
|
||||
#[cfg(test)]
|
||||
impl ValueBytes {
|
||||
pub(crate) fn will_init(raw: &[u8]) -> Result<bool, InvalidInput> {
|
||||
if raw.len() < 12 {
|
||||
return Err(InvalidInput::TooShortValue);
|
||||
}
|
||||
|
||||
let value_discriminator = &raw[0..4];
|
||||
|
||||
if value_discriminator == [0, 0, 0, 0] {
|
||||
// Value::Image always initializes
|
||||
return Ok(true);
|
||||
}
|
||||
|
||||
if value_discriminator != [0, 0, 0, 1] {
|
||||
// not a Value::WalRecord(..)
|
||||
return Ok(false);
|
||||
}
|
||||
|
||||
let walrecord_discriminator = &raw[4..8];
|
||||
|
||||
if walrecord_discriminator != [0, 0, 0, 0] {
|
||||
// only NeonWalRecord::Postgres can have will_init
|
||||
return Ok(false);
|
||||
}
|
||||
|
||||
if raw.len() < 17 {
|
||||
return Err(InvalidInput::TooShortPostgresRecord);
|
||||
}
|
||||
|
||||
Ok(raw[8] == 1)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use super::*;
|
||||
@@ -70,6 +116,8 @@ mod test {
|
||||
];
|
||||
|
||||
roundtrip!(image, expected);
|
||||
|
||||
assert!(ValueBytes::will_init(&expected).unwrap());
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -93,6 +141,96 @@ mod test {
|
||||
];
|
||||
|
||||
roundtrip!(rec, expected);
|
||||
|
||||
assert!(ValueBytes::will_init(&expected).unwrap());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn bytes_inspection_too_short_image() {
|
||||
let rec = Value::Image(Bytes::from_static(b""));
|
||||
|
||||
#[rustfmt::skip]
|
||||
let expected = [
|
||||
// top level discriminator of 4 bytes
|
||||
0x00, 0x00, 0x00, 0x00,
|
||||
// 8 byte length
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
];
|
||||
|
||||
roundtrip!(rec, expected);
|
||||
|
||||
assert!(ValueBytes::will_init(&expected).unwrap());
|
||||
assert_eq!(expected.len(), 12);
|
||||
for len in 0..12 {
|
||||
assert_eq!(
|
||||
ValueBytes::will_init(&expected[..len]).unwrap_err(),
|
||||
InvalidInput::TooShortValue
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn bytes_inspection_too_short_postgres_record() {
|
||||
let rec = NeonWalRecord::Postgres {
|
||||
will_init: false,
|
||||
rec: Bytes::from_static(b""),
|
||||
};
|
||||
let rec = Value::WalRecord(rec);
|
||||
|
||||
#[rustfmt::skip]
|
||||
let expected = [
|
||||
// flattened discriminator of total 8 bytes
|
||||
0x00, 0x00, 0x00, 0x01,
|
||||
0x00, 0x00, 0x00, 0x00,
|
||||
// will_init
|
||||
0x00,
|
||||
// 8 byte length
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
];
|
||||
|
||||
roundtrip!(rec, expected);
|
||||
|
||||
assert!(!ValueBytes::will_init(&expected).unwrap());
|
||||
assert_eq!(expected.len(), 17);
|
||||
for len in 12..17 {
|
||||
assert_eq!(
|
||||
ValueBytes::will_init(&expected[..len]).unwrap_err(),
|
||||
InvalidInput::TooShortPostgresRecord
|
||||
)
|
||||
}
|
||||
for len in 0..12 {
|
||||
assert_eq!(
|
||||
ValueBytes::will_init(&expected[..len]).unwrap_err(),
|
||||
InvalidInput::TooShortValue
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn clear_visibility_map_flags_example() {
|
||||
let rec = NeonWalRecord::ClearVisibilityMapFlags {
|
||||
new_heap_blkno: Some(0x11),
|
||||
old_heap_blkno: None,
|
||||
flags: 0x03,
|
||||
};
|
||||
let rec = Value::WalRecord(rec);
|
||||
|
||||
#[rustfmt::skip]
|
||||
let expected = [
|
||||
// discriminators
|
||||
0x00, 0x00, 0x00, 0x01,
|
||||
0x00, 0x00, 0x00, 0x01,
|
||||
// Some == 1 followed by 4 bytes
|
||||
0x01, 0x00, 0x00, 0x00, 0x11,
|
||||
// None == 0
|
||||
0x00,
|
||||
// flags
|
||||
0x03
|
||||
];
|
||||
|
||||
roundtrip!(rec, expected);
|
||||
|
||||
assert!(!ValueBytes::will_init(&expected).unwrap());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -386,7 +386,7 @@ impl WalRedoManager {
|
||||
|
||||
pub(crate) fn status(&self) -> Option<WalRedoManagerStatus> {
|
||||
match self {
|
||||
WalRedoManager::Prod(m) => m.status(),
|
||||
WalRedoManager::Prod(m) => Some(m.status()),
|
||||
#[cfg(test)]
|
||||
WalRedoManager::Test(_) => None,
|
||||
}
|
||||
@@ -559,9 +559,10 @@ impl Tenant {
|
||||
// By doing what we do here, the index part upload is retried.
|
||||
// If control plane retries timeline creation in the meantime, the mgmt API handler
|
||||
// for timeline creation will coalesce on the upload we queue here.
|
||||
// FIXME: this branch should be dead code as we no longer write local metadata.
|
||||
let rtc = timeline.remote_client.as_ref().unwrap();
|
||||
rtc.init_upload_queue_for_empty_remote(&metadata)?;
|
||||
rtc.schedule_index_upload_for_metadata_update(&metadata)?;
|
||||
rtc.schedule_index_upload_for_full_metadata_update(&metadata)?;
|
||||
}
|
||||
|
||||
timeline
|
||||
@@ -2869,20 +2870,23 @@ impl Tenant {
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(cutoff) = timeline.get_last_record_lsn().checked_sub(horizon) {
|
||||
let branchpoints: Vec<Lsn> = all_branchpoints
|
||||
.range((
|
||||
Included((timeline_id, Lsn(0))),
|
||||
Included((timeline_id, Lsn(u64::MAX))),
|
||||
))
|
||||
.map(|&x| x.1)
|
||||
.collect();
|
||||
timeline
|
||||
.update_gc_info(branchpoints, cutoff, pitr, cancel, ctx)
|
||||
.await?;
|
||||
let cutoff = timeline
|
||||
.get_last_record_lsn()
|
||||
.checked_sub(horizon)
|
||||
.unwrap_or(Lsn(0));
|
||||
|
||||
gc_timelines.push(timeline);
|
||||
}
|
||||
let branchpoints: Vec<Lsn> = all_branchpoints
|
||||
.range((
|
||||
Included((timeline_id, Lsn(0))),
|
||||
Included((timeline_id, Lsn(u64::MAX))),
|
||||
))
|
||||
.map(|&x| x.1)
|
||||
.collect();
|
||||
timeline
|
||||
.update_gc_info(branchpoints, cutoff, pitr, cancel, ctx)
|
||||
.await?;
|
||||
|
||||
gc_timelines.push(timeline);
|
||||
}
|
||||
drop(gc_cs);
|
||||
Ok(gc_timelines)
|
||||
@@ -3027,7 +3031,7 @@ impl Tenant {
|
||||
// See also https://github.com/neondatabase/neon/issues/3865
|
||||
if let Some(remote_client) = new_timeline.remote_client.as_ref() {
|
||||
remote_client
|
||||
.schedule_index_upload_for_metadata_update(&metadata)
|
||||
.schedule_index_upload_for_full_metadata_update(&metadata)
|
||||
.context("branch initial metadata upload")?;
|
||||
}
|
||||
|
||||
@@ -3190,7 +3194,7 @@ impl Tenant {
|
||||
run_initdb(self.conf, &pgdata_path, pg_version, &self.cancel).await?;
|
||||
|
||||
// Upload the created data dir to S3
|
||||
if self.tenant_shard_id().is_zero() {
|
||||
if self.tenant_shard_id().is_shard_zero() {
|
||||
self.upload_initdb(&timelines_path, &pgdata_path, &timeline_id)
|
||||
.await?;
|
||||
}
|
||||
@@ -3437,7 +3441,7 @@ impl Tenant {
|
||||
.store(size, Ordering::Relaxed);
|
||||
|
||||
// Only shard zero should be calculating synthetic sizes
|
||||
debug_assert!(self.shard_identity.is_zero());
|
||||
debug_assert!(self.shard_identity.is_shard_zero());
|
||||
|
||||
TENANT_SYNTHETIC_SIZE_METRIC
|
||||
.get_metric_with_label_values(&[&self.tenant_shard_id.tenant_id.to_string()])
|
||||
@@ -3848,6 +3852,8 @@ pub(crate) mod harness {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::collections::BTreeMap;
|
||||
|
||||
use super::*;
|
||||
use crate::keyspace::KeySpaceAccum;
|
||||
use crate::repository::{Key, Value};
|
||||
@@ -3856,9 +3862,10 @@ mod tests {
|
||||
use crate::DEFAULT_PG_VERSION;
|
||||
use bytes::BytesMut;
|
||||
use hex_literal::hex;
|
||||
use pageserver_api::key::NON_INHERITED_RANGE;
|
||||
use pageserver_api::keyspace::KeySpace;
|
||||
use rand::{thread_rng, Rng};
|
||||
use tests::timeline::ShutdownMode;
|
||||
use tests::timeline::{GetVectoredError, ShutdownMode};
|
||||
|
||||
static TEST_KEY: Lazy<Key> =
|
||||
Lazy::new(|| Key::from_slice(&hex!("010000000033333333444444445500000001")));
|
||||
@@ -4655,6 +4662,62 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_get_vectored_aux_files() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("test_get_vectored_aux_files")?;
|
||||
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
let tline = tenant
|
||||
.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)
|
||||
.await?;
|
||||
let tline = tline.raw_timeline().unwrap();
|
||||
|
||||
let mut modification = tline.begin_modification(Lsn(0x1000));
|
||||
modification.put_file("foo/bar1", b"content1", &ctx).await?;
|
||||
modification.set_lsn(Lsn(0x1008))?;
|
||||
modification.put_file("foo/bar2", b"content2", &ctx).await?;
|
||||
modification.commit(&ctx).await?;
|
||||
|
||||
let child_timeline_id = TimelineId::generate();
|
||||
tenant
|
||||
.branch_timeline_test(
|
||||
tline,
|
||||
child_timeline_id,
|
||||
Some(tline.get_last_record_lsn()),
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
|
||||
let child_timeline = tenant
|
||||
.get_timeline(child_timeline_id, true)
|
||||
.expect("Should have the branched timeline");
|
||||
|
||||
let aux_keyspace = KeySpace {
|
||||
ranges: vec![NON_INHERITED_RANGE],
|
||||
};
|
||||
let read_lsn = child_timeline.get_last_record_lsn();
|
||||
|
||||
let vectored_res = child_timeline
|
||||
.get_vectored_impl(aux_keyspace.clone(), read_lsn, &ctx)
|
||||
.await;
|
||||
|
||||
child_timeline
|
||||
.validate_get_vectored_impl(&vectored_res, aux_keyspace, read_lsn, &ctx)
|
||||
.await;
|
||||
|
||||
let images = vectored_res?;
|
||||
let mut key = NON_INHERITED_RANGE.start;
|
||||
while key < NON_INHERITED_RANGE.end {
|
||||
assert!(matches!(
|
||||
images[&key],
|
||||
Err(PageReconstructError::MissingKey(_))
|
||||
));
|
||||
key = key.next();
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Test that vectored get handles layer gaps correctly
|
||||
// by advancing into the next ancestor timeline if required.
|
||||
//
|
||||
@@ -4794,6 +4857,166 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Test that vectored get descends into ancestor timelines correctly and
|
||||
// does not return an image that's newer than requested.
|
||||
//
|
||||
// The diagram below illustrates an interesting case. We have a parent timeline
|
||||
// (top of the Lsn range) and a child timeline. The request key cannot be reconstructed
|
||||
// from the child timeline, so the parent timeline must be visited. When advancing into
|
||||
// the child timeline, the read path needs to remember what the requested Lsn was in
|
||||
// order to avoid returning an image that's too new. The test below constructs such
|
||||
// a timeline setup and does a few queries around the Lsn of each page image.
|
||||
// ```
|
||||
// LSN
|
||||
// ^
|
||||
// |
|
||||
// |
|
||||
// 500 | --------------------------------------> branch point
|
||||
// 400 | X
|
||||
// 300 | X
|
||||
// 200 | --------------------------------------> requested lsn
|
||||
// 100 | X
|
||||
// |---------------------------------------> Key
|
||||
// |
|
||||
// ------> requested key
|
||||
//
|
||||
// Legend:
|
||||
// * X - page images
|
||||
// ```
|
||||
#[tokio::test]
|
||||
async fn test_get_vectored_ancestor_descent() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("test_get_vectored_on_lsn_axis")?;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
let start_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
|
||||
let end_key = start_key.add(1000);
|
||||
let child_gap_at_key = start_key.add(500);
|
||||
let mut parent_gap_lsns: BTreeMap<Lsn, String> = BTreeMap::new();
|
||||
|
||||
let mut current_lsn = Lsn(0x10);
|
||||
|
||||
let timeline_id = TimelineId::generate();
|
||||
let parent_timeline = tenant
|
||||
.create_test_timeline(timeline_id, current_lsn, DEFAULT_PG_VERSION, &ctx)
|
||||
.await?;
|
||||
|
||||
current_lsn += 0x100;
|
||||
|
||||
for _ in 0..3 {
|
||||
let mut key = start_key;
|
||||
while key < end_key {
|
||||
current_lsn += 0x10;
|
||||
|
||||
let image_value = format!("{} at {}", child_gap_at_key, current_lsn);
|
||||
|
||||
let mut writer = parent_timeline.writer().await;
|
||||
writer
|
||||
.put(
|
||||
key,
|
||||
current_lsn,
|
||||
&Value::Image(test_img(&image_value)),
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
writer.finish_write(current_lsn);
|
||||
|
||||
if key == child_gap_at_key {
|
||||
parent_gap_lsns.insert(current_lsn, image_value);
|
||||
}
|
||||
|
||||
key = key.next();
|
||||
}
|
||||
|
||||
parent_timeline.freeze_and_flush().await?;
|
||||
}
|
||||
|
||||
let child_timeline_id = TimelineId::generate();
|
||||
|
||||
let child_timeline = tenant
|
||||
.branch_timeline_test(&parent_timeline, child_timeline_id, Some(current_lsn), &ctx)
|
||||
.await?;
|
||||
|
||||
let mut key = start_key;
|
||||
while key < end_key {
|
||||
if key == child_gap_at_key {
|
||||
key = key.next();
|
||||
continue;
|
||||
}
|
||||
|
||||
current_lsn += 0x10;
|
||||
|
||||
let mut writer = child_timeline.writer().await;
|
||||
writer
|
||||
.put(
|
||||
key,
|
||||
current_lsn,
|
||||
&Value::Image(test_img(&format!("{} at {}", key, current_lsn))),
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
writer.finish_write(current_lsn);
|
||||
|
||||
key = key.next();
|
||||
}
|
||||
|
||||
child_timeline.freeze_and_flush().await?;
|
||||
|
||||
let lsn_offsets: [i64; 5] = [-10, -1, 0, 1, 10];
|
||||
let mut query_lsns = Vec::new();
|
||||
for image_lsn in parent_gap_lsns.keys().rev() {
|
||||
for offset in lsn_offsets {
|
||||
query_lsns.push(Lsn(image_lsn
|
||||
.0
|
||||
.checked_add_signed(offset)
|
||||
.expect("Shouldn't overflow")));
|
||||
}
|
||||
}
|
||||
|
||||
for query_lsn in query_lsns {
|
||||
let results = child_timeline
|
||||
.get_vectored_impl(
|
||||
KeySpace {
|
||||
ranges: vec![child_gap_at_key..child_gap_at_key.next()],
|
||||
},
|
||||
query_lsn,
|
||||
&ctx,
|
||||
)
|
||||
.await;
|
||||
|
||||
let expected_item = parent_gap_lsns
|
||||
.iter()
|
||||
.rev()
|
||||
.find(|(lsn, _)| **lsn <= query_lsn);
|
||||
|
||||
info!(
|
||||
"Doing vectored read at LSN {}. Expecting image to be: {:?}",
|
||||
query_lsn, expected_item
|
||||
);
|
||||
|
||||
match expected_item {
|
||||
Some((_, img_value)) => {
|
||||
let key_results = results.expect("No vectored get error expected");
|
||||
let key_result = &key_results[&child_gap_at_key];
|
||||
let returned_img = key_result
|
||||
.as_ref()
|
||||
.expect("No page reconstruct error expected");
|
||||
|
||||
info!(
|
||||
"Vectored read at LSN {} returned image {}",
|
||||
query_lsn,
|
||||
std::str::from_utf8(returned_img)?
|
||||
);
|
||||
assert_eq!(*returned_img, test_img(img_value));
|
||||
}
|
||||
None => {
|
||||
assert!(matches!(results, Err(GetVectoredError::MissingKey(_))));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_random_updates() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("test_random_updates")?;
|
||||
|
||||
@@ -436,6 +436,11 @@ impl DeleteTenantFlow {
|
||||
.await
|
||||
}
|
||||
|
||||
/// Check whether background deletion of this tenant is currently in progress
|
||||
pub(crate) fn is_in_progress(tenant: &Tenant) -> bool {
|
||||
tenant.delete_progress.try_lock().is_err()
|
||||
}
|
||||
|
||||
async fn prepare(
|
||||
tenant: &Arc<Tenant>,
|
||||
) -> Result<tokio::sync::OwnedMutexGuard<Self>, DeleteTenantError> {
|
||||
|
||||
@@ -235,6 +235,12 @@ impl TimelineMetadata {
|
||||
let bytes = instance.to_bytes().unwrap();
|
||||
Self::from_bytes(&bytes).unwrap()
|
||||
}
|
||||
|
||||
pub(crate) fn apply(&mut self, update: &MetadataUpdate) {
|
||||
self.body.disk_consistent_lsn = update.disk_consistent_lsn;
|
||||
self.body.prev_record_lsn = update.prev_record_lsn;
|
||||
self.body.latest_gc_cutoff_lsn = update.latest_gc_cutoff_lsn;
|
||||
}
|
||||
}
|
||||
|
||||
impl<'de> Deserialize<'de> for TimelineMetadata {
|
||||
@@ -259,6 +265,27 @@ impl Serialize for TimelineMetadata {
|
||||
}
|
||||
}
|
||||
|
||||
/// Parts of the metadata which are regularly modified.
|
||||
pub(crate) struct MetadataUpdate {
|
||||
disk_consistent_lsn: Lsn,
|
||||
prev_record_lsn: Option<Lsn>,
|
||||
latest_gc_cutoff_lsn: Lsn,
|
||||
}
|
||||
|
||||
impl MetadataUpdate {
|
||||
pub(crate) fn new(
|
||||
disk_consistent_lsn: Lsn,
|
||||
prev_record_lsn: Option<Lsn>,
|
||||
latest_gc_cutoff_lsn: Lsn,
|
||||
) -> Self {
|
||||
Self {
|
||||
disk_consistent_lsn,
|
||||
prev_record_lsn,
|
||||
latest_gc_cutoff_lsn,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
@@ -678,12 +678,19 @@ pub async fn init_tenant_mgr(
|
||||
}
|
||||
}
|
||||
}
|
||||
LocationMode::Secondary(secondary_conf) => TenantSlot::Secondary(SecondaryTenant::new(
|
||||
tenant_shard_id,
|
||||
shard_identity,
|
||||
location_conf.tenant_conf,
|
||||
&secondary_conf,
|
||||
)),
|
||||
LocationMode::Secondary(secondary_conf) => {
|
||||
info!(
|
||||
tenant_id = %tenant_shard_id.tenant_id,
|
||||
shard_id = %tenant_shard_id.shard_slug(),
|
||||
"Starting secondary tenant"
|
||||
);
|
||||
TenantSlot::Secondary(SecondaryTenant::new(
|
||||
tenant_shard_id,
|
||||
shard_identity,
|
||||
location_conf.tenant_conf,
|
||||
&secondary_conf,
|
||||
))
|
||||
}
|
||||
};
|
||||
|
||||
tenants.insert(tenant_shard_id, slot);
|
||||
@@ -1410,9 +1417,15 @@ impl TenantManager {
|
||||
|
||||
match tenant.current_state() {
|
||||
TenantState::Broken { .. } | TenantState::Stopping { .. } => {
|
||||
// If a tenant is broken or stopping, DeleteTenantFlow can
|
||||
// handle it: broken tenants proceed to delete, stopping tenants
|
||||
// are checked for deletion already in progress.
|
||||
// If deletion is already in progress, return success (the semantics of this
|
||||
// function are to return success after deletion is spawned in background).
|
||||
// Otherwise fall through and let [`DeleteTenantFlow`] handle this state.
|
||||
if DeleteTenantFlow::is_in_progress(&tenant) {
|
||||
// The `delete_progress` lock is held: deletion is already happening
|
||||
// in the background
|
||||
slot_guard.revert();
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
_ => {
|
||||
tenant
|
||||
|
||||
@@ -202,7 +202,9 @@ use std::sync::atomic::{AtomicU32, Ordering};
|
||||
use std::sync::{Arc, Mutex};
|
||||
use std::time::Duration;
|
||||
|
||||
use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath, TimeoutOrCancel};
|
||||
use remote_storage::{
|
||||
DownloadError, GenericRemoteStorage, ListingMode, RemotePath, TimeoutOrCancel,
|
||||
};
|
||||
use std::ops::DerefMut;
|
||||
use tracing::{debug, error, info, instrument, warn};
|
||||
use tracing::{info_span, Instrument};
|
||||
@@ -236,6 +238,7 @@ use utils::id::{TenantId, TimelineId};
|
||||
|
||||
use self::index::IndexPart;
|
||||
|
||||
use super::metadata::MetadataUpdate;
|
||||
use super::storage_layer::{Layer, LayerFileName, ResidentLayer};
|
||||
use super::upload_queue::SetDeletedFlagProgress;
|
||||
use super::Generation;
|
||||
@@ -536,9 +539,10 @@ impl RemoteTimelineClient {
|
||||
// Upload operations.
|
||||
//
|
||||
|
||||
///
|
||||
/// Launch an index-file upload operation in the background, with
|
||||
/// updated metadata.
|
||||
/// fully updated metadata.
|
||||
///
|
||||
/// This should only be used to upload initial metadata to remote storage.
|
||||
///
|
||||
/// The upload will be added to the queue immediately, but it
|
||||
/// won't be performed until all previously scheduled layer file
|
||||
@@ -550,7 +554,7 @@ impl RemoteTimelineClient {
|
||||
/// If there were any changes to the list of files, i.e. if any
|
||||
/// layer file uploads were scheduled, since the last index file
|
||||
/// upload, those will be included too.
|
||||
pub fn schedule_index_upload_for_metadata_update(
|
||||
pub fn schedule_index_upload_for_full_metadata_update(
|
||||
self: &Arc<Self>,
|
||||
metadata: &TimelineMetadata,
|
||||
) -> anyhow::Result<()> {
|
||||
@@ -566,6 +570,27 @@ impl RemoteTimelineClient {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Launch an index-file upload operation in the background, with only parts of the metadata
|
||||
/// updated.
|
||||
///
|
||||
/// This is the regular way of updating metadata on layer flushes or Gc.
|
||||
///
|
||||
/// Using this lighter update mechanism allows for reparenting and detaching without changes to
|
||||
/// `index_part.json`, while being more clear on what values update regularly.
|
||||
pub(crate) fn schedule_index_upload_for_metadata_update(
|
||||
self: &Arc<Self>,
|
||||
update: &MetadataUpdate,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut guard = self.upload_queue.lock().unwrap();
|
||||
let upload_queue = guard.initialized_mut()?;
|
||||
|
||||
upload_queue.latest_metadata.apply(update);
|
||||
|
||||
self.schedule_index_upload(upload_queue, upload_queue.latest_metadata.clone());
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
///
|
||||
/// Launch an index-file upload operation in the background, if necessary.
|
||||
///
|
||||
@@ -1122,7 +1147,7 @@ impl RemoteTimelineClient {
|
||||
// and retry will arrive at a different pageserver, there won't be any traces of it on remote storage
|
||||
let timeline_storage_path = remote_timeline_path(&self.tenant_shard_id, &self.timeline_id);
|
||||
|
||||
// Execute all pending deletions, so that when we proceed to do a list_prefixes below, we aren't
|
||||
// Execute all pending deletions, so that when we proceed to do a listing below, we aren't
|
||||
// taking the burden of listing all the layers that we already know we should delete.
|
||||
self.flush_deletion_queue().await?;
|
||||
|
||||
@@ -1131,14 +1156,20 @@ impl RemoteTimelineClient {
|
||||
let remaining = download_retry(
|
||||
|| async {
|
||||
self.storage_impl
|
||||
.list_files(Some(&timeline_storage_path), None, &cancel)
|
||||
.list(
|
||||
Some(&timeline_storage_path),
|
||||
ListingMode::NoDelimiter,
|
||||
None,
|
||||
&cancel,
|
||||
)
|
||||
.await
|
||||
},
|
||||
"list remaining files",
|
||||
&cancel,
|
||||
)
|
||||
.await
|
||||
.context("list files remaining files")?;
|
||||
.context("list files remaining files")?
|
||||
.keys;
|
||||
|
||||
// We will delete the current index_part object last, since it acts as a deletion
|
||||
// marker via its deleted_at attribute
|
||||
@@ -2024,7 +2055,7 @@ mod tests {
|
||||
// Schedule upload of index. Check that it is queued
|
||||
let metadata = dummy_metadata(Lsn(0x20));
|
||||
client
|
||||
.schedule_index_upload_for_metadata_update(&metadata)
|
||||
.schedule_index_upload_for_full_metadata_update(&metadata)
|
||||
.unwrap();
|
||||
{
|
||||
let mut guard = client.upload_queue.lock().unwrap();
|
||||
|
||||
@@ -258,7 +258,7 @@ pub async fn list_remote_timelines(
|
||||
tenant_shard_id: TenantShardId,
|
||||
cancel: CancellationToken,
|
||||
) -> anyhow::Result<(HashSet<TimelineId>, HashSet<String>)> {
|
||||
let remote_path = remote_timelines_path(&tenant_shard_id);
|
||||
let remote_path = remote_timelines_path(&tenant_shard_id).add_trailing_slash();
|
||||
|
||||
fail::fail_point!("storage-sync-list-remote-timelines", |_| {
|
||||
anyhow::bail!("storage-sync-list-remote-timelines");
|
||||
@@ -417,11 +417,16 @@ pub(super) async fn download_index_part(
|
||||
let index_prefix = remote_index_path(tenant_shard_id, timeline_id, Generation::none());
|
||||
|
||||
let indices = download_retry(
|
||||
|| async { storage.list_files(Some(&index_prefix), None, cancel).await },
|
||||
|| async {
|
||||
storage
|
||||
.list(Some(&index_prefix), ListingMode::NoDelimiter, None, cancel)
|
||||
.await
|
||||
},
|
||||
"list index_part files",
|
||||
cancel,
|
||||
)
|
||||
.await?;
|
||||
.await?
|
||||
.keys;
|
||||
|
||||
// General case logic for which index to use: the latest index whose generation
|
||||
// is <= our own. See "Finding the remote indices for timelines" in docs/rfcs/025-generation-numbers.md
|
||||
|
||||
@@ -167,7 +167,7 @@ pub(crate) async fn time_travel_recover_tenant(
|
||||
let warn_after = 3;
|
||||
let max_attempts = 10;
|
||||
let mut prefixes = Vec::with_capacity(2);
|
||||
if tenant_shard_id.is_zero() {
|
||||
if tenant_shard_id.is_shard_zero() {
|
||||
// Also recover the unsharded prefix for a shard of zero:
|
||||
// - if the tenant is totally unsharded, the unsharded prefix contains all the data
|
||||
// - if the tenant is sharded, we still want to recover the initdb data, but we only
|
||||
|
||||
@@ -312,7 +312,7 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
|
||||
(detail.last_download, detail.next_download.unwrap())
|
||||
};
|
||||
|
||||
if now < next_download {
|
||||
if now > next_download {
|
||||
Some(PendingDownload {
|
||||
secondary_state: secondary_tenant,
|
||||
last_download,
|
||||
@@ -647,6 +647,12 @@ impl<'a> TenantDownloader<'a> {
|
||||
progress.bytes_downloaded += layer_byte_count;
|
||||
progress.layers_downloaded += layer_count;
|
||||
}
|
||||
|
||||
for delete_timeline in &delete_timelines {
|
||||
// We haven't removed from disk yet, but optimistically remove from in-memory state: if removal
|
||||
// from disk fails that will be a fatal error.
|
||||
detail.timelines.remove(delete_timeline);
|
||||
}
|
||||
}
|
||||
|
||||
// Execute accumulated deletions
|
||||
@@ -710,13 +716,14 @@ impl<'a> TenantDownloader<'a> {
|
||||
.await
|
||||
.map_err(UpdateError::from)?;
|
||||
|
||||
SECONDARY_MODE.download_heatmap.inc();
|
||||
|
||||
if Some(&download.etag) == prev_etag {
|
||||
Ok(HeatMapDownload::Unmodified)
|
||||
} else {
|
||||
let mut heatmap_bytes = Vec::new();
|
||||
let mut body = tokio_util::io::StreamReader::new(download.download_stream);
|
||||
let _size = tokio::io::copy_buf(&mut body, &mut heatmap_bytes).await?;
|
||||
SECONDARY_MODE.download_heatmap.inc();
|
||||
Ok(HeatMapDownload::Modified(HeatMapModified {
|
||||
etag: download.etag,
|
||||
last_modified: download.last_modified,
|
||||
|
||||
@@ -118,6 +118,7 @@ pub(crate) struct ValuesReconstructState {
|
||||
pub(crate) keys: HashMap<Key, Result<VectoredValueReconstructState, PageReconstructError>>,
|
||||
|
||||
keys_done: KeySpaceRandomAccum,
|
||||
layers_visited: u32,
|
||||
}
|
||||
|
||||
impl ValuesReconstructState {
|
||||
@@ -125,6 +126,7 @@ impl ValuesReconstructState {
|
||||
Self {
|
||||
keys: HashMap::new(),
|
||||
keys_done: KeySpaceRandomAccum::new(),
|
||||
layers_visited: 0,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -138,6 +140,14 @@ impl ValuesReconstructState {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn on_layer_visited(&mut self) {
|
||||
self.layers_visited += 1;
|
||||
}
|
||||
|
||||
pub(crate) fn get_layers_visited(&self) -> u32 {
|
||||
self.layers_visited
|
||||
}
|
||||
|
||||
/// Update the state collected for a given key.
|
||||
/// Returns true if this was the last value needed for the key and false otherwise.
|
||||
///
|
||||
|
||||
@@ -20,8 +20,8 @@
|
||||
//! 000000067F000032BE0000400000000020B6-000000067F000032BE0000400000000030B6__000000578C6B29-0000000057A50051
|
||||
//! ```
|
||||
//!
|
||||
//! Every delta file consists of three parts: "summary", "index", and
|
||||
//! "values". The summary is a fixed size header at the beginning of the file,
|
||||
//! Every delta file consists of three parts: "summary", "values", and
|
||||
//! "index". The summary is a fixed size header at the beginning of the file,
|
||||
//! and it contains basic information about the layer, and offsets to the other
|
||||
//! parts. The "index" is a B-tree, mapping from Key and LSN to an offset in the
|
||||
//! "values" part. The actual page images and WAL records are stored in the
|
||||
@@ -728,6 +728,9 @@ impl DeltaLayerInner {
|
||||
// production code path
|
||||
expected_summary.index_start_blk = actual_summary.index_start_blk;
|
||||
expected_summary.index_root_blk = actual_summary.index_root_blk;
|
||||
// mask out the timeline_id, but still require the layers to be from the same tenant
|
||||
expected_summary.timeline_id = actual_summary.timeline_id;
|
||||
|
||||
if actual_summary != expected_summary {
|
||||
bail!(
|
||||
"in-file summary does not match expected summary. actual = {:?} expected = {:?}",
|
||||
@@ -863,7 +866,7 @@ impl DeltaLayerInner {
|
||||
.into(),
|
||||
);
|
||||
|
||||
let data_end_offset = self.index_start_blk as u64 * PAGE_SZ as u64;
|
||||
let data_end_offset = self.index_start_offset();
|
||||
|
||||
let reads = Self::plan_reads(
|
||||
keyspace,
|
||||
@@ -939,7 +942,7 @@ impl DeltaLayerInner {
|
||||
}
|
||||
|
||||
if !range_end_handled {
|
||||
tracing::info!("Handling range end fallback at {}", data_end_offset);
|
||||
tracing::debug!("Handling range end fallback at {}", data_end_offset);
|
||||
planner.handle_range_end(data_end_offset);
|
||||
}
|
||||
}
|
||||
@@ -1103,11 +1106,195 @@ impl DeltaLayerInner {
|
||||
if let Some(last) = all_keys.last_mut() {
|
||||
// Last key occupies all space till end of value storage,
|
||||
// which corresponds to beginning of the index
|
||||
last.size = self.index_start_blk as u64 * PAGE_SZ as u64 - last.size;
|
||||
last.size = self.index_start_offset() - last.size;
|
||||
}
|
||||
Ok(all_keys)
|
||||
}
|
||||
|
||||
/// Using the given writer, write out a truncated version, where LSNs higher than the
|
||||
/// truncate_at are missing.
|
||||
#[cfg(test)]
|
||||
pub(super) async fn copy_prefix(
|
||||
&self,
|
||||
writer: &mut DeltaLayerWriter,
|
||||
truncate_at: Lsn,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
use crate::tenant::vectored_blob_io::{
|
||||
BlobMeta, VectoredReadBuilder, VectoredReadExtended,
|
||||
};
|
||||
use futures::stream::TryStreamExt;
|
||||
|
||||
#[derive(Debug)]
|
||||
enum Item {
|
||||
Actual(Key, Lsn, BlobRef),
|
||||
Sentinel,
|
||||
}
|
||||
|
||||
impl From<Item> for Option<(Key, Lsn, BlobRef)> {
|
||||
fn from(value: Item) -> Self {
|
||||
match value {
|
||||
Item::Actual(key, lsn, blob) => Some((key, lsn, blob)),
|
||||
Item::Sentinel => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Item {
|
||||
fn offset(&self) -> Option<BlobRef> {
|
||||
match self {
|
||||
Item::Actual(_, _, blob) => Some(*blob),
|
||||
Item::Sentinel => None,
|
||||
}
|
||||
}
|
||||
|
||||
fn is_last(&self) -> bool {
|
||||
matches!(self, Item::Sentinel)
|
||||
}
|
||||
}
|
||||
|
||||
let block_reader = FileBlockReader::new(&self.file, self.file_id);
|
||||
let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
|
||||
self.index_start_blk,
|
||||
self.index_root_blk,
|
||||
block_reader,
|
||||
);
|
||||
|
||||
let stream = self.stream_index_forwards(&tree_reader, &[0u8; DELTA_KEY_SIZE], ctx);
|
||||
let stream = stream.map_ok(|(key, lsn, pos)| Item::Actual(key, lsn, pos));
|
||||
// put in a sentinel value for getting the end offset for last item, and not having to
|
||||
// repeat the whole read part
|
||||
let stream = stream.chain(futures::stream::once(futures::future::ready(Ok(
|
||||
Item::Sentinel,
|
||||
))));
|
||||
let mut stream = std::pin::pin!(stream);
|
||||
|
||||
let mut prev: Option<(Key, Lsn, BlobRef)> = None;
|
||||
|
||||
let mut read_builder: Option<VectoredReadBuilder> = None;
|
||||
|
||||
let max_read_size = self
|
||||
.max_vectored_read_bytes
|
||||
.map(|x| x.0.get())
|
||||
.unwrap_or(8192);
|
||||
|
||||
let mut buffer = Some(BytesMut::with_capacity(max_read_size));
|
||||
|
||||
// FIXME: buffering of DeltaLayerWriter
|
||||
let mut per_blob_copy = Vec::new();
|
||||
|
||||
while let Some(item) = stream.try_next().await? {
|
||||
tracing::debug!(?item, "popped");
|
||||
let offset = item
|
||||
.offset()
|
||||
.unwrap_or(BlobRef::new(self.index_start_offset(), false));
|
||||
|
||||
let actionable = if let Some((key, lsn, start_offset)) = prev.take() {
|
||||
let end_offset = offset;
|
||||
|
||||
Some((BlobMeta { key, lsn }, start_offset..end_offset))
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
let is_last = item.is_last();
|
||||
|
||||
prev = Option::from(item);
|
||||
|
||||
let actionable = actionable.filter(|x| x.0.lsn < truncate_at);
|
||||
|
||||
let builder = if let Some((meta, offsets)) = actionable {
|
||||
// extend or create a new builder
|
||||
if read_builder
|
||||
.as_mut()
|
||||
.map(|x| x.extend(offsets.start.pos(), offsets.end.pos(), meta))
|
||||
.unwrap_or(VectoredReadExtended::No)
|
||||
== VectoredReadExtended::Yes
|
||||
{
|
||||
None
|
||||
} else {
|
||||
read_builder.replace(VectoredReadBuilder::new(
|
||||
offsets.start.pos(),
|
||||
offsets.end.pos(),
|
||||
meta,
|
||||
max_read_size,
|
||||
))
|
||||
}
|
||||
} else {
|
||||
// nothing to do, except perhaps flush any existing for the last element
|
||||
None
|
||||
};
|
||||
|
||||
// flush the possible older builder and also the new one if the item was the last one
|
||||
let builders = builder.into_iter();
|
||||
let builders = if is_last {
|
||||
builders.chain(read_builder.take())
|
||||
} else {
|
||||
builders.chain(None)
|
||||
};
|
||||
|
||||
for builder in builders {
|
||||
let read = builder.build();
|
||||
|
||||
let reader = VectoredBlobReader::new(&self.file);
|
||||
|
||||
let mut buf = buffer.take().unwrap();
|
||||
|
||||
buf.clear();
|
||||
buf.reserve(read.size());
|
||||
let res = reader.read_blobs(&read, buf).await?;
|
||||
|
||||
for blob in res.blobs {
|
||||
let key = blob.meta.key;
|
||||
let lsn = blob.meta.lsn;
|
||||
let data = &res.buf[blob.start..blob.end];
|
||||
|
||||
#[cfg(debug_assertions)]
|
||||
Value::des(data)
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"blob failed to deserialize for {}@{}, {}..{}: {:?}",
|
||||
blob.meta.key,
|
||||
blob.meta.lsn,
|
||||
blob.start,
|
||||
blob.end,
|
||||
utils::Hex(data)
|
||||
)
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
// is it an image or will_init walrecord?
|
||||
// FIXME: this could be handled by threading the BlobRef to the
|
||||
// VectoredReadBuilder
|
||||
let will_init = crate::repository::ValueBytes::will_init(data)
|
||||
.inspect_err(|_e| {
|
||||
#[cfg(feature = "testing")]
|
||||
tracing::error!(data=?utils::Hex(data), err=?_e, "failed to parse will_init out of serialized value");
|
||||
})
|
||||
.unwrap_or(false);
|
||||
|
||||
per_blob_copy.clear();
|
||||
per_blob_copy.extend_from_slice(data);
|
||||
|
||||
let (tmp, res) = writer
|
||||
.put_value_bytes(key, lsn, std::mem::take(&mut per_blob_copy), will_init)
|
||||
.await;
|
||||
per_blob_copy = tmp;
|
||||
res?;
|
||||
}
|
||||
|
||||
buffer = Some(res.buf);
|
||||
}
|
||||
}
|
||||
|
||||
assert!(
|
||||
read_builder.is_none(),
|
||||
"with the sentinel above loop should had handled all"
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(super) async fn dump(&self, ctx: &RequestContext) -> anyhow::Result<()> {
|
||||
println!(
|
||||
"index_start_blk: {}, root {}",
|
||||
@@ -1177,6 +1364,44 @@ impl DeltaLayerInner {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Streams the entries of the layer index through `reader`, beginning at `start`.
///
/// Every raw `(key, value)` pair produced by the b-tree reader is decoded into the
/// `(Key, Lsn, BlobRef)` triple it represents; errors from the reader are passed through
/// untouched.
#[cfg(test)]
fn stream_index_forwards<'a, R>(
    &'a self,
    reader: &'a DiskBtreeReader<R, DELTA_KEY_SIZE>,
    start: &'a [u8; DELTA_KEY_SIZE],
    ctx: &'a RequestContext,
) -> impl futures::stream::Stream<
    Item = Result<(Key, Lsn, BlobRef), crate::tenant::disk_btree::DiskBtreeError>,
> + 'a
where
    R: BlockReader,
{
    use futures::stream::TryStreamExt;

    reader
        .get_stream_from(start, ctx)
        .map_ok(|(raw_key, raw_value)| {
            let delta_key = DeltaKey::from_slice(&raw_key);
            (delta_key.key(), delta_key.lsn(), BlobRef(raw_value))
        })
}
|
||||
|
||||
/// The file offset to the first block of index.
|
||||
///
|
||||
/// The file structure is summary, values, and index. We often need this for the size of last blob.
|
||||
fn index_start_offset(&self) -> u64 {
|
||||
let offset = self.index_start_blk as u64 * PAGE_SZ as u64;
|
||||
let bref = BlobRef(offset);
|
||||
tracing::debug!(
|
||||
index_start_blk = self.index_start_blk,
|
||||
offset,
|
||||
pos = bref.pos(),
|
||||
"index_start_offset"
|
||||
);
|
||||
offset
|
||||
}
|
||||
}
|
||||
|
||||
/// A set of data associated with a delta layer key and its value
|
||||
@@ -1538,7 +1763,7 @@ mod test {
|
||||
|
||||
let resident = writer.finish(entries_meta.key_range.end, &timeline).await?;
|
||||
|
||||
let inner = resident.get_inner_delta(&ctx).await?;
|
||||
let inner = resident.as_delta(&ctx).await?;
|
||||
|
||||
let file_size = inner.file.metadata().await?.len();
|
||||
tracing::info!(
|
||||
@@ -1594,4 +1819,217 @@ mod test {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn copy_delta_prefix_smoke() {
|
||||
use crate::walrecord::NeonWalRecord;
|
||||
use bytes::Bytes;
|
||||
|
||||
let h = crate::tenant::harness::TenantHarness::create("truncate_delta_smoke").unwrap();
|
||||
let (tenant, ctx) = h.load().await;
|
||||
let ctx = &ctx;
|
||||
let timeline = tenant
|
||||
.create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let initdb_layer = timeline
|
||||
.layers
|
||||
.read()
|
||||
.await
|
||||
.likely_resident_layers()
|
||||
.next()
|
||||
.unwrap();
|
||||
|
||||
{
|
||||
let mut writer = timeline.writer().await;
|
||||
|
||||
let data = [
|
||||
(0x20, 12, Value::Image(Bytes::from_static(b"foobar"))),
|
||||
(
|
||||
0x30,
|
||||
12,
|
||||
Value::WalRecord(NeonWalRecord::Postgres {
|
||||
will_init: false,
|
||||
rec: Bytes::from_static(b"1"),
|
||||
}),
|
||||
),
|
||||
(
|
||||
0x40,
|
||||
12,
|
||||
Value::WalRecord(NeonWalRecord::Postgres {
|
||||
will_init: true,
|
||||
rec: Bytes::from_static(b"2"),
|
||||
}),
|
||||
),
|
||||
// build an oversized value so we cannot extend and existing read over
|
||||
// this
|
||||
(
|
||||
0x50,
|
||||
12,
|
||||
Value::WalRecord(NeonWalRecord::Postgres {
|
||||
will_init: true,
|
||||
rec: {
|
||||
let mut buf =
|
||||
vec![0u8; tenant.conf.max_vectored_read_bytes.0.get() + 1024];
|
||||
buf.iter_mut()
|
||||
.enumerate()
|
||||
.for_each(|(i, slot)| *slot = (i % 256) as u8);
|
||||
Bytes::from(buf)
|
||||
},
|
||||
}),
|
||||
),
|
||||
// because the oversized read cannot be extended further, we are sure to exercise the
|
||||
// builder created on the last round with this:
|
||||
(
|
||||
0x60,
|
||||
12,
|
||||
Value::WalRecord(NeonWalRecord::Postgres {
|
||||
will_init: true,
|
||||
rec: Bytes::from_static(b"3"),
|
||||
}),
|
||||
),
|
||||
(
|
||||
0x60,
|
||||
9,
|
||||
Value::Image(Bytes::from_static(b"something for a different key")),
|
||||
),
|
||||
];
|
||||
|
||||
let mut last_lsn = None;
|
||||
|
||||
for (lsn, key, value) in data {
|
||||
let key = Key::from_i128(key);
|
||||
writer.put(key, Lsn(lsn), &value, ctx).await.unwrap();
|
||||
last_lsn = Some(lsn);
|
||||
}
|
||||
|
||||
writer.finish_write(Lsn(last_lsn.unwrap()));
|
||||
}
|
||||
timeline.freeze_and_flush().await.unwrap();
|
||||
|
||||
let new_layer = timeline
|
||||
.layers
|
||||
.read()
|
||||
.await
|
||||
.likely_resident_layers()
|
||||
.find(|x| x != &initdb_layer)
|
||||
.unwrap();
|
||||
|
||||
// create a copy for the timeline, so we don't overwrite the file
|
||||
let branch = tenant
|
||||
.branch_timeline_test(&timeline, TimelineId::generate(), None, ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
assert_eq!(branch.get_ancestor_lsn(), Lsn(0x60));
|
||||
|
||||
// truncating at 0x61 gives us a full copy, otherwise just go backwards until there's just
|
||||
// a single key
|
||||
|
||||
for truncate_at in [0x61, 0x51, 0x41, 0x31, 0x21] {
|
||||
let truncate_at = Lsn(truncate_at);
|
||||
|
||||
let mut writer = DeltaLayerWriter::new(
|
||||
tenant.conf,
|
||||
branch.timeline_id,
|
||||
tenant.tenant_shard_id,
|
||||
Key::MIN,
|
||||
Lsn(0x11)..truncate_at,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let new_layer = new_layer.download_and_keep_resident().await.unwrap();
|
||||
|
||||
new_layer
|
||||
.copy_delta_prefix(&mut writer, truncate_at, ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let copied_layer = writer.finish(Key::MAX, &branch).await.unwrap();
|
||||
|
||||
copied_layer.as_delta(ctx).await.unwrap();
|
||||
|
||||
assert_keys_and_values_eq(
|
||||
new_layer.as_delta(ctx).await.unwrap(),
|
||||
copied_layer.as_delta(ctx).await.unwrap(),
|
||||
truncate_at,
|
||||
ctx,
|
||||
)
|
||||
.await;
|
||||
}
|
||||
}
|
||||
|
||||
async fn assert_keys_and_values_eq(
|
||||
source: &DeltaLayerInner,
|
||||
truncated: &DeltaLayerInner,
|
||||
truncated_at: Lsn,
|
||||
ctx: &RequestContext,
|
||||
) {
|
||||
use futures::future::ready;
|
||||
use futures::stream::TryStreamExt;
|
||||
|
||||
let start_key = [0u8; DELTA_KEY_SIZE];
|
||||
|
||||
let source_reader = FileBlockReader::new(&source.file, source.file_id);
|
||||
let source_tree = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
|
||||
source.index_start_blk,
|
||||
source.index_root_blk,
|
||||
&source_reader,
|
||||
);
|
||||
let source_stream = source.stream_index_forwards(&source_tree, &start_key, ctx);
|
||||
let source_stream = source_stream.filter(|res| match res {
|
||||
Ok((_, lsn, _)) => ready(lsn < &truncated_at),
|
||||
_ => ready(true),
|
||||
});
|
||||
let mut source_stream = std::pin::pin!(source_stream);
|
||||
|
||||
let truncated_reader = FileBlockReader::new(&truncated.file, truncated.file_id);
|
||||
let truncated_tree = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
|
||||
truncated.index_start_blk,
|
||||
truncated.index_root_blk,
|
||||
&truncated_reader,
|
||||
);
|
||||
let truncated_stream = truncated.stream_index_forwards(&truncated_tree, &start_key, ctx);
|
||||
let mut truncated_stream = std::pin::pin!(truncated_stream);
|
||||
|
||||
let mut scratch_left = Vec::new();
|
||||
let mut scratch_right = Vec::new();
|
||||
|
||||
loop {
|
||||
let (src, truncated) = (source_stream.try_next(), truncated_stream.try_next());
|
||||
let (src, truncated) = tokio::try_join!(src, truncated).unwrap();
|
||||
|
||||
if src.is_none() {
|
||||
assert!(truncated.is_none());
|
||||
break;
|
||||
}
|
||||
|
||||
let (src, truncated) = (src.unwrap(), truncated.unwrap());
|
||||
|
||||
// because we've filtered the source with Lsn, we should always have the same keys from both.
|
||||
assert_eq!(src.0, truncated.0);
|
||||
assert_eq!(src.1, truncated.1);
|
||||
|
||||
// if this is needed for something else, just drop this assert.
|
||||
assert!(
|
||||
src.2.pos() >= truncated.2.pos(),
|
||||
"value position should not go backwards {} vs. {}",
|
||||
src.2.pos(),
|
||||
truncated.2.pos()
|
||||
);
|
||||
|
||||
scratch_left.clear();
|
||||
let src_cursor = source_reader.block_cursor();
|
||||
let left = src_cursor.read_blob_into_buf(src.2.pos(), &mut scratch_left, ctx);
|
||||
scratch_right.clear();
|
||||
let trunc_cursor = truncated_reader.block_cursor();
|
||||
let right = trunc_cursor.read_blob_into_buf(truncated.2.pos(), &mut scratch_right, ctx);
|
||||
|
||||
tokio::try_join!(left, right).unwrap();
|
||||
|
||||
assert_eq!(utils::Hex(&scratch_left), utils::Hex(&scratch_right));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -396,6 +396,8 @@ impl ImageLayerInner {
|
||||
// production code path
|
||||
expected_summary.index_start_blk = actual_summary.index_start_blk;
|
||||
expected_summary.index_root_blk = actual_summary.index_root_blk;
|
||||
// mask out the timeline_id, but still require the layers to be from the same tenant
|
||||
expected_summary.timeline_id = actual_summary.timeline_id;
|
||||
|
||||
if actual_summary != expected_summary {
|
||||
bail!(
|
||||
|
||||
@@ -26,7 +26,7 @@ use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn, vec_map::VecMap};
|
||||
// while being able to use std::fmt::Write's methods
|
||||
use crate::metrics::TIMELINE_EPHEMERAL_BYTES;
|
||||
use std::cmp::Ordering;
|
||||
use std::fmt::Write as _;
|
||||
use std::fmt::Write;
|
||||
use std::ops::Range;
|
||||
use std::sync::atomic::Ordering as AtomicOrdering;
|
||||
use std::sync::atomic::{AtomicU64, AtomicUsize};
|
||||
@@ -54,6 +54,12 @@ pub struct InMemoryLayer {
|
||||
/// Writes are only allowed when this is `None`.
|
||||
end_lsn: OnceLock<Lsn>,
|
||||
|
||||
/// Used for traversal path. Cached representation of the in-memory layer before frozen.
|
||||
local_path_str: Arc<str>,
|
||||
|
||||
/// Used for traversal path. Cached representation of the in-memory layer after frozen.
|
||||
frozen_local_path_str: OnceLock<Arc<str>>,
|
||||
|
||||
opened_at: Instant,
|
||||
|
||||
/// The above fields never change, except for `end_lsn`, which is only set once.
|
||||
@@ -241,6 +247,12 @@ impl InMemoryLayer {
|
||||
self.start_lsn..self.end_lsn_or_max()
|
||||
}
|
||||
|
||||
pub(crate) fn local_path_str(&self) -> &Arc<str> {
|
||||
self.frozen_local_path_str
|
||||
.get()
|
||||
.unwrap_or(&self.local_path_str)
|
||||
}
|
||||
|
||||
/// debugging function to print out the contents of the layer
|
||||
///
|
||||
/// this is likely completely unused
|
||||
@@ -430,10 +442,24 @@ impl InMemoryLayer {
|
||||
}
|
||||
}
|
||||
|
||||
/// Writes `inmem-<start>-<end>` into `f`, with both LSNs rendered as 16-digit, zero-padded
/// uppercase hexadecimal numbers.
fn inmem_layer_display(mut f: impl Write, start_lsn: Lsn, end_lsn: Lsn) -> std::fmt::Result {
    let (start, end) = (start_lsn.0, end_lsn.0);
    f.write_fmt(format_args!("inmem-{:016X}-{:016X}", start, end))
}
|
||||
|
||||
fn inmem_layer_log_display(
|
||||
mut f: impl Write,
|
||||
timeline: TimelineId,
|
||||
start_lsn: Lsn,
|
||||
end_lsn: Lsn,
|
||||
) -> std::fmt::Result {
|
||||
write!(f, "timeline {} in-memory ", timeline)?;
|
||||
inmem_layer_display(f, start_lsn, end_lsn)
|
||||
}
|
||||
|
||||
impl std::fmt::Display for InMemoryLayer {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
let end_lsn = self.end_lsn_or_max();
|
||||
write!(f, "inmem-{:016X}-{:016X}", self.start_lsn.0, end_lsn.0)
|
||||
inmem_layer_display(f, self.start_lsn, end_lsn)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -458,6 +484,12 @@ impl InMemoryLayer {
|
||||
|
||||
Ok(InMemoryLayer {
|
||||
file_id: key,
|
||||
local_path_str: {
|
||||
let mut buf = String::new();
|
||||
inmem_layer_log_display(&mut buf, timeline_id, start_lsn, Lsn::MAX).unwrap();
|
||||
buf.into()
|
||||
},
|
||||
frozen_local_path_str: OnceLock::new(),
|
||||
conf,
|
||||
timeline_id,
|
||||
tenant_shard_id,
|
||||
@@ -552,6 +584,15 @@ impl InMemoryLayer {
|
||||
);
|
||||
self.end_lsn.set(end_lsn).expect("end_lsn set only once");
|
||||
|
||||
self.frozen_local_path_str
|
||||
.set({
|
||||
let mut buf = String::new();
|
||||
inmem_layer_log_display(&mut buf, self.get_timeline_id(), self.start_lsn, end_lsn)
|
||||
.unwrap();
|
||||
buf.into()
|
||||
})
|
||||
.expect("frozen_local_path_str set only once");
|
||||
|
||||
for vec_map in inner.index.values() {
|
||||
for (lsn, _pos) in vec_map.as_slice() {
|
||||
assert!(*lsn < end_lsn);
|
||||
|
||||
@@ -116,6 +116,12 @@ impl AsLayerDesc for Layer {
|
||||
}
|
||||
}
|
||||
|
||||
impl PartialEq for Layer {
|
||||
fn eq(&self, other: &Self) -> bool {
|
||||
Arc::as_ptr(&self.0) == Arc::as_ptr(&other.0)
|
||||
}
|
||||
}
|
||||
|
||||
impl Layer {
|
||||
/// Creates a layer value for a file we know to not be resident.
|
||||
pub(crate) fn for_evicted(
|
||||
@@ -389,6 +395,10 @@ impl Layer {
|
||||
&self.0.path
|
||||
}
|
||||
|
||||
pub(crate) fn local_path_str(&self) -> &Arc<str> {
|
||||
&self.0.path_str
|
||||
}
|
||||
|
||||
/// Returns the file metadata reported by the inner layer.
pub(crate) fn metadata(&self) -> LayerFileMetadata {
    let inner = &self.0;
    inner.metadata()
}
|
||||
@@ -511,6 +521,9 @@ struct LayerInner {
|
||||
/// Full path to the file; unclear if this should exist anymore.
|
||||
path: Utf8PathBuf,
|
||||
|
||||
/// String representation of the full path, used for traversal id.
|
||||
path_str: Arc<str>,
|
||||
|
||||
desc: PersistentLayerDesc,
|
||||
|
||||
/// Timeline access is needed for remote timeline client and metrics.
|
||||
@@ -604,9 +617,17 @@ enum Status {
|
||||
|
||||
impl Drop for LayerInner {
|
||||
fn drop(&mut self) {
|
||||
// if there was a pending eviction, mark it cancelled here to balance metrics
|
||||
if let Some((ResidentOrWantedEvicted::WantedEvicted(..), _)) = self.inner.take_and_deinit()
|
||||
{
|
||||
// eviction has already been started
|
||||
LAYER_IMPL_METRICS.inc_eviction_cancelled(EvictionCancelled::LayerGone);
|
||||
|
||||
// eviction request is intentionally not honored as no one is present to wait for it
|
||||
// and we could be delaying shutdown for nothing.
|
||||
}
|
||||
|
||||
if !*self.wanted_deleted.get_mut() {
|
||||
// should we try to evict if the last wish was for eviction? seems more like a hazard
|
||||
// than a clear win.
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -708,6 +729,7 @@ impl LayerInner {
|
||||
|
||||
LayerInner {
|
||||
conf,
|
||||
path_str: path.to_string().into(),
|
||||
path,
|
||||
desc,
|
||||
timeline: Arc::downgrade(timeline),
|
||||
@@ -1552,8 +1574,8 @@ impl Drop for DownloadedLayer {
|
||||
if let Some(owner) = self.owner.upgrade() {
|
||||
owner.on_downloaded_layer_drop(self.version);
|
||||
} else {
|
||||
// no need to do anything, we are shutting down
|
||||
LAYER_IMPL_METRICS.inc_eviction_cancelled(EvictionCancelled::LayerGone);
|
||||
// Layer::drop will handle cancelling the eviction; because of drop order and
|
||||
// `DownloadedLayer` never leaking, we cannot know here if eviction was requested.
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1752,6 +1774,28 @@ impl ResidentLayer {
|
||||
}
|
||||
}
|
||||
|
||||
/// Copies this delta layer into `writer` via `DeltaLayerInner::copy_prefix`, passing
/// `truncate_at` along as the cut-off.
///
/// Fails if the layer turns out to be an image layer.
///
/// FIXME: "truncate" in the parameter name and in the error messages is a bad name, because
/// nothing is truncated; the filtered parts are copied.
#[cfg(test)]
pub(super) async fn copy_delta_prefix(
    &self,
    writer: &mut super::delta_layer::DeltaLayerWriter,
    truncate_at: Lsn,
    ctx: &RequestContext,
) -> anyhow::Result<()> {
    let kind = self.downloaded.get(&self.owner.0, ctx).await?;

    let delta = match kind {
        LayerKind::Delta(delta) => delta,
        LayerKind::Image(_) => anyhow::bail!("cannot truncate image layer {self}"),
    };

    delta
        .copy_prefix(writer, truncate_at, ctx)
        .await
        .with_context(|| format!("truncate {self}"))
}
|
||||
|
||||
/// Borrows the path stored on the owning layer.
pub(crate) fn local_path(&self) -> &Utf8Path {
    let owner = &self.owner.0;
    &owner.path
}
|
||||
@@ -1761,14 +1805,14 @@ impl ResidentLayer {
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub(crate) async fn get_inner_delta<'a>(
|
||||
&'a self,
|
||||
pub(crate) async fn as_delta(
|
||||
&self,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<&'a delta_layer::DeltaLayerInner> {
|
||||
let owner = &self.owner.0;
|
||||
match self.downloaded.get(owner, ctx).await? {
|
||||
LayerKind::Delta(d) => Ok(d),
|
||||
LayerKind::Image(_) => Err(anyhow::anyhow!("Expected a delta layer")),
|
||||
) -> anyhow::Result<&delta_layer::DeltaLayerInner> {
|
||||
use LayerKind::*;
|
||||
match self.downloaded.get(&self.owner.0, ctx).await? {
|
||||
Delta(ref d) => Ok(d),
|
||||
Image(_) => Err(anyhow::anyhow!("image layer")),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -721,11 +721,110 @@ async fn evict_and_wait_does_not_wait_for_download() {
|
||||
layer.evict_and_wait(FOREVER).await.unwrap();
|
||||
}
|
||||
|
||||
/// Asserts that there is no miscalculation when Layer is dropped while it is being kept resident,
|
||||
/// which is the last value.
|
||||
///
|
||||
/// Also checks that the same does not happen on a non-evicted layer (regression test).
|
||||
#[tokio::test(start_paused = true)]
|
||||
async fn eviction_cancellation_on_drop() {
|
||||
use crate::repository::Value;
|
||||
use bytes::Bytes;
|
||||
|
||||
// this is the runtime on which Layer spawns the blocking tasks on
|
||||
let handle = tokio::runtime::Handle::current();
|
||||
|
||||
let h = TenantHarness::create("eviction_cancellation_on_drop").unwrap();
|
||||
utils::logging::replace_panic_hook_with_tracing_panic_hook().forget();
|
||||
let (tenant, ctx) = h.load().await;
|
||||
|
||||
let timeline = tenant
|
||||
.create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
{
|
||||
// create_test_timeline wrote us one layer, write another
|
||||
let mut writer = timeline.writer().await;
|
||||
writer
|
||||
.put(
|
||||
Key::from_i128(5),
|
||||
Lsn(0x20),
|
||||
&Value::Image(Bytes::from_static(b"this does not matter either")),
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
writer.finish_write(Lsn(0x20));
|
||||
}
|
||||
|
||||
timeline.freeze_and_flush().await.unwrap();
|
||||
|
||||
// wait for the upload to complete so our Arc::strong_count assertion holds
|
||||
timeline
|
||||
.remote_client
|
||||
.as_ref()
|
||||
.unwrap()
|
||||
.wait_completion()
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let (evicted_layer, not_evicted) = {
|
||||
let mut layers = {
|
||||
let mut guard = timeline.layers.write().await;
|
||||
let layers = guard.likely_resident_layers().collect::<Vec<_>>();
|
||||
// remove the layers from layermap
|
||||
guard.finish_gc_timeline(&layers);
|
||||
|
||||
layers
|
||||
};
|
||||
|
||||
assert_eq!(layers.len(), 2);
|
||||
|
||||
(layers.pop().unwrap(), layers.pop().unwrap())
|
||||
};
|
||||
|
||||
let victims = [(evicted_layer, true), (not_evicted, false)];
|
||||
|
||||
for (victim, evict) in victims {
|
||||
let resident = victim.keep_resident().await.unwrap();
|
||||
drop(victim);
|
||||
|
||||
assert_eq!(Arc::strong_count(&resident.owner.0), 1);
|
||||
|
||||
if evict {
|
||||
let evict_and_wait = resident.owner.evict_and_wait(FOREVER);
|
||||
|
||||
// drive the future to await on the status channel, and then drop it
|
||||
tokio::time::timeout(ADVANCE, evict_and_wait)
|
||||
.await
|
||||
.expect_err("should had been a timeout since we are holding the layer resident");
|
||||
}
|
||||
|
||||
// 1 == we only evict one of the layers
|
||||
assert_eq!(1, LAYER_IMPL_METRICS.started_evictions.get());
|
||||
|
||||
drop(resident);
|
||||
|
||||
// run any spawned
|
||||
tokio::time::sleep(ADVANCE).await;
|
||||
|
||||
SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(&handle).await;
|
||||
|
||||
assert_eq!(
|
||||
1,
|
||||
LAYER_IMPL_METRICS.cancelled_evictions[EvictionCancelled::LayerGone].get()
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
/// A test case to remind you the cost of these structures. You can bump the size limit
|
||||
/// below if it is really necessary to add more fields to the structures.
|
||||
#[test]
|
||||
fn layer_size() {
|
||||
assert_eq!(std::mem::size_of::<LayerAccessStats>(), 2040);
|
||||
assert_eq!(std::mem::size_of::<PersistentLayerDesc>(), 104);
|
||||
assert_eq!(std::mem::size_of::<LayerInner>(), 2328);
|
||||
assert_eq!(std::mem::size_of::<LayerInner>(), 2344);
|
||||
// it also has the utf8 path
|
||||
}
|
||||
|
||||
|
||||
@@ -16,14 +16,14 @@ use enumset::EnumSet;
|
||||
use fail::fail_point;
|
||||
use once_cell::sync::Lazy;
|
||||
use pageserver_api::{
|
||||
key::AUX_FILES_KEY,
|
||||
key::{AUX_FILES_KEY, NON_INHERITED_RANGE},
|
||||
keyspace::KeySpaceAccum,
|
||||
models::{
|
||||
CompactionAlgorithm, DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest,
|
||||
EvictionPolicy, InMemoryLayerInfo, LayerMapInfo, TimelineState,
|
||||
},
|
||||
reltag::BlockNumber,
|
||||
shard::{ShardIdentity, TenantShardId},
|
||||
shard::{ShardIdentity, ShardNumber, TenantShardId},
|
||||
};
|
||||
use rand::Rng;
|
||||
use serde_with::serde_as;
|
||||
@@ -182,6 +182,16 @@ pub(crate) struct AuxFilesState {
|
||||
pub(crate) n_deltas: usize,
|
||||
}
|
||||
|
||||
/// The relation size cache caches relation sizes at the end of the timeline. It speeds up WAL
|
||||
/// ingestion considerably, because WAL ingestion needs to check on most records if the record
|
||||
/// implicitly extends the relation. At startup, `complete_as_of` is initialized to the current end
|
||||
/// of the timeline (disk_consistent_lsn). It's used on reads of relation sizes to check if the
|
||||
/// value can be used to also update the cache, see [`Timeline::update_cached_rel_size`].
|
||||
pub(crate) struct RelSizeCache {
|
||||
pub(crate) complete_as_of: Lsn,
|
||||
pub(crate) map: HashMap<RelTag, (Lsn, BlockNumber)>,
|
||||
}
|
||||
|
||||
pub struct Timeline {
|
||||
conf: &'static PageServerConf,
|
||||
tenant_conf: Arc<ArcSwap<AttachedTenantConf>>,
|
||||
@@ -324,7 +334,7 @@ pub struct Timeline {
|
||||
pub walreceiver: Mutex<Option<WalReceiver>>,
|
||||
|
||||
/// Relation size cache
|
||||
pub rel_size_cache: RwLock<HashMap<RelTag, (Lsn, BlockNumber)>>,
|
||||
pub(crate) rel_size_cache: RwLock<RelSizeCache>,
|
||||
|
||||
download_all_remote_layers_task_info: RwLock<Option<DownloadRemoteLayersTaskInfo>>,
|
||||
|
||||
@@ -428,6 +438,62 @@ pub(crate) enum PageReconstructError {
|
||||
/// An error happened replaying WAL records
|
||||
#[error(transparent)]
|
||||
WalRedo(anyhow::Error),
|
||||
|
||||
#[error("{0}")]
|
||||
MissingKey(MissingKeyError),
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct MissingKeyError {
|
||||
stuck_at_lsn: bool,
|
||||
key: Key,
|
||||
shard: ShardNumber,
|
||||
cont_lsn: Lsn,
|
||||
request_lsn: Lsn,
|
||||
ancestor_lsn: Option<Lsn>,
|
||||
traversal_path: Vec<TraversalPathItem>,
|
||||
backtrace: Option<std::backtrace::Backtrace>,
|
||||
}
|
||||
|
||||
impl std::fmt::Display for MissingKeyError {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
if self.stuck_at_lsn {
|
||||
// Records are found in this timeline but no image layer or initial delta record was found.
|
||||
write!(
|
||||
f,
|
||||
"could not find layer with more data for key {} (shard {:?}) at LSN {}, request LSN {}",
|
||||
self.key, self.shard, self.cont_lsn, self.request_lsn
|
||||
)?;
|
||||
if let Some(ref ancestor_lsn) = self.ancestor_lsn {
|
||||
write!(f, ", ancestor {}", ancestor_lsn)?;
|
||||
}
|
||||
} else {
|
||||
// No records in this timeline.
|
||||
write!(
|
||||
f,
|
||||
"could not find data for key {} (shard {:?}) at LSN {}, for request at LSN {}",
|
||||
self.key, self.shard, self.cont_lsn, self.request_lsn
|
||||
)?;
|
||||
}
|
||||
|
||||
if !self.traversal_path.is_empty() {
|
||||
writeln!(f)?;
|
||||
}
|
||||
|
||||
for (r, c, l) in &self.traversal_path {
|
||||
writeln!(
|
||||
f,
|
||||
"layer traversal: result {:?}, cont_lsn {}, layer: {}",
|
||||
r, c, l,
|
||||
)?;
|
||||
}
|
||||
|
||||
if let Some(ref backtrace) = self.backtrace {
|
||||
write!(f, "\n{}", backtrace)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl PageReconstructError {
|
||||
@@ -439,6 +505,7 @@ impl PageReconstructError {
|
||||
AncestorLsnTimeout(_) => false,
|
||||
Cancelled | AncestorStopping(_) => true,
|
||||
WalRedo(_) => false,
|
||||
MissingKey { .. } => false,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -753,7 +820,7 @@ impl Timeline {
|
||||
writeln!(
|
||||
msg,
|
||||
"- layer traversal: result {res:?}, cont_lsn {cont_lsn}, layer: {}",
|
||||
layer(),
|
||||
layer,
|
||||
)
|
||||
.expect("string grows")
|
||||
});
|
||||
@@ -872,8 +939,16 @@ impl Timeline {
|
||||
Err(Cancelled | AncestorStopping(_)) => {
|
||||
return Err(GetVectoredError::Cancelled)
|
||||
}
|
||||
Err(Other(err)) if err.to_string().contains("could not find data for key") => {
|
||||
return Err(GetVectoredError::MissingKey(key))
|
||||
// we only capture stuck_at_lsn=false now until we figure out https://github.com/neondatabase/neon/issues/7380
|
||||
Err(MissingKey(MissingKeyError {
|
||||
stuck_at_lsn: false,
|
||||
..
|
||||
})) if !NON_INHERITED_RANGE.contains(&key) => {
|
||||
// The vectored read path handles non inherited keys specially.
|
||||
// If such a key cannot be reconstructed from the current timeline,
|
||||
// the vectored read path returns a key level error as opposed to a top
|
||||
// level error.
|
||||
return Err(GetVectoredError::MissingKey(key));
|
||||
}
|
||||
_ => {
|
||||
values.insert(key, block);
|
||||
@@ -898,6 +973,7 @@ impl Timeline {
|
||||
.await?;
|
||||
|
||||
let mut results: BTreeMap<Key, Result<Bytes, PageReconstructError>> = BTreeMap::new();
|
||||
let layers_visited = reconstruct_state.get_layers_visited();
|
||||
for (key, res) in reconstruct_state.keys {
|
||||
match res {
|
||||
Err(err) => {
|
||||
@@ -912,6 +988,12 @@ impl Timeline {
|
||||
}
|
||||
}
|
||||
|
||||
// Note that this is an approximation. Tracking the exact number of layers visited
|
||||
// per key requires virtually unbounded memory usage and is inefficient
|
||||
// (i.e. segment tree tracking each range queried from a layer)
|
||||
crate::metrics::VEC_READ_NUM_LAYERS_VISITED
|
||||
.observe(layers_visited as f64 / results.len() as f64);
|
||||
|
||||
Ok(results)
|
||||
}
|
||||
|
||||
@@ -1344,7 +1426,7 @@ impl Timeline {
|
||||
background_jobs_can_start: Option<&completion::Barrier>,
|
||||
ctx: &RequestContext,
|
||||
) {
|
||||
if self.tenant_shard_id.is_zero() {
|
||||
if self.tenant_shard_id.is_shard_zero() {
|
||||
// Logical size is only maintained accurately on shard zero.
|
||||
self.spawn_initial_logical_size_computation_task(ctx);
|
||||
}
|
||||
@@ -1892,7 +1974,10 @@ impl Timeline {
|
||||
last_image_layer_creation_check_at: AtomicLsn::new(0),
|
||||
|
||||
last_received_wal: Mutex::new(None),
|
||||
rel_size_cache: RwLock::new(HashMap::new()),
|
||||
rel_size_cache: RwLock::new(RelSizeCache {
|
||||
complete_as_of: disk_consistent_lsn,
|
||||
map: HashMap::new(),
|
||||
}),
|
||||
|
||||
download_all_remote_layers_task_info: RwLock::new(None),
|
||||
|
||||
@@ -2237,7 +2322,7 @@ impl Timeline {
|
||||
priority: GetLogicalSizePriority,
|
||||
ctx: &RequestContext,
|
||||
) -> logical_size::CurrentLogicalSize {
|
||||
if !self.tenant_shard_id.is_zero() {
|
||||
if !self.tenant_shard_id.is_shard_zero() {
|
||||
// Logical size is only accurately maintained on shard zero: when called elsewhere, for example
|
||||
// when HTTP API is serving a GET for timeline zero, return zero
|
||||
return logical_size::CurrentLogicalSize::Approximate(logical_size::Approximate::zero());
|
||||
@@ -2533,7 +2618,7 @@ impl Timeline {
|
||||
crate::span::debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
// We should never be calculating logical sizes on shard !=0, because these shards do not have
|
||||
// accurate relation sizes, and they do not emit consumption metrics.
|
||||
debug_assert!(self.tenant_shard_id.is_zero());
|
||||
debug_assert!(self.tenant_shard_id.is_shard_zero());
|
||||
|
||||
let guard = self
|
||||
.gate
|
||||
@@ -2692,7 +2777,7 @@ impl Timeline {
|
||||
}
|
||||
}
|
||||
|
||||
type TraversalId = String;
|
||||
type TraversalId = Arc<str>;
|
||||
|
||||
trait TraversalLayerExt {
|
||||
fn traversal_id(&self) -> TraversalId;
|
||||
@@ -2700,13 +2785,13 @@ trait TraversalLayerExt {
|
||||
|
||||
impl TraversalLayerExt for Layer {
|
||||
fn traversal_id(&self) -> TraversalId {
|
||||
self.local_path().to_string()
|
||||
Arc::clone(self.local_path_str())
|
||||
}
|
||||
}
|
||||
|
||||
impl TraversalLayerExt for Arc<InMemoryLayer> {
|
||||
fn traversal_id(&self) -> TraversalId {
|
||||
format!("timeline {} in-memory {self}", self.get_timeline_id())
|
||||
Arc::clone(self.local_path_str())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2735,7 +2820,7 @@ impl Timeline {
|
||||
let mut timeline = self;
|
||||
|
||||
let mut read_count = scopeguard::guard(0, |cnt| {
|
||||
crate::metrics::READ_NUM_FS_LAYERS.observe(cnt as f64)
|
||||
crate::metrics::READ_NUM_LAYERS_VISITED.observe(cnt as f64)
|
||||
});
|
||||
|
||||
// For debugging purposes, collect the path of layers that we traversed
|
||||
@@ -2775,32 +2860,35 @@ impl Timeline {
|
||||
if prev <= cont_lsn {
|
||||
// Didn't make any progress in last iteration. Error out to avoid
|
||||
// getting stuck in the loop.
|
||||
return Err(layer_traversal_error(format!(
|
||||
"could not find layer with more data for key {} at LSN {}, request LSN {}, ancestor {}",
|
||||
return Err(PageReconstructError::MissingKey(MissingKeyError {
|
||||
stuck_at_lsn: true,
|
||||
key,
|
||||
Lsn(cont_lsn.0 - 1),
|
||||
shard: self.shard_identity.get_shard_number(&key),
|
||||
cont_lsn: Lsn(cont_lsn.0 - 1),
|
||||
request_lsn,
|
||||
timeline.ancestor_lsn
|
||||
), traversal_path));
|
||||
ancestor_lsn: Some(timeline.ancestor_lsn),
|
||||
traversal_path,
|
||||
backtrace: None,
|
||||
}));
|
||||
}
|
||||
}
|
||||
prev_lsn = Some(cont_lsn);
|
||||
}
|
||||
ValueReconstructResult::Missing => {
|
||||
return Err(layer_traversal_error(
|
||||
if cfg!(test) {
|
||||
format!(
|
||||
"could not find data for key {} (shard {:?}) at LSN {}, for request at LSN {}\n{}",
|
||||
key, self.shard_identity.get_shard_number(&key), cont_lsn, request_lsn, std::backtrace::Backtrace::force_capture(),
|
||||
)
|
||||
} else {
|
||||
format!(
|
||||
"could not find data for key {} (shard {:?}) at LSN {}, for request at LSN {}",
|
||||
key, self.shard_identity.get_shard_number(&key), cont_lsn, request_lsn
|
||||
)
|
||||
},
|
||||
return Err(PageReconstructError::MissingKey(MissingKeyError {
|
||||
stuck_at_lsn: false,
|
||||
key,
|
||||
shard: self.shard_identity.get_shard_number(&key),
|
||||
cont_lsn,
|
||||
request_lsn,
|
||||
ancestor_lsn: None,
|
||||
traversal_path,
|
||||
));
|
||||
backtrace: if cfg!(test) {
|
||||
Some(std::backtrace::Backtrace::force_capture())
|
||||
} else {
|
||||
None
|
||||
},
|
||||
}));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2847,12 +2935,8 @@ impl Timeline {
|
||||
Err(e) => return Err(PageReconstructError::from(e)),
|
||||
};
|
||||
cont_lsn = lsn_floor;
|
||||
// metrics: open_layer does not count as fs access, so we are not updating `read_count`
|
||||
traversal_path.push((
|
||||
result,
|
||||
cont_lsn,
|
||||
Box::new(move || open_layer.traversal_id()),
|
||||
));
|
||||
*read_count += 1;
|
||||
traversal_path.push((result, cont_lsn, open_layer.traversal_id()));
|
||||
continue 'outer;
|
||||
}
|
||||
}
|
||||
@@ -2878,12 +2962,8 @@ impl Timeline {
|
||||
Err(e) => return Err(PageReconstructError::from(e)),
|
||||
};
|
||||
cont_lsn = lsn_floor;
|
||||
// metrics: open_layer does not count as fs access, so we are not updating `read_count`
|
||||
traversal_path.push((
|
||||
result,
|
||||
cont_lsn,
|
||||
Box::new(move || frozen_layer.traversal_id()),
|
||||
));
|
||||
*read_count += 1;
|
||||
traversal_path.push((result, cont_lsn, frozen_layer.traversal_id()));
|
||||
continue 'outer;
|
||||
}
|
||||
}
|
||||
@@ -2904,14 +2984,7 @@ impl Timeline {
|
||||
};
|
||||
cont_lsn = lsn_floor;
|
||||
*read_count += 1;
|
||||
traversal_path.push((
|
||||
result,
|
||||
cont_lsn,
|
||||
Box::new({
|
||||
let layer = layer.to_owned();
|
||||
move || layer.traversal_id()
|
||||
}),
|
||||
));
|
||||
traversal_path.push((result, cont_lsn, layer.traversal_id()));
|
||||
continue 'outer;
|
||||
} else if timeline.ancestor_timeline.is_some() {
|
||||
// Nothing on this timeline. Traverse to parent
|
||||
@@ -2964,11 +3037,47 @@ impl Timeline {
|
||||
.await?;
|
||||
|
||||
keyspace.remove_overlapping_with(&completed);
|
||||
|
||||
// Do not descend into the ancestor timeline for aux files.
|
||||
// We don't return a blanket [`GetVectoredError::MissingKey`] to avoid
|
||||
// stalling compaction.
|
||||
// TODO(chi): this will need to be updated for aux files v2 storage
|
||||
if keyspace.overlaps(&NON_INHERITED_RANGE) {
|
||||
let removed = keyspace.remove_overlapping_with(&KeySpace {
|
||||
ranges: vec![NON_INHERITED_RANGE],
|
||||
});
|
||||
|
||||
for range in removed.ranges {
|
||||
let mut key = range.start;
|
||||
while key < range.end {
|
||||
reconstruct_state.on_key_error(
|
||||
key,
|
||||
PageReconstructError::MissingKey(MissingKeyError {
|
||||
stuck_at_lsn: false,
|
||||
key,
|
||||
shard: self.shard_identity.get_shard_number(&key),
|
||||
cont_lsn,
|
||||
request_lsn,
|
||||
ancestor_lsn: None,
|
||||
traversal_path: Vec::default(),
|
||||
backtrace: if cfg!(test) {
|
||||
Some(std::backtrace::Backtrace::force_capture())
|
||||
} else {
|
||||
None
|
||||
},
|
||||
}),
|
||||
);
|
||||
key = key.next();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if keyspace.total_size() == 0 || timeline.ancestor_timeline.is_none() {
|
||||
break;
|
||||
}
|
||||
|
||||
cont_lsn = Lsn(timeline.ancestor_lsn.0 + 1);
|
||||
// Take the min to avoid reconstructing a page with data newer than request Lsn.
|
||||
cont_lsn = std::cmp::min(Lsn(request_lsn.0 + 1), Lsn(timeline.ancestor_lsn.0 + 1));
|
||||
timeline_owned = timeline
|
||||
.get_ready_ancestor_timeline(ctx)
|
||||
.await
|
||||
@@ -3081,6 +3190,8 @@ impl Timeline {
|
||||
|
||||
unmapped_keyspace = keyspace_to_read;
|
||||
cont_lsn = next_cont_lsn;
|
||||
|
||||
reconstruct_state.on_layer_visited();
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
@@ -3524,7 +3635,7 @@ impl Timeline {
|
||||
&self,
|
||||
disk_consistent_lsn: Lsn,
|
||||
layers_to_upload: impl IntoIterator<Item = ResidentLayer>,
|
||||
) -> anyhow::Result<TimelineMetadata> {
|
||||
) -> anyhow::Result<()> {
|
||||
// We can only save a valid 'prev_record_lsn' value on disk if we
|
||||
// flushed *all* in-memory changes to disk. We only track
|
||||
// 'prev_record_lsn' in memory for the latest processed record, so we
|
||||
@@ -3541,19 +3652,10 @@ impl Timeline {
|
||||
None
|
||||
};
|
||||
|
||||
let ancestor_timeline_id = self
|
||||
.ancestor_timeline
|
||||
.as_ref()
|
||||
.map(|ancestor| ancestor.timeline_id);
|
||||
|
||||
let metadata = TimelineMetadata::new(
|
||||
let update = crate::tenant::metadata::MetadataUpdate::new(
|
||||
disk_consistent_lsn,
|
||||
ondisk_prev_record_lsn,
|
||||
ancestor_timeline_id,
|
||||
self.ancestor_lsn,
|
||||
*self.latest_gc_cutoff_lsn.read(),
|
||||
self.initdb_lsn,
|
||||
self.pg_version,
|
||||
);
|
||||
|
||||
fail_point!("checkpoint-before-saving-metadata", |x| bail!(
|
||||
@@ -3565,10 +3667,10 @@ impl Timeline {
|
||||
for layer in layers_to_upload {
|
||||
remote_client.schedule_layer_file_upload(layer)?;
|
||||
}
|
||||
remote_client.schedule_index_upload_for_metadata_update(&metadata)?;
|
||||
remote_client.schedule_index_upload_for_metadata_update(&update)?;
|
||||
}
|
||||
|
||||
Ok(metadata)
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) async fn preserve_initdb_archive(&self) -> anyhow::Result<()> {
|
||||
@@ -4142,9 +4244,8 @@ impl Timeline {
|
||||
*self.get_latest_gc_cutoff_lsn()
|
||||
}
|
||||
} else {
|
||||
// No time-based retention was configured. Set time-based cutoff to
|
||||
// same as LSN based.
|
||||
cutoff_horizon
|
||||
// No time-based retention was configured. Interpret this as "keep no history".
|
||||
self.get_last_record_lsn()
|
||||
};
|
||||
|
||||
// Grab the lock and update the values
|
||||
@@ -4664,35 +4765,7 @@ impl Timeline {
|
||||
}
|
||||
}
|
||||
|
||||
type TraversalPathItem = (
|
||||
ValueReconstructResult,
|
||||
Lsn,
|
||||
Box<dyn Send + FnOnce() -> TraversalId>,
|
||||
);
|
||||
|
||||
/// Helper function for get_reconstruct_data() to add the path of layers traversed
|
||||
/// to an error, as anyhow context information.
|
||||
fn layer_traversal_error(msg: String, path: Vec<TraversalPathItem>) -> PageReconstructError {
|
||||
// We want the original 'msg' to be the outermost context. The outermost context
|
||||
// is the most high-level information, which also gets propagated to the client.
|
||||
let mut msg_iter = path
|
||||
.into_iter()
|
||||
.map(|(r, c, l)| {
|
||||
format!(
|
||||
"layer traversal: result {:?}, cont_lsn {}, layer: {}",
|
||||
r,
|
||||
c,
|
||||
l(),
|
||||
)
|
||||
})
|
||||
.chain(std::iter::once(msg));
|
||||
// Construct initial message from the first traversed layer
|
||||
let err = anyhow!(msg_iter.next().unwrap());
|
||||
|
||||
// Append all subsequent traversals, and the error message 'msg', as contexts.
|
||||
let msg = msg_iter.fold(err, |err, msg| err.context(msg));
|
||||
PageReconstructError::from(msg)
|
||||
}
|
||||
type TraversalPathItem = (ValueReconstructResult, Lsn, TraversalId);
|
||||
|
||||
struct TimelineWriterState {
|
||||
open_layer: Arc<InMemoryLayer>,
|
||||
|
||||
@@ -378,7 +378,7 @@ impl Timeline {
|
||||
gate: &GateGuard,
|
||||
ctx: &RequestContext,
|
||||
) -> ControlFlow<()> {
|
||||
if !self.tenant_shard_id.is_zero() {
|
||||
if !self.tenant_shard_id.is_shard_zero() {
|
||||
// Shards !=0 do not maintain accurate relation sizes, and do not need to calculate logical size
|
||||
// for consumption metrics (consumption metrics are only sent from shard 0). We may therefore
|
||||
// skip imitating logical size accesses for eviction purposes.
|
||||
|
||||
@@ -427,7 +427,7 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
|
||||
// Send the replication feedback message.
|
||||
// Regular standby_status_update fields are put into this message.
|
||||
let current_timeline_size = if timeline.tenant_shard_id.is_zero() {
|
||||
let current_timeline_size = if timeline.tenant_shard_id.is_shard_zero() {
|
||||
timeline
|
||||
.get_current_logical_size(
|
||||
crate::tenant::timeline::GetLogicalSizePriority::User,
|
||||
|
||||
@@ -61,18 +61,18 @@ pub struct VectoredRead {
|
||||
}
|
||||
|
||||
impl VectoredRead {
|
||||
pub fn size(&self) -> usize {
|
||||
pub(crate) fn size(&self) -> usize {
|
||||
(self.end - self.start) as usize
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Eq, PartialEq)]
|
||||
enum VectoredReadExtended {
|
||||
pub(crate) enum VectoredReadExtended {
|
||||
Yes,
|
||||
No,
|
||||
}
|
||||
|
||||
struct VectoredReadBuilder {
|
||||
pub(crate) struct VectoredReadBuilder {
|
||||
start: u64,
|
||||
end: u64,
|
||||
blobs_at: VecMap<u64, BlobMeta>,
|
||||
@@ -80,7 +80,17 @@ struct VectoredReadBuilder {
|
||||
}
|
||||
|
||||
impl VectoredReadBuilder {
|
||||
fn new(start_offset: u64, end_offset: u64, meta: BlobMeta, max_read_size: usize) -> Self {
|
||||
/// Start building a new vectored read.
|
||||
///
|
||||
/// Note that by design, this does not check against reading more than `max_read_size` to
|
||||
/// support reading larger blobs than the configuration value. The builder will be single use
|
||||
/// however after that.
|
||||
pub(crate) fn new(
|
||||
start_offset: u64,
|
||||
end_offset: u64,
|
||||
meta: BlobMeta,
|
||||
max_read_size: usize,
|
||||
) -> Self {
|
||||
let mut blobs_at = VecMap::default();
|
||||
blobs_at
|
||||
.append(start_offset, meta)
|
||||
@@ -97,7 +107,8 @@ impl VectoredReadBuilder {
|
||||
/// Attempt to extend the current read with a new blob if the start
|
||||
/// offset matches with the current end of the vectored read
|
||||
/// and the resulting size is below the max read size
|
||||
fn extend(&mut self, start: u64, end: u64, meta: BlobMeta) -> VectoredReadExtended {
|
||||
pub(crate) fn extend(&mut self, start: u64, end: u64, meta: BlobMeta) -> VectoredReadExtended {
|
||||
tracing::trace!(start, end, "trying to extend");
|
||||
let size = (end - start) as usize;
|
||||
if self.end == start && self.size() + size <= self.max_read_size {
|
||||
self.end = end;
|
||||
@@ -111,11 +122,11 @@ impl VectoredReadBuilder {
|
||||
VectoredReadExtended::No
|
||||
}
|
||||
|
||||
fn size(&self) -> usize {
|
||||
pub(crate) fn size(&self) -> usize {
|
||||
(self.end - self.start) as usize
|
||||
}
|
||||
|
||||
fn build(self) -> VectoredRead {
|
||||
pub(crate) fn build(self) -> VectoredRead {
|
||||
VectoredRead {
|
||||
start: self.start,
|
||||
end: self.end,
|
||||
|
||||
@@ -403,7 +403,7 @@ impl WalIngest {
|
||||
);
|
||||
|
||||
if !key_is_local {
|
||||
if self.shard.is_zero() {
|
||||
if self.shard.is_shard_zero() {
|
||||
// Shard 0 tracks relation sizes. Although we will not store this block, we will observe
|
||||
// its blkno in case it implicitly extends a relation.
|
||||
self.observe_decoded_block(modification, blk, ctx).await?;
|
||||
|
||||
@@ -55,6 +55,7 @@ impl NeonWalRecord {
|
||||
/// Does replaying this WAL record initialize the page from scratch, or does
|
||||
/// it need to be applied over the previous image of the page?
|
||||
pub fn will_init(&self) -> bool {
|
||||
// If you change this function, you'll also need to change ValueBytes::will_init
|
||||
match self {
|
||||
NeonWalRecord::Postgres { will_init, rec: _ } => *will_init,
|
||||
|
||||
|
||||
@@ -20,6 +20,7 @@
|
||||
|
||||
/// Process lifecycle and abstraction for the IPC protocol.
|
||||
mod process;
|
||||
pub use process::Kind as ProcessKind;
|
||||
|
||||
/// Code to apply [`NeonWalRecord`]s.
|
||||
pub(crate) mod apply_neon;
|
||||
@@ -34,7 +35,7 @@ use crate::walrecord::NeonWalRecord;
|
||||
use anyhow::Context;
|
||||
use bytes::{Bytes, BytesMut};
|
||||
use pageserver_api::key::key_to_rel_block;
|
||||
use pageserver_api::models::WalRedoManagerStatus;
|
||||
use pageserver_api::models::{WalRedoManagerProcessStatus, WalRedoManagerStatus};
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
@@ -54,7 +55,7 @@ pub struct PostgresRedoManager {
|
||||
tenant_shard_id: TenantShardId,
|
||||
conf: &'static PageServerConf,
|
||||
last_redo_at: std::sync::Mutex<Option<Instant>>,
|
||||
/// The current [`process::WalRedoProcess`] that is used by new redo requests.
|
||||
/// The current [`process::Process`] that is used by new redo requests.
|
||||
/// We use [`heavier_once_cell`] for coalescing the spawning, but the redo
|
||||
/// requests don't use the [`heavier_once_cell::Guard`] to keep ahold of
|
||||
/// their process object; we use [`Arc::clone`] for that.
|
||||
@@ -66,7 +67,7 @@ pub struct PostgresRedoManager {
|
||||
/// still be using the old redo process. But, those other tasks will most likely
|
||||
/// encounter an error as well, and errors are an unexpected condition anyway.
|
||||
/// So, probably we could get rid of the `Arc` in the future.
|
||||
redo_process: heavier_once_cell::OnceCell<Arc<process::WalRedoProcess>>,
|
||||
redo_process: heavier_once_cell::OnceCell<Arc<process::Process>>,
|
||||
}
|
||||
|
||||
///
|
||||
@@ -139,8 +140,8 @@ impl PostgresRedoManager {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn status(&self) -> Option<WalRedoManagerStatus> {
|
||||
Some(WalRedoManagerStatus {
|
||||
pub fn status(&self) -> WalRedoManagerStatus {
|
||||
WalRedoManagerStatus {
|
||||
last_redo_at: {
|
||||
let at = *self.last_redo_at.lock().unwrap();
|
||||
at.and_then(|at| {
|
||||
@@ -149,8 +150,14 @@ impl PostgresRedoManager {
|
||||
chrono::Utc::now().checked_sub_signed(chrono::Duration::from_std(age).ok()?)
|
||||
})
|
||||
},
|
||||
pid: self.redo_process.get().map(|p| p.id()),
|
||||
})
|
||||
process: self
|
||||
.redo_process
|
||||
.get()
|
||||
.map(|p| WalRedoManagerProcessStatus {
|
||||
pid: p.id(),
|
||||
kind: std::borrow::Cow::Borrowed(p.kind().into()),
|
||||
}),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -208,37 +215,33 @@ impl PostgresRedoManager {
|
||||
const MAX_RETRY_ATTEMPTS: u32 = 1;
|
||||
let mut n_attempts = 0u32;
|
||||
loop {
|
||||
let proc: Arc<process::WalRedoProcess> =
|
||||
match self.redo_process.get_or_init_detached().await {
|
||||
Ok(guard) => Arc::clone(&guard),
|
||||
Err(permit) => {
|
||||
// don't hold poison_guard, the launch code can bail
|
||||
let start = Instant::now();
|
||||
let proc = Arc::new(
|
||||
process::WalRedoProcess::launch(
|
||||
self.conf,
|
||||
self.tenant_shard_id,
|
||||
pg_version,
|
||||
)
|
||||
let proc: Arc<process::Process> = match self.redo_process.get_or_init_detached().await {
|
||||
Ok(guard) => Arc::clone(&guard),
|
||||
Err(permit) => {
|
||||
// don't hold poison_guard, the launch code can bail
|
||||
let start = Instant::now();
|
||||
let proc = Arc::new(
|
||||
process::Process::launch(self.conf, self.tenant_shard_id, pg_version)
|
||||
.context("launch walredo process")?,
|
||||
);
|
||||
let duration = start.elapsed();
|
||||
WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.observe(duration.as_secs_f64());
|
||||
info!(
|
||||
duration_ms = duration.as_millis(),
|
||||
pid = proc.id(),
|
||||
"launched walredo process"
|
||||
);
|
||||
self.redo_process.set(Arc::clone(&proc), permit);
|
||||
proc
|
||||
}
|
||||
};
|
||||
);
|
||||
let duration = start.elapsed();
|
||||
WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.observe(duration.as_secs_f64());
|
||||
info!(
|
||||
duration_ms = duration.as_millis(),
|
||||
pid = proc.id(),
|
||||
"launched walredo process"
|
||||
);
|
||||
self.redo_process.set(Arc::clone(&proc), permit);
|
||||
proc
|
||||
}
|
||||
};
|
||||
|
||||
let started_at = std::time::Instant::now();
|
||||
|
||||
// Relational WAL records are applied using wal-redo-postgres
|
||||
let result = proc
|
||||
.apply_wal_records(rel, blknum, &base_img, records, wal_redo_timeout)
|
||||
.await
|
||||
.context("apply_wal_records");
|
||||
|
||||
let duration = started_at.elapsed();
|
||||
|
||||
@@ -1,186 +1,67 @@
|
||||
use self::no_leak_child::NoLeakChild;
|
||||
use crate::{
|
||||
config::PageServerConf,
|
||||
metrics::{WalRedoKillCause, WAL_REDO_PROCESS_COUNTERS, WAL_REDO_RECORD_COUNTER},
|
||||
walrecord::NeonWalRecord,
|
||||
};
|
||||
use anyhow::Context;
|
||||
use std::time::Duration;
|
||||
|
||||
use bytes::Bytes;
|
||||
use nix::poll::{PollFd, PollFlags};
|
||||
use pageserver_api::{reltag::RelTag, shard::TenantShardId};
|
||||
use postgres_ffi::BLCKSZ;
|
||||
use std::os::fd::AsRawFd;
|
||||
#[cfg(feature = "testing")]
|
||||
use std::sync::atomic::AtomicUsize;
|
||||
use std::{
|
||||
collections::VecDeque,
|
||||
io::{Read, Write},
|
||||
process::{ChildStdin, ChildStdout, Command, Stdio},
|
||||
sync::{Mutex, MutexGuard},
|
||||
time::Duration,
|
||||
};
|
||||
use tracing::{debug, error, instrument, Instrument};
|
||||
use utils::{lsn::Lsn, nonblock::set_nonblock};
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use crate::{config::PageServerConf, walrecord::NeonWalRecord};
|
||||
|
||||
mod no_leak_child;
|
||||
/// The IPC protocol that pageserver and walredo process speak over their shared pipe.
|
||||
mod protocol;
|
||||
|
||||
pub struct WalRedoProcess {
|
||||
#[allow(dead_code)]
|
||||
conf: &'static PageServerConf,
|
||||
tenant_shard_id: TenantShardId,
|
||||
// Some() on construction, only becomes None on Drop.
|
||||
child: Option<NoLeakChild>,
|
||||
stdout: Mutex<ProcessOutput>,
|
||||
stdin: Mutex<ProcessInput>,
|
||||
/// Counter to separate same sized walredo inputs failing at the same millisecond.
|
||||
#[cfg(feature = "testing")]
|
||||
dump_sequence: AtomicUsize,
|
||||
mod process_impl {
|
||||
pub(super) mod process_async;
|
||||
pub(super) mod process_std;
|
||||
}
|
||||
|
||||
struct ProcessInput {
|
||||
stdin: ChildStdin,
|
||||
n_requests: usize,
|
||||
#[derive(
|
||||
Clone,
|
||||
Copy,
|
||||
Debug,
|
||||
PartialEq,
|
||||
Eq,
|
||||
strum_macros::EnumString,
|
||||
strum_macros::Display,
|
||||
strum_macros::IntoStaticStr,
|
||||
serde_with::DeserializeFromStr,
|
||||
serde_with::SerializeDisplay,
|
||||
)]
|
||||
#[strum(serialize_all = "kebab-case")]
|
||||
#[repr(u8)]
|
||||
pub enum Kind {
|
||||
Sync,
|
||||
Async,
|
||||
}
|
||||
|
||||
struct ProcessOutput {
|
||||
stdout: ChildStdout,
|
||||
pending_responses: VecDeque<Option<Bytes>>,
|
||||
n_processed_responses: usize,
|
||||
pub(crate) enum Process {
|
||||
Sync(process_impl::process_std::WalRedoProcess),
|
||||
Async(process_impl::process_async::WalRedoProcess),
|
||||
}
|
||||
|
||||
impl WalRedoProcess {
|
||||
//
|
||||
// Start postgres binary in special WAL redo mode.
|
||||
//
|
||||
#[instrument(skip_all,fields(pg_version=pg_version))]
|
||||
pub(crate) fn launch(
|
||||
impl Process {
|
||||
#[inline(always)]
|
||||
pub fn launch(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_shard_id: TenantShardId,
|
||||
pg_version: u32,
|
||||
) -> anyhow::Result<Self> {
|
||||
crate::span::debug_assert_current_span_has_tenant_id();
|
||||
|
||||
let pg_bin_dir_path = conf.pg_bin_dir(pg_version).context("pg_bin_dir")?; // TODO these should be infallible.
|
||||
let pg_lib_dir_path = conf.pg_lib_dir(pg_version).context("pg_lib_dir")?;
|
||||
|
||||
use no_leak_child::NoLeakChildCommandExt;
|
||||
// Start postgres itself
|
||||
let child = Command::new(pg_bin_dir_path.join("postgres"))
|
||||
// the first arg must be --wal-redo so the child process enters into walredo mode
|
||||
.arg("--wal-redo")
|
||||
// the child doesn't process this arg, but, having it in the argv helps identify the
|
||||
// walredo process for a particular tenant when debugging a pageserver
|
||||
.args(["--tenant-shard-id", &format!("{tenant_shard_id}")])
|
||||
.stdin(Stdio::piped())
|
||||
.stderr(Stdio::piped())
|
||||
.stdout(Stdio::piped())
|
||||
.env_clear()
|
||||
.env("LD_LIBRARY_PATH", &pg_lib_dir_path)
|
||||
.env("DYLD_LIBRARY_PATH", &pg_lib_dir_path)
|
||||
// NB: The redo process is not trusted after we sent it the first
|
||||
// walredo work. Before that, it is trusted. Specifically, we trust
|
||||
// it to
|
||||
// 1. close all file descriptors except stdin, stdout, stderr because
|
||||
// pageserver might not be 100% diligent in setting FD_CLOEXEC on all
|
||||
// the files it opens, and
|
||||
// 2. to use seccomp to sandbox itself before processing the first
|
||||
// walredo request.
|
||||
.spawn_no_leak_child(tenant_shard_id)
|
||||
.context("spawn process")?;
|
||||
WAL_REDO_PROCESS_COUNTERS.started.inc();
|
||||
let mut child = scopeguard::guard(child, |child| {
|
||||
error!("killing wal-redo-postgres process due to a problem during launch");
|
||||
child.kill_and_wait(WalRedoKillCause::Startup);
|
||||
});
|
||||
|
||||
let stdin = child.stdin.take().unwrap();
|
||||
let stdout = child.stdout.take().unwrap();
|
||||
let stderr = child.stderr.take().unwrap();
|
||||
let stderr = tokio::process::ChildStderr::from_std(stderr)
|
||||
.context("convert to tokio::ChildStderr")?;
|
||||
macro_rules! set_nonblock_or_log_err {
|
||||
($file:ident) => {{
|
||||
let res = set_nonblock($file.as_raw_fd());
|
||||
if let Err(e) = &res {
|
||||
error!(error = %e, file = stringify!($file), pid = child.id(), "set_nonblock failed");
|
||||
}
|
||||
res
|
||||
}};
|
||||
}
|
||||
set_nonblock_or_log_err!(stdin)?;
|
||||
set_nonblock_or_log_err!(stdout)?;
|
||||
|
||||
// all fallible operations post-spawn are complete, so get rid of the guard
|
||||
let child = scopeguard::ScopeGuard::into_inner(child);
|
||||
|
||||
tokio::spawn(
|
||||
async move {
|
||||
scopeguard::defer! {
|
||||
debug!("wal-redo-postgres stderr_logger_task finished");
|
||||
crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_finished.inc();
|
||||
}
|
||||
debug!("wal-redo-postgres stderr_logger_task started");
|
||||
crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_started.inc();
|
||||
|
||||
use tokio::io::AsyncBufReadExt;
|
||||
let mut stderr_lines = tokio::io::BufReader::new(stderr);
|
||||
let mut buf = Vec::new();
|
||||
let res = loop {
|
||||
buf.clear();
|
||||
// TODO we don't trust the process to cap its stderr length.
|
||||
// Currently it can do unbounded Vec allocation.
|
||||
match stderr_lines.read_until(b'\n', &mut buf).await {
|
||||
Ok(0) => break Ok(()), // eof
|
||||
Ok(num_bytes) => {
|
||||
let output = String::from_utf8_lossy(&buf[..num_bytes]);
|
||||
error!(%output, "received output");
|
||||
}
|
||||
Err(e) => {
|
||||
break Err(e);
|
||||
}
|
||||
}
|
||||
};
|
||||
match res {
|
||||
Ok(()) => (),
|
||||
Err(e) => {
|
||||
error!(error=?e, "failed to read from walredo stderr");
|
||||
}
|
||||
}
|
||||
}.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %pg_version))
|
||||
);
|
||||
|
||||
Ok(Self {
|
||||
conf,
|
||||
tenant_shard_id,
|
||||
child: Some(child),
|
||||
stdin: Mutex::new(ProcessInput {
|
||||
stdin,
|
||||
n_requests: 0,
|
||||
}),
|
||||
stdout: Mutex::new(ProcessOutput {
|
||||
stdout,
|
||||
pending_responses: VecDeque::new(),
|
||||
n_processed_responses: 0,
|
||||
}),
|
||||
#[cfg(feature = "testing")]
|
||||
dump_sequence: AtomicUsize::default(),
|
||||
Ok(match conf.walredo_process_kind {
|
||||
Kind::Sync => Self::Sync(process_impl::process_std::WalRedoProcess::launch(
|
||||
conf,
|
||||
tenant_shard_id,
|
||||
pg_version,
|
||||
)?),
|
||||
Kind::Async => Self::Async(process_impl::process_async::WalRedoProcess::launch(
|
||||
conf,
|
||||
tenant_shard_id,
|
||||
pg_version,
|
||||
)?),
|
||||
})
|
||||
}
|
||||
|
||||
pub(crate) fn id(&self) -> u32 {
|
||||
self.child
|
||||
.as_ref()
|
||||
.expect("must not call this during Drop")
|
||||
.id()
|
||||
}
|
||||
|
||||
// Apply given WAL records ('records') over an old page image. Returns
|
||||
// new page image.
|
||||
//
|
||||
#[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), pid=%self.id()))]
|
||||
pub(crate) fn apply_wal_records(
|
||||
#[inline(always)]
|
||||
pub(crate) async fn apply_wal_records(
|
||||
&self,
|
||||
rel: RelTag,
|
||||
blknum: u32,
|
||||
@@ -188,221 +69,29 @@ impl WalRedoProcess {
|
||||
records: &[(Lsn, NeonWalRecord)],
|
||||
wal_redo_timeout: Duration,
|
||||
) -> anyhow::Result<Bytes> {
|
||||
let tag = protocol::BufferTag { rel, blknum };
|
||||
let input = self.stdin.lock().unwrap();
|
||||
|
||||
// Serialize all the messages to send the WAL redo process first.
|
||||
//
|
||||
// This could be problematic if there are millions of records to replay,
|
||||
// but in practice the number of records is usually so small that it doesn't
|
||||
// matter, and it's better to keep this code simple.
|
||||
//
|
||||
// Most requests start with a before-image with BLCKSZ bytes, followed by
|
||||
// by some other WAL records. Start with a buffer that can hold that
|
||||
// comfortably.
|
||||
let mut writebuf: Vec<u8> = Vec::with_capacity((BLCKSZ as usize) * 3);
|
||||
protocol::build_begin_redo_for_block_msg(tag, &mut writebuf);
|
||||
if let Some(img) = base_img {
|
||||
protocol::build_push_page_msg(tag, img, &mut writebuf);
|
||||
}
|
||||
for (lsn, rec) in records.iter() {
|
||||
if let NeonWalRecord::Postgres {
|
||||
will_init: _,
|
||||
rec: postgres_rec,
|
||||
} = rec
|
||||
{
|
||||
protocol::build_apply_record_msg(*lsn, postgres_rec, &mut writebuf);
|
||||
} else {
|
||||
anyhow::bail!("tried to pass neon wal record to postgres WAL redo");
|
||||
match self {
|
||||
Process::Sync(p) => {
|
||||
p.apply_wal_records(rel, blknum, base_img, records, wal_redo_timeout)
|
||||
.await
|
||||
}
|
||||
}
|
||||
protocol::build_get_page_msg(tag, &mut writebuf);
|
||||
WAL_REDO_RECORD_COUNTER.inc_by(records.len() as u64);
|
||||
|
||||
let res = self.apply_wal_records0(&writebuf, input, wal_redo_timeout);
|
||||
|
||||
if res.is_err() {
|
||||
// not all of these can be caused by this particular input, however these are so rare
|
||||
// in tests so capture all.
|
||||
self.record_and_log(&writebuf);
|
||||
}
|
||||
|
||||
res
|
||||
}
|
||||
|
||||
fn apply_wal_records0(
|
||||
&self,
|
||||
writebuf: &[u8],
|
||||
input: MutexGuard<ProcessInput>,
|
||||
wal_redo_timeout: Duration,
|
||||
) -> anyhow::Result<Bytes> {
|
||||
let mut proc = { input }; // TODO: remove this legacy rename, but this keep the patch small.
|
||||
let mut nwrite = 0usize;
|
||||
|
||||
while nwrite < writebuf.len() {
|
||||
let mut stdin_pollfds = [PollFd::new(&proc.stdin, PollFlags::POLLOUT)];
|
||||
let n = loop {
|
||||
match nix::poll::poll(&mut stdin_pollfds[..], wal_redo_timeout.as_millis() as i32) {
|
||||
Err(nix::errno::Errno::EINTR) => continue,
|
||||
res => break res,
|
||||
}
|
||||
}?;
|
||||
|
||||
if n == 0 {
|
||||
anyhow::bail!("WAL redo timed out");
|
||||
Process::Async(p) => {
|
||||
p.apply_wal_records(rel, blknum, base_img, records, wal_redo_timeout)
|
||||
.await
|
||||
}
|
||||
|
||||
// If 'stdin' is writeable, do write.
|
||||
let in_revents = stdin_pollfds[0].revents().unwrap();
|
||||
if in_revents & (PollFlags::POLLERR | PollFlags::POLLOUT) != PollFlags::empty() {
|
||||
nwrite += proc.stdin.write(&writebuf[nwrite..])?;
|
||||
}
|
||||
if in_revents.contains(PollFlags::POLLHUP) {
|
||||
// We still have more data to write, but the process closed the pipe.
|
||||
anyhow::bail!("WAL redo process closed its stdin unexpectedly");
|
||||
}
|
||||
}
|
||||
let request_no = proc.n_requests;
|
||||
proc.n_requests += 1;
|
||||
drop(proc);
|
||||
|
||||
// To improve walredo performance we separate sending requests and receiving
|
||||
// responses. They are protected by different mutexes (output and input).
|
||||
// If thread T1, T2, T3 send requests D1, D2, D3 to walredo process
|
||||
// then there is no guarantee that T1 will be granted the output mutex lock first.
|
||||
// To address this issue we maintain number of sent requests, number of processed
|
||||
// responses and ring buffer with pending responses. After sending response
|
||||
// (under input mutex), threads remembers request number. Then it releases
|
||||
// input mutex, locks output mutex and fetch in ring buffer all responses until
|
||||
// its stored request number. Then it takes the corresponding element from
|
||||
// pending responses ring buffer and truncate all empty elements from the front,
|
||||
// advancing processed responses number.
|
||||
|
||||
let mut output = self.stdout.lock().unwrap();
|
||||
let n_processed_responses = output.n_processed_responses;
|
||||
while n_processed_responses + output.pending_responses.len() <= request_no {
|
||||
// We expect the WAL redo process to respond with an 8k page image. We read it
|
||||
// into this buffer.
|
||||
let mut resultbuf = vec![0; BLCKSZ.into()];
|
||||
let mut nresult: usize = 0; // # of bytes read into 'resultbuf' so far
|
||||
while nresult < BLCKSZ.into() {
|
||||
let mut stdout_pollfds = [PollFd::new(&output.stdout, PollFlags::POLLIN)];
|
||||
// We do two things simultaneously: reading response from stdout
|
||||
// and forward any logging information that the child writes to its stderr to the page server's log.
|
||||
let n = loop {
|
||||
match nix::poll::poll(
|
||||
&mut stdout_pollfds[..],
|
||||
wal_redo_timeout.as_millis() as i32,
|
||||
) {
|
||||
Err(nix::errno::Errno::EINTR) => continue,
|
||||
res => break res,
|
||||
}
|
||||
}?;
|
||||
|
||||
if n == 0 {
|
||||
anyhow::bail!("WAL redo timed out");
|
||||
}
|
||||
|
||||
// If we have some data in stdout, read it to the result buffer.
|
||||
let out_revents = stdout_pollfds[0].revents().unwrap();
|
||||
if out_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() {
|
||||
nresult += output.stdout.read(&mut resultbuf[nresult..])?;
|
||||
}
|
||||
if out_revents.contains(PollFlags::POLLHUP) {
|
||||
anyhow::bail!("WAL redo process closed its stdout unexpectedly");
|
||||
}
|
||||
}
|
||||
output
|
||||
.pending_responses
|
||||
.push_back(Some(Bytes::from(resultbuf)));
|
||||
}
|
||||
// Replace our request's response with None in `pending_responses`.
|
||||
// Then make space in the ring buffer by clearing out any sequence of contiguous
|
||||
// `None`'s from the front of `pending_responses`.
|
||||
// NB: We can't pop_front() other requests' responses because another
|
||||
// requester might have grabbed the output mutex before us:
|
||||
// T1: grab input mutex
|
||||
// T1: send request_no 23
|
||||
// T1: release input mutex
|
||||
// T2: grab input mutex
|
||||
// T2: send request_no 24
|
||||
// T2: release input mutex
|
||||
// T2: grab output mutex
|
||||
// T2: n_processed_responses + output.pending_responses.len() <= request_no
|
||||
// 23 0 24
|
||||
// T2: enters poll loop that reads stdout
|
||||
// T2: put response for 23 into pending_responses
|
||||
// T2: put response for 24 into pending_resposnes
|
||||
// pending_responses now looks like this: Front Some(response_23) Some(response_24) Back
|
||||
// T2: takes its response_24
|
||||
// pending_responses now looks like this: Front Some(response_23) None Back
|
||||
// T2: does the while loop below
|
||||
// pending_responses now looks like this: Front Some(response_23) None Back
|
||||
// T2: releases output mutex
|
||||
// T1: grabs output mutex
|
||||
// T1: n_processed_responses + output.pending_responses.len() > request_no
|
||||
// 23 2 23
|
||||
// T1: skips poll loop that reads stdout
|
||||
// T1: takes its response_23
|
||||
// pending_responses now looks like this: Front None None Back
|
||||
// T2: does the while loop below
|
||||
// pending_responses now looks like this: Front Back
|
||||
// n_processed_responses now has value 25
|
||||
let res = output.pending_responses[request_no - n_processed_responses]
|
||||
.take()
|
||||
.expect("we own this request_no, nobody else is supposed to take it");
|
||||
while let Some(front) = output.pending_responses.front() {
|
||||
if front.is_none() {
|
||||
output.pending_responses.pop_front();
|
||||
output.n_processed_responses += 1;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
Ok(res)
|
||||
}
|
||||
|
||||
/// Saves the bytes of a failed walredo request to a file below
/// `conf.tenant_path(..)` and reports the outcome at error level.
///
/// The file name is built from the current unix time in milliseconds, the
/// input length and this instance's `dump_sequence` counter.
#[cfg(feature = "testing")]
fn record_and_log(&self, writebuf: &[u8]) {
    use std::sync::atomic::Ordering;
    use std::time::{SystemTime, UNIX_EPOCH};

    let millis = SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .unwrap()
        .as_millis();
    let seq = self.dump_sequence.fetch_add(1, Ordering::Relaxed);

    // these files will be collected to an allure report
    let filename = format!("walredo-{millis}-{}-{seq}.walredo", writebuf.len());
    let path = self.conf.tenant_path(&self.tenant_shard_id).join(&filename);

    // `create_new` makes the open fail rather than overwrite an existing dump.
    let mut open_options = std::fs::OpenOptions::new();
    open_options.write(true).create_new(true).read(true);
    let res = open_options
        .open(path)
        .and_then(|mut file| file.write_all(writebuf));

    // Both outcomes are logged at error level to trip up allowed_errors.
    match res {
        Ok(()) => {
            tracing::error!(filename, "erroring walredo input saved");
        }
        Err(e) => {
            tracing::error!(target=%filename, length=writebuf.len(), "failed to write out the walredo errored input: {e}");
        }
    }
}
|
||||
|
||||
#[cfg(not(feature = "testing"))]
|
||||
fn record_and_log(&self, _: &[u8]) {}
|
||||
}
|
||||
pub(crate) fn id(&self) -> u32 {
|
||||
match self {
|
||||
Process::Sync(p) => p.id(),
|
||||
Process::Async(p) => p.id(),
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for WalRedoProcess {
|
||||
fn drop(&mut self) {
|
||||
self.child
|
||||
.take()
|
||||
.expect("we only do this once")
|
||||
.kill_and_wait(WalRedoKillCause::WalRedoProcessDrop);
|
||||
// no way to wait for stderr_logger_task from Drop because that is async only
|
||||
pub(crate) fn kind(&self) -> Kind {
|
||||
match self {
|
||||
Process::Sync(_) => Kind::Sync,
|
||||
Process::Async(_) => Kind::Async,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
374
pageserver/src/walredo/process/process_impl/process_async.rs
Normal file
374
pageserver/src/walredo/process/process_impl/process_async.rs
Normal file
@@ -0,0 +1,374 @@
|
||||
use self::no_leak_child::NoLeakChild;
|
||||
use crate::{
|
||||
config::PageServerConf,
|
||||
metrics::{WalRedoKillCause, WAL_REDO_PROCESS_COUNTERS, WAL_REDO_RECORD_COUNTER},
|
||||
walrecord::NeonWalRecord,
|
||||
walredo::process::{no_leak_child, protocol},
|
||||
};
|
||||
use anyhow::Context;
|
||||
use bytes::Bytes;
|
||||
use pageserver_api::{reltag::RelTag, shard::TenantShardId};
|
||||
use postgres_ffi::BLCKSZ;
|
||||
#[cfg(feature = "testing")]
|
||||
use std::sync::atomic::AtomicUsize;
|
||||
use std::{
|
||||
collections::VecDeque,
|
||||
process::{Command, Stdio},
|
||||
time::Duration,
|
||||
};
|
||||
use tokio::io::{AsyncReadExt, AsyncWriteExt};
|
||||
use tracing::{debug, error, instrument, Instrument};
|
||||
use utils::{lsn::Lsn, poison::Poison};
|
||||
|
||||
pub struct WalRedoProcess {
|
||||
#[allow(dead_code)]
|
||||
conf: &'static PageServerConf,
|
||||
tenant_shard_id: TenantShardId,
|
||||
// Some() on construction, only becomes None on Drop.
|
||||
child: Option<NoLeakChild>,
|
||||
stdout: tokio::sync::Mutex<Poison<ProcessOutput>>,
|
||||
stdin: tokio::sync::Mutex<Poison<ProcessInput>>,
|
||||
/// Counter to separate same sized walredo inputs failing at the same millisecond.
|
||||
#[cfg(feature = "testing")]
|
||||
dump_sequence: AtomicUsize,
|
||||
}
|
||||
|
||||
struct ProcessInput {
|
||||
stdin: tokio::process::ChildStdin,
|
||||
n_requests: usize,
|
||||
}
|
||||
|
||||
struct ProcessOutput {
|
||||
stdout: tokio::process::ChildStdout,
|
||||
pending_responses: VecDeque<Option<Bytes>>,
|
||||
n_processed_responses: usize,
|
||||
}
|
||||
|
||||
impl WalRedoProcess {
|
||||
//
|
||||
// Start postgres binary in special WAL redo mode.
|
||||
//
|
||||
#[instrument(skip_all,fields(pg_version=pg_version))]
|
||||
pub(crate) fn launch(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_shard_id: TenantShardId,
|
||||
pg_version: u32,
|
||||
) -> anyhow::Result<Self> {
|
||||
crate::span::debug_assert_current_span_has_tenant_id();
|
||||
|
||||
let pg_bin_dir_path = conf.pg_bin_dir(pg_version).context("pg_bin_dir")?; // TODO these should be infallible.
|
||||
let pg_lib_dir_path = conf.pg_lib_dir(pg_version).context("pg_lib_dir")?;
|
||||
|
||||
use no_leak_child::NoLeakChildCommandExt;
|
||||
// Start postgres itself
|
||||
let child = Command::new(pg_bin_dir_path.join("postgres"))
|
||||
// the first arg must be --wal-redo so the child process enters into walredo mode
|
||||
.arg("--wal-redo")
|
||||
// the child doesn't process this arg, but, having it in the argv helps indentify the
|
||||
// walredo process for a particular tenant when debugging a pagserver
|
||||
.args(["--tenant-shard-id", &format!("{tenant_shard_id}")])
|
||||
.stdin(Stdio::piped())
|
||||
.stderr(Stdio::piped())
|
||||
.stdout(Stdio::piped())
|
||||
.env_clear()
|
||||
.env("LD_LIBRARY_PATH", &pg_lib_dir_path)
|
||||
.env("DYLD_LIBRARY_PATH", &pg_lib_dir_path)
|
||||
// NB: The redo process is not trusted after we sent it the first
|
||||
// walredo work. Before that, it is trusted. Specifically, we trust
|
||||
// it to
|
||||
// 1. close all file descriptors except stdin, stdout, stderr because
|
||||
// pageserver might not be 100% diligent in setting FD_CLOEXEC on all
|
||||
// the files it opens, and
|
||||
// 2. to use seccomp to sandbox itself before processing the first
|
||||
// walredo request.
|
||||
.spawn_no_leak_child(tenant_shard_id)
|
||||
.context("spawn process")?;
|
||||
WAL_REDO_PROCESS_COUNTERS.started.inc();
|
||||
let mut child = scopeguard::guard(child, |child| {
|
||||
error!("killing wal-redo-postgres process due to a problem during launch");
|
||||
child.kill_and_wait(WalRedoKillCause::Startup);
|
||||
});
|
||||
|
||||
let stdin = child.stdin.take().unwrap();
|
||||
let stdout = child.stdout.take().unwrap();
|
||||
let stderr = child.stderr.take().unwrap();
|
||||
let stderr = tokio::process::ChildStderr::from_std(stderr)
|
||||
.context("convert to tokio::ChildStderr")?;
|
||||
let stdin =
|
||||
tokio::process::ChildStdin::from_std(stdin).context("convert to tokio::ChildStdin")?;
|
||||
let stdout = tokio::process::ChildStdout::from_std(stdout)
|
||||
.context("convert to tokio::ChildStdout")?;
|
||||
|
||||
// all fallible operations post-spawn are complete, so get rid of the guard
|
||||
let child = scopeguard::ScopeGuard::into_inner(child);
|
||||
|
||||
tokio::spawn(
|
||||
async move {
|
||||
scopeguard::defer! {
|
||||
debug!("wal-redo-postgres stderr_logger_task finished");
|
||||
crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_finished.inc();
|
||||
}
|
||||
debug!("wal-redo-postgres stderr_logger_task started");
|
||||
crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_started.inc();
|
||||
|
||||
use tokio::io::AsyncBufReadExt;
|
||||
let mut stderr_lines = tokio::io::BufReader::new(stderr);
|
||||
let mut buf = Vec::new();
|
||||
let res = loop {
|
||||
buf.clear();
|
||||
// TODO we don't trust the process to cap its stderr length.
|
||||
// Currently it can do unbounded Vec allocation.
|
||||
match stderr_lines.read_until(b'\n', &mut buf).await {
|
||||
Ok(0) => break Ok(()), // eof
|
||||
Ok(num_bytes) => {
|
||||
let output = String::from_utf8_lossy(&buf[..num_bytes]);
|
||||
error!(%output, "received output");
|
||||
}
|
||||
Err(e) => {
|
||||
break Err(e);
|
||||
}
|
||||
}
|
||||
};
|
||||
match res {
|
||||
Ok(()) => (),
|
||||
Err(e) => {
|
||||
error!(error=?e, "failed to read from walredo stderr");
|
||||
}
|
||||
}
|
||||
}.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %pg_version))
|
||||
);
|
||||
|
||||
Ok(Self {
|
||||
conf,
|
||||
tenant_shard_id,
|
||||
child: Some(child),
|
||||
stdin: tokio::sync::Mutex::new(Poison::new(
|
||||
"stdin",
|
||||
ProcessInput {
|
||||
stdin,
|
||||
n_requests: 0,
|
||||
},
|
||||
)),
|
||||
stdout: tokio::sync::Mutex::new(Poison::new(
|
||||
"stdout",
|
||||
ProcessOutput {
|
||||
stdout,
|
||||
pending_responses: VecDeque::new(),
|
||||
n_processed_responses: 0,
|
||||
},
|
||||
)),
|
||||
#[cfg(feature = "testing")]
|
||||
dump_sequence: AtomicUsize::default(),
|
||||
})
|
||||
}
|
||||
|
||||
pub(crate) fn id(&self) -> u32 {
|
||||
self.child
|
||||
.as_ref()
|
||||
.expect("must not call this during Drop")
|
||||
.id()
|
||||
}
|
||||
|
||||
/// Apply given WAL records ('records') over an old page image. Returns
|
||||
/// new page image.
|
||||
///
|
||||
/// # Cancel-Safety
|
||||
///
|
||||
/// Cancellation safe.
|
||||
#[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), pid=%self.id()))]
|
||||
pub(crate) async fn apply_wal_records(
|
||||
&self,
|
||||
rel: RelTag,
|
||||
blknum: u32,
|
||||
base_img: &Option<Bytes>,
|
||||
records: &[(Lsn, NeonWalRecord)],
|
||||
wal_redo_timeout: Duration,
|
||||
) -> anyhow::Result<Bytes> {
|
||||
let tag = protocol::BufferTag { rel, blknum };
|
||||
|
||||
// Serialize all the messages to send the WAL redo process first.
|
||||
//
|
||||
// This could be problematic if there are millions of records to replay,
|
||||
// but in practice the number of records is usually so small that it doesn't
|
||||
// matter, and it's better to keep this code simple.
|
||||
//
|
||||
// Most requests start with a before-image with BLCKSZ bytes, followed by
|
||||
// by some other WAL records. Start with a buffer that can hold that
|
||||
// comfortably.
|
||||
let mut writebuf: Vec<u8> = Vec::with_capacity((BLCKSZ as usize) * 3);
|
||||
protocol::build_begin_redo_for_block_msg(tag, &mut writebuf);
|
||||
if let Some(img) = base_img {
|
||||
protocol::build_push_page_msg(tag, img, &mut writebuf);
|
||||
}
|
||||
for (lsn, rec) in records.iter() {
|
||||
if let NeonWalRecord::Postgres {
|
||||
will_init: _,
|
||||
rec: postgres_rec,
|
||||
} = rec
|
||||
{
|
||||
protocol::build_apply_record_msg(*lsn, postgres_rec, &mut writebuf);
|
||||
} else {
|
||||
anyhow::bail!("tried to pass neon wal record to postgres WAL redo");
|
||||
}
|
||||
}
|
||||
protocol::build_get_page_msg(tag, &mut writebuf);
|
||||
WAL_REDO_RECORD_COUNTER.inc_by(records.len() as u64);
|
||||
|
||||
let Ok(res) =
|
||||
tokio::time::timeout(wal_redo_timeout, self.apply_wal_records0(&writebuf)).await
|
||||
else {
|
||||
anyhow::bail!("WAL redo timed out");
|
||||
};
|
||||
|
||||
if res.is_err() {
|
||||
// not all of these can be caused by this particular input, however these are so rare
|
||||
// in tests so capture all.
|
||||
self.record_and_log(&writebuf);
|
||||
}
|
||||
|
||||
res
|
||||
}
|
||||
|
||||
/// # Cancel-Safety
|
||||
///
|
||||
/// When not polled to completion (e.g. because in `tokio::select!` another
|
||||
/// branch becomes ready before this future), concurrent and subsequent
|
||||
/// calls may fail due to [`utils::poison::Poison::check_and_arm`] calls.
|
||||
/// Dispose of this process instance and create a new one.
|
||||
async fn apply_wal_records0(&self, writebuf: &[u8]) -> anyhow::Result<Bytes> {
|
||||
let request_no = {
|
||||
let mut lock_guard = self.stdin.lock().await;
|
||||
let mut poison_guard = lock_guard.check_and_arm()?;
|
||||
let input = poison_guard.data_mut();
|
||||
input
|
||||
.stdin
|
||||
.write_all(writebuf)
|
||||
.await
|
||||
.context("write to walredo stdin")?;
|
||||
let request_no = input.n_requests;
|
||||
input.n_requests += 1;
|
||||
poison_guard.disarm();
|
||||
request_no
|
||||
};
|
||||
|
||||
// To improve walredo performance we separate sending requests and receiving
|
||||
// responses. Them are protected by different mutexes (output and input).
|
||||
// If thread T1, T2, T3 send requests D1, D2, D3 to walredo process
|
||||
// then there is not warranty that T1 will first granted output mutex lock.
|
||||
// To address this issue we maintain number of sent requests, number of processed
|
||||
// responses and ring buffer with pending responses. After sending response
|
||||
// (under input mutex), threads remembers request number. Then it releases
|
||||
// input mutex, locks output mutex and fetch in ring buffer all responses until
|
||||
// its stored request number. The it takes correspondent element from
|
||||
// pending responses ring buffer and truncate all empty elements from the front,
|
||||
// advancing processed responses number.
|
||||
|
||||
let mut lock_guard = self.stdout.lock().await;
|
||||
let mut poison_guard = lock_guard.check_and_arm()?;
|
||||
let output = poison_guard.data_mut();
|
||||
let n_processed_responses = output.n_processed_responses;
|
||||
while n_processed_responses + output.pending_responses.len() <= request_no {
|
||||
// We expect the WAL redo process to respond with an 8k page image. We read it
|
||||
// into this buffer.
|
||||
let mut resultbuf = vec![0; BLCKSZ.into()];
|
||||
output
|
||||
.stdout
|
||||
.read_exact(&mut resultbuf)
|
||||
.await
|
||||
.context("read walredo stdout")?;
|
||||
output
|
||||
.pending_responses
|
||||
.push_back(Some(Bytes::from(resultbuf)));
|
||||
}
|
||||
// Replace our request's response with None in `pending_responses`.
|
||||
// Then make space in the ring buffer by clearing out any seqence of contiguous
|
||||
// `None`'s from the front of `pending_responses`.
|
||||
// NB: We can't pop_front() because other requests' responses because another
|
||||
// requester might have grabbed the output mutex before us:
|
||||
// T1: grab input mutex
|
||||
// T1: send request_no 23
|
||||
// T1: release input mutex
|
||||
// T2: grab input mutex
|
||||
// T2: send request_no 24
|
||||
// T2: release input mutex
|
||||
// T2: grab output mutex
|
||||
// T2: n_processed_responses + output.pending_responses.len() <= request_no
|
||||
// 23 0 24
|
||||
// T2: enters poll loop that reads stdout
|
||||
// T2: put response for 23 into pending_responses
|
||||
// T2: put response for 24 into pending_resposnes
|
||||
// pending_responses now looks like this: Front Some(response_23) Some(response_24) Back
|
||||
// T2: takes its response_24
|
||||
// pending_responses now looks like this: Front Some(response_23) None Back
|
||||
// T2: does the while loop below
|
||||
// pending_responses now looks like this: Front Some(response_23) None Back
|
||||
// T2: releases output mutex
|
||||
// T1: grabs output mutex
|
||||
// T1: n_processed_responses + output.pending_responses.len() > request_no
|
||||
// 23 2 23
|
||||
// T1: skips poll loop that reads stdout
|
||||
// T1: takes its response_23
|
||||
// pending_responses now looks like this: Front None None Back
|
||||
// T2: does the while loop below
|
||||
// pending_responses now looks like this: Front Back
|
||||
// n_processed_responses now has value 25
|
||||
let res = output.pending_responses[request_no - n_processed_responses]
|
||||
.take()
|
||||
.expect("we own this request_no, nobody else is supposed to take it");
|
||||
while let Some(front) = output.pending_responses.front() {
|
||||
if front.is_none() {
|
||||
output.pending_responses.pop_front();
|
||||
output.n_processed_responses += 1;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
poison_guard.disarm();
|
||||
Ok(res)
|
||||
}
|
||||
|
||||
/// Saves the bytes of a failed walredo request to a file below
/// `conf.tenant_path(..)` and reports the outcome at error level.
///
/// The file name is built from the current unix time in milliseconds, the
/// input length and this instance's `dump_sequence` counter.
#[cfg(feature = "testing")]
fn record_and_log(&self, writebuf: &[u8]) {
    use std::io::Write;
    use std::sync::atomic::Ordering;
    use std::time::{SystemTime, UNIX_EPOCH};

    let millis = SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .unwrap()
        .as_millis();
    let seq = self.dump_sequence.fetch_add(1, Ordering::Relaxed);

    // these files will be collected to an allure report
    let filename = format!("walredo-{millis}-{}-{seq}.walredo", writebuf.len());
    let path = self.conf.tenant_path(&self.tenant_shard_id).join(&filename);

    // `create_new` makes the open fail rather than overwrite an existing dump.
    let mut open_options = std::fs::OpenOptions::new();
    open_options.write(true).create_new(true).read(true);
    let res = open_options
        .open(path)
        .and_then(|mut file| file.write_all(writebuf));

    // Both outcomes are logged at error level to trip up allowed_errors.
    match res {
        Ok(()) => {
            tracing::error!(filename, "erroring walredo input saved");
        }
        Err(e) => {
            tracing::error!(target=%filename, length=writebuf.len(), "failed to write out the walredo errored input: {e}");
        }
    }
}
|
||||
|
||||
#[cfg(not(feature = "testing"))]
|
||||
fn record_and_log(&self, _: &[u8]) {}
|
||||
}
|
||||
|
||||
impl Drop for WalRedoProcess {
|
||||
fn drop(&mut self) {
|
||||
self.child
|
||||
.take()
|
||||
.expect("we only do this once")
|
||||
.kill_and_wait(WalRedoKillCause::WalRedoProcessDrop);
|
||||
// no way to wait for stderr_logger_task from Drop because that is async only
|
||||
}
|
||||
}
|
||||
405
pageserver/src/walredo/process/process_impl/process_std.rs
Normal file
405
pageserver/src/walredo/process/process_impl/process_std.rs
Normal file
@@ -0,0 +1,405 @@
|
||||
use self::no_leak_child::NoLeakChild;
|
||||
use crate::{
|
||||
config::PageServerConf,
|
||||
metrics::{WalRedoKillCause, WAL_REDO_PROCESS_COUNTERS, WAL_REDO_RECORD_COUNTER},
|
||||
walrecord::NeonWalRecord,
|
||||
walredo::process::{no_leak_child, protocol},
|
||||
};
|
||||
use anyhow::Context;
|
||||
use bytes::Bytes;
|
||||
use nix::poll::{PollFd, PollFlags};
|
||||
use pageserver_api::{reltag::RelTag, shard::TenantShardId};
|
||||
use postgres_ffi::BLCKSZ;
|
||||
use std::os::fd::AsRawFd;
|
||||
#[cfg(feature = "testing")]
|
||||
use std::sync::atomic::AtomicUsize;
|
||||
use std::{
|
||||
collections::VecDeque,
|
||||
io::{Read, Write},
|
||||
process::{ChildStdin, ChildStdout, Command, Stdio},
|
||||
sync::{Mutex, MutexGuard},
|
||||
time::Duration,
|
||||
};
|
||||
use tracing::{debug, error, instrument, Instrument};
|
||||
use utils::{lsn::Lsn, nonblock::set_nonblock};
|
||||
|
||||
pub struct WalRedoProcess {
|
||||
#[allow(dead_code)]
|
||||
conf: &'static PageServerConf,
|
||||
tenant_shard_id: TenantShardId,
|
||||
// Some() on construction, only becomes None on Drop.
|
||||
child: Option<NoLeakChild>,
|
||||
stdout: Mutex<ProcessOutput>,
|
||||
stdin: Mutex<ProcessInput>,
|
||||
/// Counter to separate same sized walredo inputs failing at the same millisecond.
|
||||
#[cfg(feature = "testing")]
|
||||
dump_sequence: AtomicUsize,
|
||||
}
|
||||
|
||||
/// Request-side state, guarded by the stdin mutex.
struct ProcessInput {
    /// Pipe connected to the child's stdin.
    stdin: ChildStdin,
    /// Number of requests written so far; the value before the increment is
    /// handed out as the request number.
    n_requests: usize,
}
|
||||
|
||||
struct ProcessOutput {
|
||||
stdout: ChildStdout,
|
||||
pending_responses: VecDeque<Option<Bytes>>,
|
||||
n_processed_responses: usize,
|
||||
}
|
||||
|
||||
impl WalRedoProcess {
|
||||
//
|
||||
// Start postgres binary in special WAL redo mode.
|
||||
//
|
||||
#[instrument(skip_all,fields(pg_version=pg_version))]
|
||||
pub(crate) fn launch(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_shard_id: TenantShardId,
|
||||
pg_version: u32,
|
||||
) -> anyhow::Result<Self> {
|
||||
crate::span::debug_assert_current_span_has_tenant_id();
|
||||
|
||||
let pg_bin_dir_path = conf.pg_bin_dir(pg_version).context("pg_bin_dir")?; // TODO these should be infallible.
|
||||
let pg_lib_dir_path = conf.pg_lib_dir(pg_version).context("pg_lib_dir")?;
|
||||
|
||||
use no_leak_child::NoLeakChildCommandExt;
|
||||
// Start postgres itself
|
||||
let child = Command::new(pg_bin_dir_path.join("postgres"))
|
||||
// the first arg must be --wal-redo so the child process enters into walredo mode
|
||||
.arg("--wal-redo")
|
||||
// the child doesn't process this arg, but, having it in the argv helps indentify the
|
||||
// walredo process for a particular tenant when debugging a pagserver
|
||||
.args(["--tenant-shard-id", &format!("{tenant_shard_id}")])
|
||||
.stdin(Stdio::piped())
|
||||
.stderr(Stdio::piped())
|
||||
.stdout(Stdio::piped())
|
||||
.env_clear()
|
||||
.env("LD_LIBRARY_PATH", &pg_lib_dir_path)
|
||||
.env("DYLD_LIBRARY_PATH", &pg_lib_dir_path)
|
||||
// NB: The redo process is not trusted after we sent it the first
|
||||
// walredo work. Before that, it is trusted. Specifically, we trust
|
||||
// it to
|
||||
// 1. close all file descriptors except stdin, stdout, stderr because
|
||||
// pageserver might not be 100% diligent in setting FD_CLOEXEC on all
|
||||
// the files it opens, and
|
||||
// 2. to use seccomp to sandbox itself before processing the first
|
||||
// walredo request.
|
||||
.spawn_no_leak_child(tenant_shard_id)
|
||||
.context("spawn process")?;
|
||||
WAL_REDO_PROCESS_COUNTERS.started.inc();
|
||||
let mut child = scopeguard::guard(child, |child| {
|
||||
error!("killing wal-redo-postgres process due to a problem during launch");
|
||||
child.kill_and_wait(WalRedoKillCause::Startup);
|
||||
});
|
||||
|
||||
let stdin = child.stdin.take().unwrap();
|
||||
let stdout = child.stdout.take().unwrap();
|
||||
let stderr = child.stderr.take().unwrap();
|
||||
let stderr = tokio::process::ChildStderr::from_std(stderr)
|
||||
.context("convert to tokio::ChildStderr")?;
|
||||
macro_rules! set_nonblock_or_log_err {
|
||||
($file:ident) => {{
|
||||
let res = set_nonblock($file.as_raw_fd());
|
||||
if let Err(e) = &res {
|
||||
error!(error = %e, file = stringify!($file), pid = child.id(), "set_nonblock failed");
|
||||
}
|
||||
res
|
||||
}};
|
||||
}
|
||||
set_nonblock_or_log_err!(stdin)?;
|
||||
set_nonblock_or_log_err!(stdout)?;
|
||||
|
||||
// all fallible operations post-spawn are complete, so get rid of the guard
|
||||
let child = scopeguard::ScopeGuard::into_inner(child);
|
||||
|
||||
tokio::spawn(
|
||||
async move {
|
||||
scopeguard::defer! {
|
||||
debug!("wal-redo-postgres stderr_logger_task finished");
|
||||
crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_finished.inc();
|
||||
}
|
||||
debug!("wal-redo-postgres stderr_logger_task started");
|
||||
crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_started.inc();
|
||||
|
||||
use tokio::io::AsyncBufReadExt;
|
||||
let mut stderr_lines = tokio::io::BufReader::new(stderr);
|
||||
let mut buf = Vec::new();
|
||||
let res = loop {
|
||||
buf.clear();
|
||||
// TODO we don't trust the process to cap its stderr length.
|
||||
// Currently it can do unbounded Vec allocation.
|
||||
match stderr_lines.read_until(b'\n', &mut buf).await {
|
||||
Ok(0) => break Ok(()), // eof
|
||||
Ok(num_bytes) => {
|
||||
let output = String::from_utf8_lossy(&buf[..num_bytes]);
|
||||
error!(%output, "received output");
|
||||
}
|
||||
Err(e) => {
|
||||
break Err(e);
|
||||
}
|
||||
}
|
||||
};
|
||||
match res {
|
||||
Ok(()) => (),
|
||||
Err(e) => {
|
||||
error!(error=?e, "failed to read from walredo stderr");
|
||||
}
|
||||
}
|
||||
}.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %pg_version))
|
||||
);
|
||||
|
||||
Ok(Self {
|
||||
conf,
|
||||
tenant_shard_id,
|
||||
child: Some(child),
|
||||
stdin: Mutex::new(ProcessInput {
|
||||
stdin,
|
||||
n_requests: 0,
|
||||
}),
|
||||
stdout: Mutex::new(ProcessOutput {
|
||||
stdout,
|
||||
pending_responses: VecDeque::new(),
|
||||
n_processed_responses: 0,
|
||||
}),
|
||||
#[cfg(feature = "testing")]
|
||||
dump_sequence: AtomicUsize::default(),
|
||||
})
|
||||
}
|
||||
|
||||
pub(crate) fn id(&self) -> u32 {
|
||||
self.child
|
||||
.as_ref()
|
||||
.expect("must not call this during Drop")
|
||||
.id()
|
||||
}
|
||||
|
||||
// Apply given WAL records ('records') over an old page image. Returns
|
||||
// new page image.
|
||||
//
|
||||
#[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), pid=%self.id()))]
|
||||
pub(crate) async fn apply_wal_records(
|
||||
&self,
|
||||
rel: RelTag,
|
||||
blknum: u32,
|
||||
base_img: &Option<Bytes>,
|
||||
records: &[(Lsn, NeonWalRecord)],
|
||||
wal_redo_timeout: Duration,
|
||||
) -> anyhow::Result<Bytes> {
|
||||
let tag = protocol::BufferTag { rel, blknum };
|
||||
let input = self.stdin.lock().unwrap();
|
||||
|
||||
// Serialize all the messages to send the WAL redo process first.
|
||||
//
|
||||
// This could be problematic if there are millions of records to replay,
|
||||
// but in practice the number of records is usually so small that it doesn't
|
||||
// matter, and it's better to keep this code simple.
|
||||
//
|
||||
// Most requests start with a before-image with BLCKSZ bytes, followed by
|
||||
// by some other WAL records. Start with a buffer that can hold that
|
||||
// comfortably.
|
||||
let mut writebuf: Vec<u8> = Vec::with_capacity((BLCKSZ as usize) * 3);
|
||||
protocol::build_begin_redo_for_block_msg(tag, &mut writebuf);
|
||||
if let Some(img) = base_img {
|
||||
protocol::build_push_page_msg(tag, img, &mut writebuf);
|
||||
}
|
||||
for (lsn, rec) in records.iter() {
|
||||
if let NeonWalRecord::Postgres {
|
||||
will_init: _,
|
||||
rec: postgres_rec,
|
||||
} = rec
|
||||
{
|
||||
protocol::build_apply_record_msg(*lsn, postgres_rec, &mut writebuf);
|
||||
} else {
|
||||
anyhow::bail!("tried to pass neon wal record to postgres WAL redo");
|
||||
}
|
||||
}
|
||||
protocol::build_get_page_msg(tag, &mut writebuf);
|
||||
WAL_REDO_RECORD_COUNTER.inc_by(records.len() as u64);
|
||||
|
||||
let res = self.apply_wal_records0(&writebuf, input, wal_redo_timeout);
|
||||
|
||||
if res.is_err() {
|
||||
// not all of these can be caused by this particular input, however these are so rare
|
||||
// in tests so capture all.
|
||||
self.record_and_log(&writebuf);
|
||||
}
|
||||
|
||||
res
|
||||
}
|
||||
|
||||
fn apply_wal_records0(
|
||||
&self,
|
||||
writebuf: &[u8],
|
||||
input: MutexGuard<ProcessInput>,
|
||||
wal_redo_timeout: Duration,
|
||||
) -> anyhow::Result<Bytes> {
|
||||
let mut proc = { input }; // TODO: remove this legacy rename, but this keep the patch small.
|
||||
let mut nwrite = 0usize;
|
||||
|
||||
while nwrite < writebuf.len() {
|
||||
let mut stdin_pollfds = [PollFd::new(&proc.stdin, PollFlags::POLLOUT)];
|
||||
let n = loop {
|
||||
match nix::poll::poll(&mut stdin_pollfds[..], wal_redo_timeout.as_millis() as i32) {
|
||||
Err(nix::errno::Errno::EINTR) => continue,
|
||||
res => break res,
|
||||
}
|
||||
}?;
|
||||
|
||||
if n == 0 {
|
||||
anyhow::bail!("WAL redo timed out");
|
||||
}
|
||||
|
||||
// If 'stdin' is writeable, do write.
|
||||
let in_revents = stdin_pollfds[0].revents().unwrap();
|
||||
if in_revents & (PollFlags::POLLERR | PollFlags::POLLOUT) != PollFlags::empty() {
|
||||
nwrite += proc.stdin.write(&writebuf[nwrite..])?;
|
||||
}
|
||||
if in_revents.contains(PollFlags::POLLHUP) {
|
||||
// We still have more data to write, but the process closed the pipe.
|
||||
anyhow::bail!("WAL redo process closed its stdin unexpectedly");
|
||||
}
|
||||
}
|
||||
let request_no = proc.n_requests;
|
||||
proc.n_requests += 1;
|
||||
drop(proc);
|
||||
|
||||
// To improve walredo performance we separate sending requests and receiving
|
||||
// responses. Them are protected by different mutexes (output and input).
|
||||
// If thread T1, T2, T3 send requests D1, D2, D3 to walredo process
|
||||
// then there is not warranty that T1 will first granted output mutex lock.
|
||||
// To address this issue we maintain number of sent requests, number of processed
|
||||
// responses and ring buffer with pending responses. After sending response
|
||||
// (under input mutex), threads remembers request number. Then it releases
|
||||
// input mutex, locks output mutex and fetch in ring buffer all responses until
|
||||
// its stored request number. The it takes correspondent element from
|
||||
// pending responses ring buffer and truncate all empty elements from the front,
|
||||
// advancing processed responses number.
|
||||
|
||||
let mut output = self.stdout.lock().unwrap();
|
||||
let n_processed_responses = output.n_processed_responses;
|
||||
while n_processed_responses + output.pending_responses.len() <= request_no {
|
||||
// We expect the WAL redo process to respond with an 8k page image. We read it
|
||||
// into this buffer.
|
||||
let mut resultbuf = vec![0; BLCKSZ.into()];
|
||||
let mut nresult: usize = 0; // # of bytes read into 'resultbuf' so far
|
||||
while nresult < BLCKSZ.into() {
|
||||
let mut stdout_pollfds = [PollFd::new(&output.stdout, PollFlags::POLLIN)];
|
||||
// We do two things simultaneously: reading response from stdout
|
||||
// and forward any logging information that the child writes to its stderr to the page server's log.
|
||||
let n = loop {
|
||||
match nix::poll::poll(
|
||||
&mut stdout_pollfds[..],
|
||||
wal_redo_timeout.as_millis() as i32,
|
||||
) {
|
||||
Err(nix::errno::Errno::EINTR) => continue,
|
||||
res => break res,
|
||||
}
|
||||
}?;
|
||||
|
||||
if n == 0 {
|
||||
anyhow::bail!("WAL redo timed out");
|
||||
}
|
||||
|
||||
// If we have some data in stdout, read it to the result buffer.
|
||||
let out_revents = stdout_pollfds[0].revents().unwrap();
|
||||
if out_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() {
|
||||
nresult += output.stdout.read(&mut resultbuf[nresult..])?;
|
||||
}
|
||||
if out_revents.contains(PollFlags::POLLHUP) {
|
||||
anyhow::bail!("WAL redo process closed its stdout unexpectedly");
|
||||
}
|
||||
}
|
||||
output
|
||||
.pending_responses
|
||||
.push_back(Some(Bytes::from(resultbuf)));
|
||||
}
|
||||
// Replace our request's response with None in `pending_responses`.
|
||||
// Then make space in the ring buffer by clearing out any seqence of contiguous
|
||||
// `None`'s from the front of `pending_responses`.
|
||||
// NB: We can't pop_front() because other requests' responses because another
|
||||
// requester might have grabbed the output mutex before us:
|
||||
// T1: grab input mutex
|
||||
// T1: send request_no 23
|
||||
// T1: release input mutex
|
||||
// T2: grab input mutex
|
||||
// T2: send request_no 24
|
||||
// T2: release input mutex
|
||||
// T2: grab output mutex
|
||||
// T2: n_processed_responses + output.pending_responses.len() <= request_no
|
||||
// 23 0 24
|
||||
// T2: enters poll loop that reads stdout
|
||||
// T2: put response for 23 into pending_responses
|
||||
// T2: put response for 24 into pending_resposnes
|
||||
// pending_responses now looks like this: Front Some(response_23) Some(response_24) Back
|
||||
// T2: takes its response_24
|
||||
// pending_responses now looks like this: Front Some(response_23) None Back
|
||||
// T2: does the while loop below
|
||||
// pending_responses now looks like this: Front Some(response_23) None Back
|
||||
// T2: releases output mutex
|
||||
// T1: grabs output mutex
|
||||
// T1: n_processed_responses + output.pending_responses.len() > request_no
|
||||
// 23 2 23
|
||||
// T1: skips poll loop that reads stdout
|
||||
// T1: takes its response_23
|
||||
// pending_responses now looks like this: Front None None Back
|
||||
// T1: does the while loop below
|
||||
// pending_responses now looks like this: Front Back
|
||||
// n_processed_responses now has value 25
|
||||
let res = output.pending_responses[request_no - n_processed_responses]
|
||||
.take()
|
||||
.expect("we own this request_no, nobody else is supposed to take it");
|
||||
while let Some(front) = output.pending_responses.front() {
|
||||
if front.is_none() {
|
||||
output.pending_responses.pop_front();
|
||||
output.n_processed_responses += 1;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
Ok(res)
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
fn record_and_log(&self, writebuf: &[u8]) {
|
||||
use std::sync::atomic::Ordering;
|
||||
|
||||
let millis = std::time::SystemTime::now()
|
||||
.duration_since(std::time::SystemTime::UNIX_EPOCH)
|
||||
.unwrap()
|
||||
.as_millis();
|
||||
|
||||
let seq = self.dump_sequence.fetch_add(1, Ordering::Relaxed);
|
||||
|
||||
// these files will be collected to an allure report
|
||||
let filename = format!("walredo-{millis}-{}-{seq}.walredo", writebuf.len());
|
||||
|
||||
let path = self.conf.tenant_path(&self.tenant_shard_id).join(&filename);
|
||||
|
||||
let res = std::fs::OpenOptions::new()
|
||||
.write(true)
|
||||
.create_new(true)
|
||||
.read(true)
|
||||
.open(path)
|
||||
.and_then(|mut f| f.write_all(writebuf));
|
||||
|
||||
// trip up allowed_errors
|
||||
if let Err(e) = res {
|
||||
tracing::error!(target=%filename, length=writebuf.len(), "failed to write out the walredo errored input: {e}");
|
||||
} else {
|
||||
tracing::error!(filename, "erroring walredo input saved");
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(not(feature = "testing"))]
|
||||
fn record_and_log(&self, _: &[u8]) {}
|
||||
}
|
||||
|
||||
impl Drop for WalRedoProcess {
|
||||
fn drop(&mut self) {
|
||||
self.child
|
||||
.take()
|
||||
.expect("we only do this once")
|
||||
.kill_and_wait(WalRedoKillCause::WalRedoProcessDrop);
|
||||
// no way to wait for stderr_logger_task from Drop because that is async only
|
||||
}
|
||||
}
|
||||
@@ -49,8 +49,6 @@ char *neon_auth_token;
|
||||
int readahead_buffer_size = 128;
|
||||
int flush_every_n_requests = 8;
|
||||
|
||||
int neon_protocol_version;
|
||||
|
||||
static int n_reconnect_attempts = 0;
|
||||
static int max_reconnect_attempts = 60;
|
||||
static int stripe_size;
|
||||
@@ -846,14 +844,6 @@ pg_init_libpagestore(void)
|
||||
PGC_USERSET,
|
||||
0, /* no flags required */
|
||||
NULL, (GucIntAssignHook) &readahead_buffer_resize, NULL);
|
||||
DefineCustomIntVariable("neon.protocol_version",
|
||||
"Version of compute<->page server protocol",
|
||||
NULL,
|
||||
&neon_protocol_version,
|
||||
NEON_PROTOCOL_VERSION, 1, 2,
|
||||
PGC_USERSET,
|
||||
0, /* no flags required */
|
||||
NULL, NULL, NULL);
|
||||
|
||||
relsize_hash_init();
|
||||
|
||||
|
||||
@@ -28,13 +28,6 @@
|
||||
#define MAX_SHARDS 128
|
||||
#define MAX_PAGESERVER_CONNSTRING_SIZE 256
|
||||
|
||||
/*
|
||||
* Currently, the protocol version is not sent to the server.
|
||||
* So it is critical that format of existing commands is not changed.
|
||||
* New protocol versions can just add new commands.
|
||||
*/
|
||||
#define NEON_PROTOCOL_VERSION 2
|
||||
|
||||
typedef enum
|
||||
{
|
||||
/* pagestore_client -> pagestore */
|
||||
@@ -44,12 +37,6 @@ typedef enum
|
||||
T_NeonDbSizeRequest,
|
||||
T_NeonGetSlruSegmentRequest,
|
||||
|
||||
T_NeonExistsV2Request = 10, /* new protocol message tags start from 10 */
|
||||
T_NeonNblocksV2Request,
|
||||
T_NeonGetPageV2Request,
|
||||
T_NeonDbSizeV2Request,
|
||||
T_NeonGetSlruSegmentV2Request,
|
||||
|
||||
/* pagestore -> pagestore_client */
|
||||
T_NeonExistsResponse = 100,
|
||||
T_NeonNblocksResponse,
|
||||
@@ -82,33 +69,18 @@ typedef enum {
|
||||
SLRU_MULTIXACT_OFFSETS
|
||||
} SlruKind;
|
||||
|
||||
/*--
|
||||
* supertype of all the Neon*Request structs below.
|
||||
/*
|
||||
* supertype of all the Neon*Request structs below
|
||||
*
|
||||
* All requests contain two LSNs:
|
||||
*
|
||||
* lsn: request page (or relation size, etc) at this LSN
|
||||
* not_modified_since: Hint that the page hasn't been modified between
|
||||
* this LSN and the request LSN (`lsn`).
|
||||
*
|
||||
* To request the latest version of a page, you can use MAX_LSN as the request
|
||||
* LSN.
|
||||
*
|
||||
* If you don't know any better, you can always set 'not_modified_since' equal
|
||||
* to 'lsn', but providing a lower value can speed up processing the request
|
||||
* in the pageserver, as it doesn't need to wait for the WAL to arrive, and it
|
||||
* can skip traversing through recent layers which we know to not contain any
|
||||
* versions for the requested page.
|
||||
*
|
||||
* These structs describe the V2 of these requests. The old V1 protocol contained
|
||||
* just one LSN and a boolean 'latest' flag. If the neon_protocol_version GUC is
|
||||
* set to 1, we will convert these to the V1 requests before sending.
|
||||
* If 'latest' is true, we are requesting the latest page version, and 'lsn'
|
||||
* is just a hint to the server that we know there are no versions of the page
|
||||
* (or relation size, for exists/nblocks requests) later than the 'lsn'.
|
||||
*/
|
||||
typedef struct
|
||||
{
|
||||
NeonMessageTag tag;
|
||||
XLogRecPtr lsn;
|
||||
XLogRecPtr not_modified_since;
|
||||
bool latest; /* if true, request latest page version */
|
||||
XLogRecPtr lsn; /* request page version @ this LSN */
|
||||
} NeonRequest;
|
||||
|
||||
typedef struct
|
||||
@@ -221,7 +193,6 @@ extern int readahead_buffer_size;
|
||||
extern char *neon_timeline;
|
||||
extern char *neon_tenant;
|
||||
extern int32 max_cluster_size;
|
||||
extern int neon_protocol_version;
|
||||
|
||||
extern shardno_t get_shard_number(BufferTag* tag);
|
||||
|
||||
@@ -254,14 +225,14 @@ extern bool neon_prefetch(SMgrRelation reln, ForkNumber forknum,
|
||||
extern void neon_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
|
||||
char *buffer);
|
||||
extern PGDLLEXPORT void neon_read_at_lsn(NRelFileInfo rnode, ForkNumber forkNum, BlockNumber blkno,
|
||||
XLogRecPtr request_lsn, XLogRecPtr not_modified_since, char *buffer);
|
||||
XLogRecPtr request_lsn, bool request_latest, char *buffer);
|
||||
extern void neon_write(SMgrRelation reln, ForkNumber forknum,
|
||||
BlockNumber blocknum, char *buffer, bool skipFsync);
|
||||
#else
|
||||
extern void neon_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
|
||||
void *buffer);
|
||||
extern PGDLLEXPORT void neon_read_at_lsn(NRelFileInfo rnode, ForkNumber forkNum, BlockNumber blkno,
|
||||
XLogRecPtr request_lsn, XLogRecPtr not_modified_since, void *buffer);
|
||||
XLogRecPtr request_lsn, bool request_latest, void *buffer);
|
||||
extern void neon_write(SMgrRelation reln, ForkNumber forknum,
|
||||
BlockNumber blocknum, const void *buffer, bool skipFsync);
|
||||
#endif
|
||||
|
||||
@@ -168,8 +168,8 @@ typedef enum PrefetchStatus
|
||||
typedef struct PrefetchRequest
|
||||
{
|
||||
BufferTag buftag; /* must be first entry in the struct */
|
||||
XLogRecPtr request_lsn;
|
||||
XLogRecPtr not_modified_since;
|
||||
XLogRecPtr effective_request_lsn;
|
||||
XLogRecPtr actual_request_lsn;
|
||||
NeonResponse *response; /* may be null */
|
||||
PrefetchStatus status;
|
||||
shardno_t shard_no;
|
||||
@@ -269,17 +269,19 @@ static PrefetchState *MyPState;
|
||||
) \
|
||||
)
|
||||
|
||||
static XLogRecPtr prefetch_lsn = 0;
|
||||
|
||||
static bool compact_prefetch_buffers(void);
|
||||
static void consume_prefetch_responses(void);
|
||||
static uint64 prefetch_register_buffer(BufferTag tag, XLogRecPtr *force_request_lsn, XLogRecPtr *force_not_modified_since);
|
||||
static uint64 prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_lsn);
|
||||
static bool prefetch_read(PrefetchRequest *slot);
|
||||
static void prefetch_do_request(PrefetchRequest *slot, XLogRecPtr *force_request_lsn, XLogRecPtr *force_not_modified_since);
|
||||
static void prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force_lsn);
|
||||
static bool prefetch_wait_for(uint64 ring_index);
|
||||
static void prefetch_cleanup_trailing_unused(void);
|
||||
static inline void prefetch_set_unused(uint64 ring_index);
|
||||
|
||||
static void neon_get_request_lsn(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
|
||||
XLogRecPtr *request_lsn, XLogRecPtr *not_modified_since);
|
||||
static XLogRecPtr neon_get_request_lsn(bool *latest, NRelFileInfo rinfo,
|
||||
ForkNumber forknum, BlockNumber blkno);
|
||||
|
||||
static bool
|
||||
compact_prefetch_buffers(void)
|
||||
@@ -336,8 +338,8 @@ compact_prefetch_buffers(void)
|
||||
target_slot->shard_no = source_slot->shard_no;
|
||||
target_slot->status = source_slot->status;
|
||||
target_slot->response = source_slot->response;
|
||||
target_slot->request_lsn = source_slot->request_lsn;
|
||||
target_slot->not_modified_since = source_slot->not_modified_since;
|
||||
target_slot->effective_request_lsn = source_slot->effective_request_lsn;
|
||||
target_slot->actual_request_lsn = source_slot->actual_request_lsn;
|
||||
target_slot->my_ring_index = empty_ring_index;
|
||||
|
||||
prfh_delete(MyPState->prf_hash, source_slot);
|
||||
@@ -356,8 +358,7 @@ compact_prefetch_buffers(void)
|
||||
};
|
||||
source_slot->response = NULL;
|
||||
source_slot->my_ring_index = 0;
|
||||
source_slot->request_lsn = InvalidXLogRecPtr;
|
||||
source_slot->not_modified_since = InvalidXLogRecPtr;
|
||||
source_slot->effective_request_lsn = 0;
|
||||
|
||||
/* update bookkeeping */
|
||||
n_moved++;
|
||||
@@ -683,35 +684,54 @@ prefetch_set_unused(uint64 ring_index)
|
||||
}
|
||||
|
||||
static void
|
||||
prefetch_do_request(PrefetchRequest *slot, XLogRecPtr *force_request_lsn, XLogRecPtr *force_not_modified_since)
|
||||
prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force_lsn)
|
||||
{
|
||||
bool found;
|
||||
NeonGetPageRequest request = {
|
||||
.req.tag = T_NeonGetPageRequest,
|
||||
/* lsn and not_modified_since are filled in below */
|
||||
.req.latest = false,
|
||||
.req.lsn = 0,
|
||||
.rinfo = BufTagGetNRelFileInfo(slot->buftag),
|
||||
.forknum = slot->buftag.forkNum,
|
||||
.blkno = slot->buftag.blockNum,
|
||||
};
|
||||
|
||||
Assert(((force_request_lsn != NULL) == (force_not_modified_since != NULL)));
|
||||
|
||||
if (force_request_lsn)
|
||||
if (force_lsn && force_latest)
|
||||
{
|
||||
request.req.lsn = *force_request_lsn;
|
||||
request.req.not_modified_since = *force_not_modified_since;
|
||||
slot->request_lsn = *force_request_lsn;
|
||||
slot->not_modified_since = *force_not_modified_since;
|
||||
request.req.lsn = *force_lsn;
|
||||
request.req.latest = *force_latest;
|
||||
slot->actual_request_lsn = slot->effective_request_lsn = *force_lsn;
|
||||
}
|
||||
else
|
||||
{
|
||||
neon_get_request_lsn(BufTagGetNRelFileInfo(slot->buftag),
|
||||
slot->buftag.forkNum,
|
||||
slot->buftag.blockNum,
|
||||
&request.req.lsn,
|
||||
&request.req.not_modified_since);
|
||||
slot->request_lsn = request.req.lsn;
|
||||
slot->not_modified_since = request.req.not_modified_since;
|
||||
XLogRecPtr lsn = neon_get_request_lsn(
|
||||
&request.req.latest,
|
||||
BufTagGetNRelFileInfo(slot->buftag),
|
||||
slot->buftag.forkNum,
|
||||
slot->buftag.blockNum
|
||||
);
|
||||
|
||||
/*
|
||||
* Note: effective_request_lsn is potentially higher than the
|
||||
* requested LSN, but still correct:
|
||||
*
|
||||
* We know there are no changes between the actual requested LSN and
|
||||
* the value of effective_request_lsn: If there were, the page would
|
||||
* have been in cache and evicted between those LSN values, which then
|
||||
* would have had to result in a larger request LSN for this page.
|
||||
*
|
||||
* It is possible that a concurrent backend loads the page, modifies
|
||||
* it and then evicts it again, but the LSN of that eviction cannot be
|
||||
* smaller than the current WAL insert/redo pointer, which is already
|
||||
* larger than this prefetch_lsn. So in any case, that would
|
||||
* invalidate this cache.
|
||||
*
|
||||
* The best LSN to use for effective_request_lsn would be
|
||||
* XLogCtl->Insert.RedoRecPtr, but that's expensive to access.
|
||||
*/
|
||||
slot->actual_request_lsn = request.req.lsn = lsn;
|
||||
prefetch_lsn = Max(prefetch_lsn, lsn);
|
||||
slot->effective_request_lsn = prefetch_lsn;
|
||||
}
|
||||
|
||||
Assert(slot->response == NULL);
|
||||
@@ -729,6 +749,7 @@ prefetch_do_request(PrefetchRequest *slot, XLogRecPtr *force_request_lsn, XLogRe
|
||||
/* update slot state */
|
||||
slot->status = PRFS_REQUESTED;
|
||||
|
||||
|
||||
prfh_insert(MyPState->prf_hash, slot, &found);
|
||||
Assert(!found);
|
||||
}
|
||||
@@ -738,25 +759,22 @@ prefetch_do_request(PrefetchRequest *slot, XLogRecPtr *force_request_lsn, XLogRe
|
||||
*
|
||||
* Register that we may want the contents of BufferTag in the near future.
|
||||
*
|
||||
* If force_request_lsn and force_not_modified_since are not NULL, those
|
||||
* values are sent to the pageserver. If they are NULL, we utilize the
|
||||
* lastWrittenLsn -infrastructure to fill them in.
|
||||
* If force_latest and force_lsn are not NULL, those values are sent to the
|
||||
* pageserver. If they are NULL, we utilize the lastWrittenLsn -infrastructure
|
||||
* to fill in these values manually.
|
||||
*
|
||||
* NOTE: this function may indirectly update MyPState->pfs_hash; which
|
||||
* invalidates any active pointers into the hash table.
|
||||
*/
|
||||
|
||||
static uint64
|
||||
prefetch_register_buffer(BufferTag tag, XLogRecPtr *force_request_lsn,
|
||||
XLogRecPtr *force_not_modified_since)
|
||||
prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_lsn)
|
||||
{
|
||||
uint64 ring_index;
|
||||
PrefetchRequest req;
|
||||
PrefetchRequest *slot;
|
||||
PrfHashEntry *entry;
|
||||
|
||||
Assert(((force_request_lsn != NULL) == (force_not_modified_since != NULL)));
|
||||
|
||||
/* use an intermediate PrefetchRequest struct to ensure correct alignment */
|
||||
req.buftag = tag;
|
||||
Retry:
|
||||
@@ -777,31 +795,37 @@ Retry:
|
||||
* If we want a specific lsn, we do not accept requests that were made
|
||||
* with a potentially different LSN.
|
||||
*/
|
||||
if (force_request_lsn)
|
||||
if (force_latest && force_lsn)
|
||||
{
|
||||
/*
|
||||
* The not_modified_since..request_lsn range of each request is
|
||||
* effectively a claim that the page has not been modified between
|
||||
* those LSNs. Therefore, if the range of the old request in the
|
||||
* queue overlaps with the new request, we know that the page
|
||||
* hasn't been modified in the union of the ranges. We can reuse
|
||||
* the old request in that case.
|
||||
*
|
||||
* The new request's LSN should never be older than the old one,
|
||||
* so don't bother checking that case.
|
||||
* if we want the latest version, any effective_request_lsn <
|
||||
* request lsn is OK
|
||||
*/
|
||||
if (*force_request_lsn >= slot->not_modified_since &&
|
||||
*force_not_modified_since <= slot->request_lsn)
|
||||
if (*force_latest)
|
||||
{
|
||||
/* the old request overlaps with the new one; keep it */
|
||||
if (*force_lsn > slot->effective_request_lsn)
|
||||
{
|
||||
if (!prefetch_wait_for(ring_index))
|
||||
goto Retry;
|
||||
prefetch_set_unused(ring_index);
|
||||
entry = NULL;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/*
|
||||
* if we don't want the latest version, only accept requests with
|
||||
* the exact same LSN
|
||||
*/
|
||||
else
|
||||
{
|
||||
/* Wait for the old request to finish and discard it */
|
||||
if (!prefetch_wait_for(ring_index))
|
||||
goto Retry;
|
||||
prefetch_set_unused(ring_index);
|
||||
entry = NULL;
|
||||
if (*force_lsn != slot->effective_request_lsn)
|
||||
{
|
||||
if (!prefetch_wait_for(ring_index))
|
||||
goto Retry;
|
||||
prefetch_set_unused(ring_index);
|
||||
entry = NULL;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -897,7 +921,7 @@ Retry:
|
||||
slot->shard_no = get_shard_number(&tag);
|
||||
slot->my_ring_index = ring_index;
|
||||
|
||||
prefetch_do_request(slot, force_request_lsn, force_not_modified_since);
|
||||
prefetch_do_request(slot, force_latest, force_lsn);
|
||||
Assert(slot->status == PRFS_REQUESTED);
|
||||
Assert(MyPState->ring_last <= ring_index &&
|
||||
ring_index < MyPState->ring_unused);
|
||||
@@ -973,66 +997,7 @@ nm_pack_request(NeonRequest *msg)
|
||||
StringInfoData s;
|
||||
|
||||
initStringInfo(&s);
|
||||
|
||||
if (neon_protocol_version >= 2)
|
||||
{
|
||||
pq_sendbyte(&s, msg->tag);
|
||||
pq_sendint64(&s, msg->lsn);
|
||||
pq_sendint64(&s, msg->not_modified_since);
|
||||
}
|
||||
else
|
||||
{
|
||||
NeonMessageTag tag;
|
||||
bool latest;
|
||||
XLogRecPtr lsn;
|
||||
|
||||
/*
|
||||
* In primary, we always request the latest page version.
|
||||
*/
|
||||
if (!RecoveryInProgress())
|
||||
{
|
||||
latest = true;
|
||||
lsn = msg->not_modified_since;
|
||||
}
|
||||
else
|
||||
{
|
||||
/*
|
||||
* In the current protocol, we cannot represent that we want to read
|
||||
* page at LSN X, and we know that it hasn't been modified since Y. We
|
||||
* can either use 'not_modified_lsn' as the request LSN, and risk
|
||||
* getting an error if that LSN is too old and has already fallen out
|
||||
* of the pageserver's GC horizon, or we can send 'request_lsn',
|
||||
* causing the pageserver to possibly wait for the recent WAL to
|
||||
* arrive unnecessarily. Or something in between. We choose to use the
|
||||
* old LSN and risk GC errors, because that's what we've done
|
||||
* historically.
|
||||
*/
|
||||
latest = false;
|
||||
lsn = msg->not_modified_since;
|
||||
}
|
||||
|
||||
switch(msg->tag)
|
||||
{
|
||||
case T_NeonExistsV2Request:
|
||||
tag = T_NeonExistsRequest;
|
||||
break;
|
||||
case T_NeonNblocksV2Request:
|
||||
tag = T_NeonNblocksRequest;
|
||||
break;
|
||||
case T_NeonGetPageV2Request:
|
||||
tag = T_NeonGetPageRequest;
|
||||
break;
|
||||
case T_NeonDbSizeV2Request:
|
||||
tag = T_NeonDbSizeRequest;
|
||||
break;
|
||||
case T_NeonGetSlruSegmentV2Request:
|
||||
tag = T_NeonGetSlruSegmentRequest;
|
||||
break;
|
||||
}
|
||||
pq_sendbyte(&s, tag);
|
||||
pq_sendbyte(&s, latest);
|
||||
pq_sendint64(&s, lsn);
|
||||
}
|
||||
pq_sendbyte(&s, msg->tag);
|
||||
|
||||
switch (messageTag(msg))
|
||||
{
|
||||
@@ -1041,6 +1006,8 @@ nm_pack_request(NeonRequest *msg)
|
||||
{
|
||||
NeonExistsRequest *msg_req = (NeonExistsRequest *) msg;
|
||||
|
||||
pq_sendbyte(&s, msg_req->req.latest);
|
||||
pq_sendint64(&s, msg_req->req.lsn);
|
||||
pq_sendint32(&s, NInfoGetSpcOid(msg_req->rinfo));
|
||||
pq_sendint32(&s, NInfoGetDbOid(msg_req->rinfo));
|
||||
pq_sendint32(&s, NInfoGetRelNumber(msg_req->rinfo));
|
||||
@@ -1052,6 +1019,8 @@ nm_pack_request(NeonRequest *msg)
|
||||
{
|
||||
NeonNblocksRequest *msg_req = (NeonNblocksRequest *) msg;
|
||||
|
||||
pq_sendbyte(&s, msg_req->req.latest);
|
||||
pq_sendint64(&s, msg_req->req.lsn);
|
||||
pq_sendint32(&s, NInfoGetSpcOid(msg_req->rinfo));
|
||||
pq_sendint32(&s, NInfoGetDbOid(msg_req->rinfo));
|
||||
pq_sendint32(&s, NInfoGetRelNumber(msg_req->rinfo));
|
||||
@@ -1063,6 +1032,8 @@ nm_pack_request(NeonRequest *msg)
|
||||
{
|
||||
NeonDbSizeRequest *msg_req = (NeonDbSizeRequest *) msg;
|
||||
|
||||
pq_sendbyte(&s, msg_req->req.latest);
|
||||
pq_sendint64(&s, msg_req->req.lsn);
|
||||
pq_sendint32(&s, msg_req->dbNode);
|
||||
|
||||
break;
|
||||
@@ -1071,6 +1042,8 @@ nm_pack_request(NeonRequest *msg)
|
||||
{
|
||||
NeonGetPageRequest *msg_req = (NeonGetPageRequest *) msg;
|
||||
|
||||
pq_sendbyte(&s, msg_req->req.latest);
|
||||
pq_sendint64(&s, msg_req->req.lsn);
|
||||
pq_sendint32(&s, NInfoGetSpcOid(msg_req->rinfo));
|
||||
pq_sendint32(&s, NInfoGetDbOid(msg_req->rinfo));
|
||||
pq_sendint32(&s, NInfoGetRelNumber(msg_req->rinfo));
|
||||
@@ -1084,6 +1057,8 @@ nm_pack_request(NeonRequest *msg)
|
||||
{
|
||||
NeonGetSlruSegmentRequest *msg_req = (NeonGetSlruSegmentRequest *) msg;
|
||||
|
||||
pq_sendbyte(&s, msg_req->req.latest);
|
||||
pq_sendint64(&s, msg_req->req.lsn);
|
||||
pq_sendbyte(&s, msg_req->kind);
|
||||
pq_sendint32(&s, msg_req->segno);
|
||||
|
||||
@@ -1234,7 +1209,7 @@ nm_to_string(NeonMessage *msg)
|
||||
appendStringInfo(&s, ", \"rinfo\": \"%u/%u/%u\"", RelFileInfoFmt(msg_req->rinfo));
|
||||
appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum);
|
||||
appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn));
|
||||
appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.not_modified_since));
|
||||
appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest);
|
||||
appendStringInfoChar(&s, '}');
|
||||
break;
|
||||
}
|
||||
@@ -1247,7 +1222,7 @@ nm_to_string(NeonMessage *msg)
|
||||
appendStringInfo(&s, ", \"rinfo\": \"%u/%u/%u\"", RelFileInfoFmt(msg_req->rinfo));
|
||||
appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum);
|
||||
appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn));
|
||||
appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.not_modified_since));
|
||||
appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest);
|
||||
appendStringInfoChar(&s, '}');
|
||||
break;
|
||||
}
|
||||
@@ -1261,7 +1236,7 @@ nm_to_string(NeonMessage *msg)
|
||||
appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum);
|
||||
appendStringInfo(&s, ", \"blkno\": %u", msg_req->blkno);
|
||||
appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn));
|
||||
appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.not_modified_since));
|
||||
appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest);
|
||||
appendStringInfoChar(&s, '}');
|
||||
break;
|
||||
}
|
||||
@@ -1272,7 +1247,7 @@ nm_to_string(NeonMessage *msg)
|
||||
appendStringInfoString(&s, "{\"type\": \"NeonDbSizeRequest\"");
|
||||
appendStringInfo(&s, ", \"dbnode\": \"%u\"", msg_req->dbNode);
|
||||
appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn));
|
||||
appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.not_modified_since));
|
||||
appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest);
|
||||
appendStringInfoChar(&s, '}');
|
||||
break;
|
||||
}
|
||||
@@ -1284,7 +1259,7 @@ nm_to_string(NeonMessage *msg)
|
||||
appendStringInfo(&s, ", \"kind\": %u", msg_req->kind);
|
||||
appendStringInfo(&s, ", \"segno\": %u", msg_req->segno);
|
||||
appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.lsn));
|
||||
appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->req.not_modified_since));
|
||||
appendStringInfo(&s, ", \"latest\": %d", msg_req->req.latest);
|
||||
appendStringInfoChar(&s, '}');
|
||||
break;
|
||||
}
|
||||
@@ -1556,36 +1531,44 @@ nm_adjust_lsn(XLogRecPtr lsn)
|
||||
/*
|
||||
* Return LSN for requesting pages and number of blocks from page server
|
||||
*/
|
||||
static void
|
||||
neon_get_request_lsn(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
|
||||
XLogRecPtr *request_lsn, XLogRecPtr *not_modified_since)
|
||||
static XLogRecPtr
|
||||
neon_get_request_lsn(bool *latest, NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno)
|
||||
{
|
||||
XLogRecPtr lsn;
|
||||
|
||||
if (RecoveryInProgress())
|
||||
{
|
||||
/* Request the page at the last replayed LSN. */
|
||||
*request_lsn = GetXLogReplayRecPtr(NULL);
|
||||
*not_modified_since = GetLastWrittenLSN(rinfo, forknum, blkno);
|
||||
/*
|
||||
* We don't know if WAL has been generated but not yet replayed, so
|
||||
* we're conservative in our estimates about latest pages.
|
||||
*/
|
||||
*latest = false;
|
||||
|
||||
neon_log(DEBUG1, "neon_get_request_lsn request lsn %X/%X, not_modified_since %X/%X",
|
||||
LSN_FORMAT_ARGS(*request_lsn), LSN_FORMAT_ARGS(*not_modified_since));
|
||||
/*
|
||||
* Get the last written LSN of this page.
|
||||
*/
|
||||
lsn = GetLastWrittenLSN(rinfo, forknum, blkno);
|
||||
lsn = nm_adjust_lsn(lsn);
|
||||
|
||||
neon_log(DEBUG1, "neon_get_request_lsn GetXLogReplayRecPtr %X/%X request lsn 0 ",
|
||||
(uint32) ((lsn) >> 32), (uint32) (lsn));
|
||||
}
|
||||
else
|
||||
{
|
||||
XLogRecPtr last_written_lsn;
|
||||
XLogRecPtr flushlsn;
|
||||
|
||||
/*
|
||||
* Use the latest LSN that was evicted from the buffer cache as the
|
||||
* 'not_modified_since' hint. Any pages modified by later WAL records
|
||||
* must still be in the buffer cache, so our request cannot concern
|
||||
* those.
|
||||
* Use the latest LSN that was evicted from the buffer cache. Any
|
||||
* pages modified by later WAL records must still be in the buffer cache,
|
||||
* so our request cannot concern those.
|
||||
*/
|
||||
last_written_lsn = GetLastWrittenLSN(rinfo, forknum, blkno);
|
||||
Assert(last_written_lsn != InvalidXLogRecPtr);
|
||||
*latest = true;
|
||||
lsn = GetLastWrittenLSN(rinfo, forknum, blkno);
|
||||
Assert(lsn != InvalidXLogRecPtr);
|
||||
neon_log(DEBUG1, "neon_get_request_lsn GetLastWrittenLSN lsn %X/%X ",
|
||||
LSN_FORMAT_ARGS(last_written_lsn));
|
||||
(uint32) ((lsn) >> 32), (uint32) (lsn));
|
||||
|
||||
last_written_lsn = nm_adjust_lsn(last_written_lsn);
|
||||
lsn = nm_adjust_lsn(lsn);
|
||||
|
||||
/*
|
||||
* Is it possible that the last-written LSN is ahead of last flush
|
||||
@@ -1600,25 +1583,16 @@ neon_get_request_lsn(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
|
||||
#else
|
||||
flushlsn = GetFlushRecPtr();
|
||||
#endif
|
||||
if (last_written_lsn > flushlsn)
|
||||
if (lsn > flushlsn)
|
||||
{
|
||||
neon_log(DEBUG5, "last-written LSN %X/%X is ahead of last flushed LSN %X/%X",
|
||||
LSN_FORMAT_ARGS(last_written_lsn),
|
||||
LSN_FORMAT_ARGS(flushlsn));
|
||||
XLogFlush(last_written_lsn);
|
||||
flushlsn = last_written_lsn;
|
||||
(uint32) (lsn >> 32), (uint32) lsn,
|
||||
(uint32) (flushlsn >> 32), (uint32) flushlsn);
|
||||
XLogFlush(lsn);
|
||||
}
|
||||
|
||||
/*
|
||||
* Request the latest version of the page. The most up-to-date request
|
||||
* LSN we could use would be the current insert LSN, but to avoid the
|
||||
* overhead of looking it up, use 'flushlsn' instead. This relies on the
|
||||
* assumption that if the page was modified since the last WAL flush, it
|
||||
* should still be in the buffer cache, and we wouldn't be requesting it.
|
||||
*/
|
||||
*request_lsn = flushlsn;
|
||||
*not_modified_since = last_written_lsn;
|
||||
}
|
||||
|
||||
return lsn;
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -1630,8 +1604,8 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum)
|
||||
bool exists;
|
||||
NeonResponse *resp;
|
||||
BlockNumber n_blocks;
|
||||
bool latest;
|
||||
XLogRecPtr request_lsn;
|
||||
XLogRecPtr not_modified_since;
|
||||
|
||||
switch (reln->smgr_relpersistence)
|
||||
{
|
||||
@@ -1686,13 +1660,12 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum)
|
||||
return false;
|
||||
}
|
||||
|
||||
neon_get_request_lsn(InfoFromSMgrRel(reln), forkNum, REL_METADATA_PSEUDO_BLOCKNO,
|
||||
&request_lsn, &not_modified_since);
|
||||
request_lsn = neon_get_request_lsn(&latest, InfoFromSMgrRel(reln), forkNum, REL_METADATA_PSEUDO_BLOCKNO);
|
||||
{
|
||||
NeonExistsRequest request = {
|
||||
.req.tag = T_NeonExistsRequest,
|
||||
.req.latest = latest,
|
||||
.req.lsn = request_lsn,
|
||||
.req.not_modified_since = not_modified_since,
|
||||
.rinfo = InfoFromSMgrRel(reln),
|
||||
.forknum = forkNum};
|
||||
|
||||
@@ -2129,10 +2102,10 @@ neon_writeback(SMgrRelation reln, ForkNumber forknum,
|
||||
void
|
||||
#if PG_MAJORVERSION_NUM < 16
|
||||
neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
|
||||
XLogRecPtr request_lsn, XLogRecPtr not_modified_since, char *buffer)
|
||||
XLogRecPtr request_lsn, bool request_latest, char *buffer)
|
||||
#else
|
||||
neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
|
||||
XLogRecPtr request_lsn, XLogRecPtr not_modified_since, void *buffer)
|
||||
XLogRecPtr request_lsn, bool request_latest, void *buffer)
|
||||
#endif
|
||||
{
|
||||
NeonResponse *resp;
|
||||
@@ -2175,28 +2148,15 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
|
||||
if (entry != NULL)
|
||||
{
|
||||
slot = entry->slot;
|
||||
/*
|
||||
* The not_modified_since..request_lsn range of each request is
|
||||
* effectively a claim that the page has not been modified between
|
||||
* those LSNs. Therefore, if the range of the old request in the queue
|
||||
* overlaps with the new request, we know that the page hasn't
|
||||
* been modified in the union of the ranges. We can reuse the old
|
||||
* request in that case.
|
||||
*
|
||||
* The new request's LSN should never be older than the old one,
|
||||
* so don't bother checking that case.
|
||||
*/
|
||||
if (request_lsn >= slot->not_modified_since &&
|
||||
not_modified_since <= slot->request_lsn)
|
||||
if (slot->effective_request_lsn >= request_lsn)
|
||||
{
|
||||
ring_index = slot->my_ring_index;
|
||||
pgBufferUsage.prefetch.hits += 1;
|
||||
}
|
||||
else
|
||||
else /* the current prefetch LSN is not large
|
||||
* enough, so drop the prefetch */
|
||||
{
|
||||
/*
|
||||
* Cannot use this prefetch, discard it
|
||||
*
|
||||
* We can't drop cache for not-yet-received requested items. It is
|
||||
* unlikely this happens, but it can happen if prefetch distance
|
||||
* is large enough and a backend didn't consume all prefetch
|
||||
@@ -2221,8 +2181,8 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
|
||||
{
|
||||
pgBufferUsage.prefetch.misses += 1;
|
||||
|
||||
ring_index = prefetch_register_buffer(buftag, &request_lsn,
|
||||
&not_modified_since);
|
||||
ring_index = prefetch_register_buffer(buftag, &request_latest,
|
||||
&request_lsn);
|
||||
slot = GetPrfSlot(ring_index);
|
||||
}
|
||||
else
|
||||
@@ -2286,8 +2246,8 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, char *buffer
|
||||
neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer)
|
||||
#endif
|
||||
{
|
||||
bool latest;
|
||||
XLogRecPtr request_lsn;
|
||||
XLogRecPtr not_modified_since;
|
||||
|
||||
switch (reln->smgr_relpersistence)
|
||||
{
|
||||
@@ -2312,9 +2272,8 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
|
||||
return;
|
||||
}
|
||||
|
||||
neon_get_request_lsn(InfoFromSMgrRel(reln), forkNum, blkno,
|
||||
&request_lsn, &not_modified_since);
|
||||
neon_read_at_lsn(InfoFromSMgrRel(reln), forkNum, blkno, request_lsn, not_modified_since, buffer);
|
||||
request_lsn = neon_get_request_lsn(&latest, InfoFromSMgrRel(reln), forkNum, blkno);
|
||||
neon_read_at_lsn(InfoFromSMgrRel(reln), forkNum, blkno, request_lsn, latest, buffer);
|
||||
|
||||
#ifdef DEBUG_COMPARE_LOCAL
|
||||
if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln))
|
||||
@@ -2483,8 +2442,8 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
|
||||
{
|
||||
NeonResponse *resp;
|
||||
BlockNumber n_blocks;
|
||||
bool latest;
|
||||
XLogRecPtr request_lsn;
|
||||
XLogRecPtr not_modified_since;
|
||||
|
||||
switch (reln->smgr_relpersistence)
|
||||
{
|
||||
@@ -2511,13 +2470,12 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
|
||||
return n_blocks;
|
||||
}
|
||||
|
||||
neon_get_request_lsn(InfoFromSMgrRel(reln), forknum, REL_METADATA_PSEUDO_BLOCKNO,
|
||||
&request_lsn, &not_modified_since);
|
||||
request_lsn = neon_get_request_lsn(&latest, InfoFromSMgrRel(reln), forknum, REL_METADATA_PSEUDO_BLOCKNO);
|
||||
{
|
||||
NeonNblocksRequest request = {
|
||||
.req.tag = T_NeonNblocksRequest,
|
||||
.req.latest = latest,
|
||||
.req.lsn = request_lsn,
|
||||
.req.not_modified_since = not_modified_since,
|
||||
.rinfo = InfoFromSMgrRel(reln),
|
||||
.forknum = forknum,
|
||||
};
|
||||
@@ -2565,17 +2523,16 @@ neon_dbsize(Oid dbNode)
|
||||
{
|
||||
NeonResponse *resp;
|
||||
int64 db_size;
|
||||
XLogRecPtr request_lsn,
|
||||
not_modified_since;
|
||||
XLogRecPtr request_lsn;
|
||||
bool latest;
|
||||
NRelFileInfo dummy_node = {0};
|
||||
|
||||
neon_get_request_lsn(dummy_node, MAIN_FORKNUM, REL_METADATA_PSEUDO_BLOCKNO,
|
||||
&request_lsn, &not_modified_since);
|
||||
request_lsn = neon_get_request_lsn(&latest, dummy_node, MAIN_FORKNUM, REL_METADATA_PSEUDO_BLOCKNO);
|
||||
{
|
||||
NeonDbSizeRequest request = {
|
||||
.req.tag = T_NeonDbSizeRequest,
|
||||
.req.latest = latest,
|
||||
.req.lsn = request_lsn,
|
||||
.req.not_modified_since = not_modified_since,
|
||||
.dbNode = dbNode,
|
||||
};
|
||||
|
||||
@@ -2648,6 +2605,7 @@ neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
|
||||
* the most recently inserted WAL record's LSN.
|
||||
*/
|
||||
lsn = GetXLogInsertRecPtr();
|
||||
|
||||
lsn = nm_adjust_lsn(lsn);
|
||||
|
||||
/*
|
||||
@@ -2847,23 +2805,14 @@ neon_end_unlogged_build(SMgrRelation reln)
|
||||
static int
|
||||
neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buffer)
|
||||
{
|
||||
XLogRecPtr request_lsn,
|
||||
not_modified_since;
|
||||
|
||||
if (RecoveryInProgress())
|
||||
request_lsn = GetXLogReplayRecPtr(NULL);
|
||||
else
|
||||
request_lsn = GetXLogInsertRecPtr();
|
||||
request_lsn = nm_adjust_lsn(request_lsn);
|
||||
|
||||
XLogRecPtr request_lsn;
|
||||
/*
|
||||
* GetRedoStartLsn() returns LSN of basebackup. We know that the SLRU
|
||||
* segment has not changed since the basebackup, because in order to
|
||||
* modify it, we would have had to download it already. And once
|
||||
* downloaded, we never evict SLRU segments from local disk.
|
||||
* GetRedoStartLsn() returns LSN of basebackup.
|
||||
* We need to download SLRU segments only once after node startup,
|
||||
* then SLRUs are maintained locally.
|
||||
*/
|
||||
not_modified_since = GetRedoStartLsn();
|
||||
|
||||
request_lsn = GetRedoStartLsn();
|
||||
request_lsn = nm_adjust_lsn(request_lsn);
|
||||
SlruKind kind;
|
||||
|
||||
if (STRPREFIX(path, "pg_xact"))
|
||||
@@ -2878,8 +2827,8 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf
|
||||
NeonResponse *resp;
|
||||
NeonGetSlruSegmentRequest request = {
|
||||
.req.tag = T_NeonGetSlruSegmentRequest,
|
||||
.req.latest = false,
|
||||
.req.lsn = request_lsn,
|
||||
.req.not_modified_since = not_modified_since,
|
||||
|
||||
.kind = kind,
|
||||
.segno = segno
|
||||
@@ -3007,9 +2956,6 @@ neon_extend_rel_size(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
|
||||
{
|
||||
BlockNumber relsize;
|
||||
|
||||
/* This is only used in WAL replay */
|
||||
Assert(RecoveryInProgress());
|
||||
|
||||
/* Extend the relation if we know its size */
|
||||
if (get_cached_relsize(rinfo, forknum, &relsize))
|
||||
{
|
||||
@@ -3028,12 +2974,13 @@ neon_extend_rel_size(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
|
||||
* This length is later reused when we open the smgr to read the
|
||||
* block, which is fine and expected.
|
||||
*/
|
||||
|
||||
NeonResponse *response;
|
||||
NeonNblocksResponse *nbresponse;
|
||||
NeonNblocksRequest request = {
|
||||
.req = (NeonRequest) {
|
||||
.lsn = end_recptr,
|
||||
.not_modified_since = end_recptr,
|
||||
.latest = false,
|
||||
.tag = T_NeonNblocksRequest,
|
||||
},
|
||||
.rinfo = rinfo,
|
||||
|
||||
@@ -48,10 +48,10 @@ PG_FUNCTION_INFO_V1(neon_xlogflush);
|
||||
*/
|
||||
#if PG_MAJORVERSION_NUM < 16
|
||||
typedef void (*neon_read_at_lsn_type) (NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
|
||||
XLogRecPtr request_lsn, XLogRecPtr not_modified_since, char *buffer);
|
||||
XLogRecPtr request_lsn, bool request_latest, char *buffer);
|
||||
#else
|
||||
typedef void (*neon_read_at_lsn_type) (NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
|
||||
XLogRecPtr request_lsn, XLogRecPtr not_modified_since, void *buffer);
|
||||
XLogRecPtr request_lsn, bool request_latest, void *buffer);
|
||||
#endif
|
||||
|
||||
static neon_read_at_lsn_type neon_read_at_lsn_ptr;
|
||||
|
||||
156
poetry.lock
generated
156
poetry.lock
generated
@@ -2,87 +2,87 @@
|
||||
|
||||
[[package]]
|
||||
name = "aiohttp"
|
||||
version = "3.9.2"
|
||||
version = "3.9.4"
|
||||
description = "Async http client/server framework (asyncio)"
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
files = [
|
||||
{file = "aiohttp-3.9.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:772fbe371788e61c58d6d3d904268e48a594ba866804d08c995ad71b144f94cb"},
|
||||
{file = "aiohttp-3.9.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:edd4f1af2253f227ae311ab3d403d0c506c9b4410c7fc8d9573dec6d9740369f"},
|
||||
{file = "aiohttp-3.9.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:cfee9287778399fdef6f8a11c9e425e1cb13cc9920fd3a3df8f122500978292b"},
|
||||
{file = "aiohttp-3.9.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3cc158466f6a980a6095ee55174d1de5730ad7dec251be655d9a6a9dd7ea1ff9"},
|
||||
{file = "aiohttp-3.9.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:54ec82f45d57c9a65a1ead3953b51c704f9587440e6682f689da97f3e8defa35"},
|
||||
{file = "aiohttp-3.9.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:abeb813a18eb387f0d835ef51f88568540ad0325807a77a6e501fed4610f864e"},
|
||||
{file = "aiohttp-3.9.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cc91d07280d7d169f3a0f9179d8babd0ee05c79d4d891447629ff0d7d8089ec2"},
|
||||
{file = "aiohttp-3.9.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b65e861f4bebfb660f7f0f40fa3eb9f2ab9af10647d05dac824390e7af8f75b7"},
|
||||
{file = "aiohttp-3.9.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:04fd8ffd2be73d42bcf55fd78cde7958eeee6d4d8f73c3846b7cba491ecdb570"},
|
||||
{file = "aiohttp-3.9.2-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:3d8d962b439a859b3ded9a1e111a4615357b01620a546bc601f25b0211f2da81"},
|
||||
{file = "aiohttp-3.9.2-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:8ceb658afd12b27552597cf9a65d9807d58aef45adbb58616cdd5ad4c258c39e"},
|
||||
{file = "aiohttp-3.9.2-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:0e4ee4df741670560b1bc393672035418bf9063718fee05e1796bf867e995fad"},
|
||||
{file = "aiohttp-3.9.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:2dec87a556f300d3211decf018bfd263424f0690fcca00de94a837949fbcea02"},
|
||||
{file = "aiohttp-3.9.2-cp310-cp310-win32.whl", hash = "sha256:3e1a800f988ce7c4917f34096f81585a73dbf65b5c39618b37926b1238cf9bc4"},
|
||||
{file = "aiohttp-3.9.2-cp310-cp310-win_amd64.whl", hash = "sha256:ea510718a41b95c236c992b89fdfc3d04cc7ca60281f93aaada497c2b4e05c46"},
|
||||
{file = "aiohttp-3.9.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:6aaa6f99256dd1b5756a50891a20f0d252bd7bdb0854c5d440edab4495c9f973"},
|
||||
{file = "aiohttp-3.9.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:a27d8c70ad87bcfce2e97488652075a9bdd5b70093f50b10ae051dfe5e6baf37"},
|
||||
{file = "aiohttp-3.9.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:54287bcb74d21715ac8382e9de146d9442b5f133d9babb7e5d9e453faadd005e"},
|
||||
{file = "aiohttp-3.9.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5bb3d05569aa83011fcb346b5266e00b04180105fcacc63743fc2e4a1862a891"},
|
||||
{file = "aiohttp-3.9.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c8534e7d69bb8e8d134fe2be9890d1b863518582f30c9874ed7ed12e48abe3c4"},
|
||||
{file = "aiohttp-3.9.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4bd9d5b989d57b41e4ff56ab250c5ddf259f32db17159cce630fd543376bd96b"},
|
||||
{file = "aiohttp-3.9.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fa6904088e6642609981f919ba775838ebf7df7fe64998b1a954fb411ffb4663"},
|
||||
{file = "aiohttp-3.9.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bda42eb410be91b349fb4ee3a23a30ee301c391e503996a638d05659d76ea4c2"},
|
||||
{file = "aiohttp-3.9.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:193cc1ccd69d819562cc7f345c815a6fc51d223b2ef22f23c1a0f67a88de9a72"},
|
||||
{file = "aiohttp-3.9.2-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:b9f1cb839b621f84a5b006848e336cf1496688059d2408e617af33e3470ba204"},
|
||||
{file = "aiohttp-3.9.2-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:d22a0931848b8c7a023c695fa2057c6aaac19085f257d48baa24455e67df97ec"},
|
||||
{file = "aiohttp-3.9.2-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:4112d8ba61fbd0abd5d43a9cb312214565b446d926e282a6d7da3f5a5aa71d36"},
|
||||
{file = "aiohttp-3.9.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:c4ad4241b52bb2eb7a4d2bde060d31c2b255b8c6597dd8deac2f039168d14fd7"},
|
||||
{file = "aiohttp-3.9.2-cp311-cp311-win32.whl", hash = "sha256:ee2661a3f5b529f4fc8a8ffee9f736ae054adfb353a0d2f78218be90617194b3"},
|
||||
{file = "aiohttp-3.9.2-cp311-cp311-win_amd64.whl", hash = "sha256:4deae2c165a5db1ed97df2868ef31ca3cc999988812e82386d22937d9d6fed52"},
|
||||
{file = "aiohttp-3.9.2-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:6f4cdba12539215aaecf3c310ce9d067b0081a0795dd8a8805fdb67a65c0572a"},
|
||||
{file = "aiohttp-3.9.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:84e843b33d5460a5c501c05539809ff3aee07436296ff9fbc4d327e32aa3a326"},
|
||||
{file = "aiohttp-3.9.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8008d0f451d66140a5aa1c17e3eedc9d56e14207568cd42072c9d6b92bf19b52"},
|
||||
{file = "aiohttp-3.9.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:61c47ab8ef629793c086378b1df93d18438612d3ed60dca76c3422f4fbafa792"},
|
||||
{file = "aiohttp-3.9.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bc71f748e12284312f140eaa6599a520389273174b42c345d13c7e07792f4f57"},
|
||||
{file = "aiohttp-3.9.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a1c3a4d0ab2f75f22ec80bca62385db2e8810ee12efa8c9e92efea45c1849133"},
|
||||
{file = "aiohttp-3.9.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9a87aa0b13bbee025faa59fa58861303c2b064b9855d4c0e45ec70182bbeba1b"},
|
||||
{file = "aiohttp-3.9.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e2cc0d04688b9f4a7854c56c18aa7af9e5b0a87a28f934e2e596ba7e14783192"},
|
||||
{file = "aiohttp-3.9.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:1956e3ac376b1711c1533266dec4efd485f821d84c13ce1217d53e42c9e65f08"},
|
||||
{file = "aiohttp-3.9.2-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:114da29f39eccd71b93a0fcacff178749a5c3559009b4a4498c2c173a6d74dff"},
|
||||
{file = "aiohttp-3.9.2-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:3f17999ae3927d8a9a823a1283b201344a0627272f92d4f3e3a4efe276972fe8"},
|
||||
{file = "aiohttp-3.9.2-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:f31df6a32217a34ae2f813b152a6f348154f948c83213b690e59d9e84020925c"},
|
||||
{file = "aiohttp-3.9.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:7a75307ffe31329928a8d47eae0692192327c599113d41b278d4c12b54e1bd11"},
|
||||
{file = "aiohttp-3.9.2-cp312-cp312-win32.whl", hash = "sha256:972b63d589ff8f305463593050a31b5ce91638918da38139b9d8deaba9e0fed7"},
|
||||
{file = "aiohttp-3.9.2-cp312-cp312-win_amd64.whl", hash = "sha256:200dc0246f0cb5405c80d18ac905c8350179c063ea1587580e3335bfc243ba6a"},
|
||||
{file = "aiohttp-3.9.2-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:158564d0d1020e0d3fe919a81d97aadad35171e13e7b425b244ad4337fc6793a"},
|
||||
{file = "aiohttp-3.9.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:da1346cd0ccb395f0ed16b113ebb626fa43b7b07fd7344fce33e7a4f04a8897a"},
|
||||
{file = "aiohttp-3.9.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:eaa9256de26ea0334ffa25f1913ae15a51e35c529a1ed9af8e6286dd44312554"},
|
||||
{file = "aiohttp-3.9.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1543e7fb00214fb4ccead42e6a7d86f3bb7c34751ec7c605cca7388e525fd0b4"},
|
||||
{file = "aiohttp-3.9.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:186e94570433a004e05f31f632726ae0f2c9dee4762a9ce915769ce9c0a23d89"},
|
||||
{file = "aiohttp-3.9.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d52d20832ac1560f4510d68e7ba8befbc801a2b77df12bd0cd2bcf3b049e52a4"},
|
||||
{file = "aiohttp-3.9.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1c45e4e815ac6af3b72ca2bde9b608d2571737bb1e2d42299fc1ffdf60f6f9a1"},
|
||||
{file = "aiohttp-3.9.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:aa906b9bdfd4a7972dd0628dbbd6413d2062df5b431194486a78f0d2ae87bd55"},
|
||||
{file = "aiohttp-3.9.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:68bbee9e17d66f17bb0010aa15a22c6eb28583edcc8b3212e2b8e3f77f3ebe2a"},
|
||||
{file = "aiohttp-3.9.2-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:4c189b64bd6d9a403a1a3f86a3ab3acbc3dc41a68f73a268a4f683f89a4dec1f"},
|
||||
{file = "aiohttp-3.9.2-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:8a7876f794523123bca6d44bfecd89c9fec9ec897a25f3dd202ee7fc5c6525b7"},
|
||||
{file = "aiohttp-3.9.2-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:d23fba734e3dd7b1d679b9473129cd52e4ec0e65a4512b488981a56420e708db"},
|
||||
{file = "aiohttp-3.9.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:b141753be581fab842a25cb319f79536d19c2a51995d7d8b29ee290169868eab"},
|
||||
{file = "aiohttp-3.9.2-cp38-cp38-win32.whl", hash = "sha256:103daf41ff3b53ba6fa09ad410793e2e76c9d0269151812e5aba4b9dd674a7e8"},
|
||||
{file = "aiohttp-3.9.2-cp38-cp38-win_amd64.whl", hash = "sha256:328918a6c2835861ff7afa8c6d2c70c35fdaf996205d5932351bdd952f33fa2f"},
|
||||
{file = "aiohttp-3.9.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:5264d7327c9464786f74e4ec9342afbbb6ee70dfbb2ec9e3dfce7a54c8043aa3"},
|
||||
{file = "aiohttp-3.9.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:07205ae0015e05c78b3288c1517afa000823a678a41594b3fdc870878d645305"},
|
||||
{file = "aiohttp-3.9.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ae0a1e638cffc3ec4d4784b8b4fd1cf28968febc4bd2718ffa25b99b96a741bd"},
|
||||
{file = "aiohttp-3.9.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d43302a30ba1166325974858e6ef31727a23bdd12db40e725bec0f759abce505"},
|
||||
{file = "aiohttp-3.9.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:16a967685907003765855999af11a79b24e70b34dc710f77a38d21cd9fc4f5fe"},
|
||||
{file = "aiohttp-3.9.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6fa3ee92cd441d5c2d07ca88d7a9cef50f7ec975f0117cd0c62018022a184308"},
|
||||
{file = "aiohttp-3.9.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b500c5ad9c07639d48615a770f49618130e61be36608fc9bc2d9bae31732b8f"},
|
||||
{file = "aiohttp-3.9.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c07327b368745b1ce2393ae9e1aafed7073d9199e1dcba14e035cc646c7941bf"},
|
||||
{file = "aiohttp-3.9.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:cc7d6502c23a0ec109687bf31909b3fb7b196faf198f8cff68c81b49eb316ea9"},
|
||||
{file = "aiohttp-3.9.2-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:07be2be7071723c3509ab5c08108d3a74f2181d4964e869f2504aaab68f8d3e8"},
|
||||
{file = "aiohttp-3.9.2-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:122468f6fee5fcbe67cb07014a08c195b3d4c41ff71e7b5160a7bcc41d585a5f"},
|
||||
{file = "aiohttp-3.9.2-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:00a9abcea793c81e7f8778ca195a1714a64f6d7436c4c0bb168ad2a212627000"},
|
||||
{file = "aiohttp-3.9.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:7a9825fdd64ecac5c670234d80bb52bdcaa4139d1f839165f548208b3779c6c6"},
|
||||
{file = "aiohttp-3.9.2-cp39-cp39-win32.whl", hash = "sha256:5422cd9a4a00f24c7244e1b15aa9b87935c85fb6a00c8ac9b2527b38627a9211"},
|
||||
{file = "aiohttp-3.9.2-cp39-cp39-win_amd64.whl", hash = "sha256:7d579dcd5d82a86a46f725458418458fa43686f6a7b252f2966d359033ffc8ab"},
|
||||
{file = "aiohttp-3.9.2.tar.gz", hash = "sha256:b0ad0a5e86ce73f5368a164c10ada10504bf91869c05ab75d982c6048217fbf7"},
|
||||
{file = "aiohttp-3.9.4-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:76d32588ef7e4a3f3adff1956a0ba96faabbdee58f2407c122dd45aa6e34f372"},
|
||||
{file = "aiohttp-3.9.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:56181093c10dbc6ceb8a29dfeea1e815e1dfdc020169203d87fd8d37616f73f9"},
|
||||
{file = "aiohttp-3.9.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c7a5b676d3c65e88b3aca41816bf72831898fcd73f0cbb2680e9d88e819d1e4d"},
|
||||
{file = "aiohttp-3.9.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d1df528a85fb404899d4207a8d9934cfd6be626e30e5d3a5544a83dbae6d8a7e"},
|
||||
{file = "aiohttp-3.9.4-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f595db1bceabd71c82e92df212dd9525a8a2c6947d39e3c994c4f27d2fe15b11"},
|
||||
{file = "aiohttp-3.9.4-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9c0b09d76e5a4caac3d27752027fbd43dc987b95f3748fad2b924a03fe8632ad"},
|
||||
{file = "aiohttp-3.9.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:689eb4356649ec9535b3686200b231876fb4cab4aca54e3bece71d37f50c1d13"},
|
||||
{file = "aiohttp-3.9.4-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a3666cf4182efdb44d73602379a66f5fdfd5da0db5e4520f0ac0dcca644a3497"},
|
||||
{file = "aiohttp-3.9.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:b65b0f8747b013570eea2f75726046fa54fa8e0c5db60f3b98dd5d161052004a"},
|
||||
{file = "aiohttp-3.9.4-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:a1885d2470955f70dfdd33a02e1749613c5a9c5ab855f6db38e0b9389453dce7"},
|
||||
{file = "aiohttp-3.9.4-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:0593822dcdb9483d41f12041ff7c90d4d1033ec0e880bcfaf102919b715f47f1"},
|
||||
{file = "aiohttp-3.9.4-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:47f6eb74e1ecb5e19a78f4a4228aa24df7fbab3b62d4a625d3f41194a08bd54f"},
|
||||
{file = "aiohttp-3.9.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:c8b04a3dbd54de6ccb7604242fe3ad67f2f3ca558f2d33fe19d4b08d90701a89"},
|
||||
{file = "aiohttp-3.9.4-cp310-cp310-win32.whl", hash = "sha256:8a78dfb198a328bfb38e4308ca8167028920fb747ddcf086ce706fbdd23b2926"},
|
||||
{file = "aiohttp-3.9.4-cp310-cp310-win_amd64.whl", hash = "sha256:e78da6b55275987cbc89141a1d8e75f5070e577c482dd48bd9123a76a96f0bbb"},
|
||||
{file = "aiohttp-3.9.4-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:c111b3c69060d2bafc446917534150fd049e7aedd6cbf21ba526a5a97b4402a5"},
|
||||
{file = "aiohttp-3.9.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:efbdd51872cf170093998c87ccdf3cb5993add3559341a8e5708bcb311934c94"},
|
||||
{file = "aiohttp-3.9.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7bfdb41dc6e85d8535b00d73947548a748e9534e8e4fddd2638109ff3fb081df"},
|
||||
{file = "aiohttp-3.9.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2bd9d334412961125e9f68d5b73c1d0ab9ea3f74a58a475e6b119f5293eee7ba"},
|
||||
{file = "aiohttp-3.9.4-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:35d78076736f4a668d57ade00c65d30a8ce28719d8a42471b2a06ccd1a2e3063"},
|
||||
{file = "aiohttp-3.9.4-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:824dff4f9f4d0f59d0fa3577932ee9a20e09edec8a2f813e1d6b9f89ced8293f"},
|
||||
{file = "aiohttp-3.9.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:52b8b4e06fc15519019e128abedaeb56412b106ab88b3c452188ca47a25c4093"},
|
||||
{file = "aiohttp-3.9.4-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:eae569fb1e7559d4f3919965617bb39f9e753967fae55ce13454bec2d1c54f09"},
|
||||
{file = "aiohttp-3.9.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:69b97aa5792428f321f72aeb2f118e56893371f27e0b7d05750bcad06fc42ca1"},
|
||||
{file = "aiohttp-3.9.4-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:4d79aad0ad4b980663316f26d9a492e8fab2af77c69c0f33780a56843ad2f89e"},
|
||||
{file = "aiohttp-3.9.4-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:d6577140cd7db19e430661e4b2653680194ea8c22c994bc65b7a19d8ec834403"},
|
||||
{file = "aiohttp-3.9.4-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:9860d455847cd98eb67897f5957b7cd69fbcb436dd3f06099230f16a66e66f79"},
|
||||
{file = "aiohttp-3.9.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:69ff36d3f8f5652994e08bd22f093e11cfd0444cea310f92e01b45a4e46b624e"},
|
||||
{file = "aiohttp-3.9.4-cp311-cp311-win32.whl", hash = "sha256:e27d3b5ed2c2013bce66ad67ee57cbf614288bda8cdf426c8d8fe548316f1b5f"},
|
||||
{file = "aiohttp-3.9.4-cp311-cp311-win_amd64.whl", hash = "sha256:d6a67e26daa686a6fbdb600a9af8619c80a332556245fa8e86c747d226ab1a1e"},
|
||||
{file = "aiohttp-3.9.4-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:c5ff8ff44825736a4065d8544b43b43ee4c6dd1530f3a08e6c0578a813b0aa35"},
|
||||
{file = "aiohttp-3.9.4-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:d12a244627eba4e9dc52cbf924edef905ddd6cafc6513849b4876076a6f38b0e"},
|
||||
{file = "aiohttp-3.9.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:dcad56c8d8348e7e468899d2fb3b309b9bc59d94e6db08710555f7436156097f"},
|
||||
{file = "aiohttp-3.9.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4f7e69a7fd4b5ce419238388e55abd220336bd32212c673ceabc57ccf3d05b55"},
|
||||
{file = "aiohttp-3.9.4-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c4870cb049f10d7680c239b55428916d84158798eb8f353e74fa2c98980dcc0b"},
|
||||
{file = "aiohttp-3.9.4-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3b2feaf1b7031ede1bc0880cec4b0776fd347259a723d625357bb4b82f62687b"},
|
||||
{file = "aiohttp-3.9.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:939393e8c3f0a5bcd33ef7ace67680c318dc2ae406f15e381c0054dd658397de"},
|
||||
{file = "aiohttp-3.9.4-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7d2334e387b2adcc944680bebcf412743f2caf4eeebd550f67249c1c3696be04"},
|
||||
{file = "aiohttp-3.9.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:e0198ea897680e480845ec0ffc5a14e8b694e25b3f104f63676d55bf76a82f1a"},
|
||||
{file = "aiohttp-3.9.4-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:e40d2cd22914d67c84824045861a5bb0fb46586b15dfe4f046c7495bf08306b2"},
|
||||
{file = "aiohttp-3.9.4-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:aba80e77c227f4234aa34a5ff2b6ff30c5d6a827a91d22ff6b999de9175d71bd"},
|
||||
{file = "aiohttp-3.9.4-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:fb68dc73bc8ac322d2e392a59a9e396c4f35cb6fdbdd749e139d1d6c985f2527"},
|
||||
{file = "aiohttp-3.9.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:f3460a92638dce7e47062cf088d6e7663adb135e936cb117be88d5e6c48c9d53"},
|
||||
{file = "aiohttp-3.9.4-cp312-cp312-win32.whl", hash = "sha256:32dc814ddbb254f6170bca198fe307920f6c1308a5492f049f7f63554b88ef36"},
|
||||
{file = "aiohttp-3.9.4-cp312-cp312-win_amd64.whl", hash = "sha256:63f41a909d182d2b78fe3abef557fcc14da50c7852f70ae3be60e83ff64edba5"},
|
||||
{file = "aiohttp-3.9.4-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:c3770365675f6be220032f6609a8fbad994d6dcf3ef7dbcf295c7ee70884c9af"},
|
||||
{file = "aiohttp-3.9.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:305edae1dea368ce09bcb858cf5a63a064f3bff4767dec6fa60a0cc0e805a1d3"},
|
||||
{file = "aiohttp-3.9.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:6f121900131d116e4a93b55ab0d12ad72573f967b100e49086e496a9b24523ea"},
|
||||
{file = "aiohttp-3.9.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b71e614c1ae35c3d62a293b19eface83d5e4d194e3eb2fabb10059d33e6e8cbf"},
|
||||
{file = "aiohttp-3.9.4-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:419f009fa4cfde4d16a7fc070d64f36d70a8d35a90d71aa27670bba2be4fd039"},
|
||||
{file = "aiohttp-3.9.4-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7b39476ee69cfe64061fd77a73bf692c40021f8547cda617a3466530ef63f947"},
|
||||
{file = "aiohttp-3.9.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b33f34c9c7decdb2ab99c74be6443942b730b56d9c5ee48fb7df2c86492f293c"},
|
||||
{file = "aiohttp-3.9.4-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c78700130ce2dcebb1a8103202ae795be2fa8c9351d0dd22338fe3dac74847d9"},
|
||||
{file = "aiohttp-3.9.4-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:268ba22d917655d1259af2d5659072b7dc11b4e1dc2cb9662fdd867d75afc6a4"},
|
||||
{file = "aiohttp-3.9.4-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:17e7c051f53a0d2ebf33013a9cbf020bb4e098c4bc5bce6f7b0c962108d97eab"},
|
||||
{file = "aiohttp-3.9.4-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:7be99f4abb008cb38e144f85f515598f4c2c8932bf11b65add0ff59c9c876d99"},
|
||||
{file = "aiohttp-3.9.4-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:d58a54d6ff08d2547656356eea8572b224e6f9bbc0cf55fa9966bcaac4ddfb10"},
|
||||
{file = "aiohttp-3.9.4-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:7673a76772bda15d0d10d1aa881b7911d0580c980dbd16e59d7ba1422b2d83cd"},
|
||||
{file = "aiohttp-3.9.4-cp38-cp38-win32.whl", hash = "sha256:e4370dda04dc8951012f30e1ce7956a0a226ac0714a7b6c389fb2f43f22a250e"},
|
||||
{file = "aiohttp-3.9.4-cp38-cp38-win_amd64.whl", hash = "sha256:eb30c4510a691bb87081192a394fb661860e75ca3896c01c6d186febe7c88530"},
|
||||
{file = "aiohttp-3.9.4-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:84e90494db7df3be5e056f91412f9fa9e611fbe8ce4aaef70647297f5943b276"},
|
||||
{file = "aiohttp-3.9.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:7d4845f8501ab28ebfdbeab980a50a273b415cf69e96e4e674d43d86a464df9d"},
|
||||
{file = "aiohttp-3.9.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:69046cd9a2a17245c4ce3c1f1a4ff8c70c7701ef222fce3d1d8435f09042bba1"},
|
||||
{file = "aiohttp-3.9.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8b73a06bafc8dcc508420db43b4dd5850e41e69de99009d0351c4f3007960019"},
|
||||
{file = "aiohttp-3.9.4-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:418bb0038dfafeac923823c2e63226179976c76f981a2aaad0ad5d51f2229bca"},
|
||||
{file = "aiohttp-3.9.4-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:71a8f241456b6c2668374d5d28398f8e8cdae4cce568aaea54e0f39359cd928d"},
|
||||
{file = "aiohttp-3.9.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:935c369bf8acc2dc26f6eeb5222768aa7c62917c3554f7215f2ead7386b33748"},
|
||||
{file = "aiohttp-3.9.4-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:74e4e48c8752d14ecfb36d2ebb3d76d614320570e14de0a3aa7a726ff150a03c"},
|
||||
{file = "aiohttp-3.9.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:916b0417aeddf2c8c61291238ce25286f391a6acb6f28005dd9ce282bd6311b6"},
|
||||
{file = "aiohttp-3.9.4-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:9b6787b6d0b3518b2ee4cbeadd24a507756ee703adbac1ab6dc7c4434b8c572a"},
|
||||
{file = "aiohttp-3.9.4-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:221204dbda5ef350e8db6287937621cf75e85778b296c9c52260b522231940ed"},
|
||||
{file = "aiohttp-3.9.4-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:10afd99b8251022ddf81eaed1d90f5a988e349ee7d779eb429fb07b670751e8c"},
|
||||
{file = "aiohttp-3.9.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:2506d9f7a9b91033201be9ffe7d89c6a54150b0578803cce5cb84a943d075bc3"},
|
||||
{file = "aiohttp-3.9.4-cp39-cp39-win32.whl", hash = "sha256:e571fdd9efd65e86c6af2f332e0e95dad259bfe6beb5d15b3c3eca3a6eb5d87b"},
|
||||
{file = "aiohttp-3.9.4-cp39-cp39-win_amd64.whl", hash = "sha256:7d29dd5319d20aa3b7749719ac9685fbd926f71ac8c77b2477272725f882072d"},
|
||||
{file = "aiohttp-3.9.4.tar.gz", hash = "sha256:6ff71ede6d9a5a58cfb7b6fffc83ab5d4a63138276c771ac91ceaaddf5459644"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
@@ -2900,4 +2900,4 @@ cffi = ["cffi (>=1.11)"]
|
||||
[metadata]
|
||||
lock-version = "2.0"
|
||||
python-versions = "^3.9"
|
||||
content-hash = "df7161da4fdc3cba0a445176fc9dda2a0e8a53e13a7aa8a864385ca259381b41"
|
||||
content-hash = "b3452b50901123fd5f2c385ce8a0c1c492296393b8a7926a322b6df0ea3ac572"
|
||||
|
||||
@@ -2,8 +2,15 @@ mod classic;
|
||||
mod hacks;
|
||||
mod link;
|
||||
|
||||
use std::net::IpAddr;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
use ipnet::{Ipv4Net, Ipv6Net};
|
||||
pub use link::LinkAuthError;
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
use tokio_postgres::config::AuthKeys;
|
||||
use tracing::{info, warn};
|
||||
|
||||
use crate::auth::credentials::check_peer_addr_is_in_list;
|
||||
use crate::auth::validate_password_and_exchange;
|
||||
@@ -16,6 +23,7 @@ use crate::intern::EndpointIdInt;
|
||||
use crate::metrics::Metrics;
|
||||
use crate::proxy::connect_compute::ComputeConnectBackend;
|
||||
use crate::proxy::NeonOptions;
|
||||
use crate::rate_limiter::{BucketRateLimiter, RateBucketInfo};
|
||||
use crate::stream::Stream;
|
||||
use crate::{
|
||||
auth::{self, ComputeUserInfoMaybeEndpoint},
|
||||
@@ -28,9 +36,6 @@ use crate::{
|
||||
stream, url,
|
||||
};
|
||||
use crate::{scram, EndpointCacheKey, EndpointId, Normalize, RoleName};
|
||||
use std::sync::Arc;
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
use tracing::{info, warn};
|
||||
|
||||
/// Alternative to [`std::borrow::Cow`] but doesn't need `T: ToOwned` as we don't need that functionality
|
||||
pub enum MaybeOwned<'a, T> {
|
||||
@@ -176,11 +181,45 @@ impl TryFrom<ComputeUserInfoMaybeEndpoint> for ComputeUserInfo {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(PartialEq, PartialOrd, Hash, Eq, Ord, Debug, Copy, Clone)]
|
||||
pub struct MaskedIp(IpAddr);
|
||||
|
||||
impl MaskedIp {
|
||||
fn new(value: IpAddr, prefix: u8) -> Self {
|
||||
match value {
|
||||
IpAddr::V4(v4) => Self(IpAddr::V4(
|
||||
Ipv4Net::new(v4, prefix).map_or(v4, |x| x.trunc().addr()),
|
||||
)),
|
||||
IpAddr::V6(v6) => Self(IpAddr::V6(
|
||||
Ipv6Net::new(v6, prefix).map_or(v6, |x| x.trunc().addr()),
|
||||
)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// This can't be just per IP because that would limit some PaaS that share IP addresses
|
||||
pub type AuthRateLimiter = BucketRateLimiter<(EndpointIdInt, MaskedIp)>;
|
||||
|
||||
impl RateBucketInfo {
|
||||
/// All of these are per endpoint-maskedip pair.
|
||||
/// Context: 4096 rounds of pbkdf2 take about 1ms of cpu time to execute (1 milli-cpu-second or 1mcpus).
|
||||
///
|
||||
/// First bucket: 1000mcpus total per endpoint-ip pair
|
||||
/// * 4096000 requests per second with 1 hash rounds.
|
||||
/// * 1000 requests per second with 4096 hash rounds.
|
||||
/// * 6.8 requests per second with 600000 hash rounds.
|
||||
pub const DEFAULT_AUTH_SET: [Self; 3] = [
|
||||
Self::new(1000 * 4096, Duration::from_secs(1)),
|
||||
Self::new(600 * 4096, Duration::from_secs(60)),
|
||||
Self::new(300 * 4096, Duration::from_secs(600)),
|
||||
];
|
||||
}
|
||||
|
||||
impl AuthenticationConfig {
|
||||
pub fn check_rate_limit(
|
||||
&self,
|
||||
|
||||
ctx: &mut RequestMonitoring,
|
||||
config: &AuthenticationConfig,
|
||||
secret: AuthSecret,
|
||||
endpoint: &EndpointId,
|
||||
is_cleartext: bool,
|
||||
@@ -201,9 +240,13 @@ impl AuthenticationConfig {
|
||||
1
|
||||
};
|
||||
|
||||
let limit_not_exceeded = self
|
||||
.rate_limiter
|
||||
.check((endpoint_int, ctx.peer_addr), password_weight);
|
||||
let limit_not_exceeded = self.rate_limiter.check(
|
||||
(
|
||||
endpoint_int,
|
||||
MaskedIp::new(ctx.peer_addr, config.rate_limit_ip_subnet),
|
||||
),
|
||||
password_weight,
|
||||
);
|
||||
|
||||
if !limit_not_exceeded {
|
||||
warn!(
|
||||
@@ -271,6 +314,7 @@ async fn auth_quirks(
|
||||
let secret = match secret {
|
||||
Some(secret) => config.check_rate_limit(
|
||||
ctx,
|
||||
config,
|
||||
secret,
|
||||
&info.endpoint,
|
||||
unauthenticated_password.is_some() || allow_cleartext,
|
||||
@@ -473,7 +517,7 @@ impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, &()> {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::sync::Arc;
|
||||
use std::{net::IpAddr, sync::Arc, time::Duration};
|
||||
|
||||
use bytes::BytesMut;
|
||||
use fallible_iterator::FallibleIterator;
|
||||
@@ -486,7 +530,7 @@ mod tests {
|
||||
use tokio::io::{AsyncRead, AsyncReadExt, AsyncWriteExt};
|
||||
|
||||
use crate::{
|
||||
auth::{ComputeUserInfoMaybeEndpoint, IpPattern},
|
||||
auth::{backend::MaskedIp, ComputeUserInfoMaybeEndpoint, IpPattern},
|
||||
config::AuthenticationConfig,
|
||||
console::{
|
||||
self,
|
||||
@@ -495,12 +539,12 @@ mod tests {
|
||||
},
|
||||
context::RequestMonitoring,
|
||||
proxy::NeonOptions,
|
||||
rate_limiter::{AuthRateLimiter, RateBucketInfo},
|
||||
rate_limiter::RateBucketInfo,
|
||||
scram::ServerSecret,
|
||||
stream::{PqStream, Stream},
|
||||
};
|
||||
|
||||
use super::auth_quirks;
|
||||
use super::{auth_quirks, AuthRateLimiter};
|
||||
|
||||
struct Auth {
|
||||
ips: Vec<IpPattern>,
|
||||
@@ -541,6 +585,7 @@ mod tests {
|
||||
scram_protocol_timeout: std::time::Duration::from_secs(5),
|
||||
rate_limiter_enabled: true,
|
||||
rate_limiter: AuthRateLimiter::new(&RateBucketInfo::DEFAULT_AUTH_SET),
|
||||
rate_limit_ip_subnet: 64,
|
||||
});
|
||||
|
||||
async fn read_message(r: &mut (impl AsyncRead + Unpin), b: &mut BytesMut) -> PgMessage {
|
||||
@@ -552,6 +597,51 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn masked_ip() {
|
||||
let ip_a = IpAddr::V4([127, 0, 0, 1].into());
|
||||
let ip_b = IpAddr::V4([127, 0, 0, 2].into());
|
||||
let ip_c = IpAddr::V4([192, 168, 1, 101].into());
|
||||
let ip_d = IpAddr::V4([192, 168, 1, 102].into());
|
||||
let ip_e = IpAddr::V6("abcd:abcd:abcd:abcd:abcd:abcd:abcd:abcd".parse().unwrap());
|
||||
let ip_f = IpAddr::V6("abcd:abcd:abcd:abcd:1234:abcd:abcd:abcd".parse().unwrap());
|
||||
|
||||
assert_ne!(MaskedIp::new(ip_a, 64), MaskedIp::new(ip_b, 64));
|
||||
assert_ne!(MaskedIp::new(ip_a, 32), MaskedIp::new(ip_b, 32));
|
||||
assert_eq!(MaskedIp::new(ip_a, 30), MaskedIp::new(ip_b, 30));
|
||||
assert_eq!(MaskedIp::new(ip_c, 30), MaskedIp::new(ip_d, 30));
|
||||
|
||||
assert_ne!(MaskedIp::new(ip_e, 128), MaskedIp::new(ip_f, 128));
|
||||
assert_eq!(MaskedIp::new(ip_e, 64), MaskedIp::new(ip_f, 64));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_default_auth_rate_limit_set() {
|
||||
// these values used to exceed u32::MAX
|
||||
assert_eq!(
|
||||
RateBucketInfo::DEFAULT_AUTH_SET,
|
||||
[
|
||||
RateBucketInfo {
|
||||
interval: Duration::from_secs(1),
|
||||
max_rpi: 1000 * 4096,
|
||||
},
|
||||
RateBucketInfo {
|
||||
interval: Duration::from_secs(60),
|
||||
max_rpi: 600 * 4096 * 60,
|
||||
},
|
||||
RateBucketInfo {
|
||||
interval: Duration::from_secs(600),
|
||||
max_rpi: 300 * 4096 * 600,
|
||||
}
|
||||
]
|
||||
);
|
||||
|
||||
for x in RateBucketInfo::DEFAULT_AUTH_SET {
|
||||
let y = x.to_string().parse().unwrap();
|
||||
assert_eq!(x, y);
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn auth_quirks_scram() {
|
||||
let (mut client, server) = tokio::io::duplex(1024);
|
||||
|
||||
@@ -9,15 +9,13 @@ use futures::future::Either;
|
||||
use itertools::Itertools;
|
||||
use proxy::config::TlsServerEndPoint;
|
||||
use proxy::context::RequestMonitoring;
|
||||
use proxy::proxy::run_until_cancelled;
|
||||
use proxy::{BranchId, EndpointId, ProjectId};
|
||||
use proxy::proxy::{copy_bidirectional_client_compute, run_until_cancelled};
|
||||
use rustls::pki_types::PrivateKeyDer;
|
||||
use tokio::net::TcpListener;
|
||||
|
||||
use anyhow::{anyhow, bail, ensure, Context};
|
||||
use clap::Arg;
|
||||
use futures::TryFutureExt;
|
||||
use proxy::console::messages::MetricsAuxInfo;
|
||||
use proxy::stream::{PqStream, Stream};
|
||||
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
@@ -204,6 +202,7 @@ async fn task_main(
|
||||
const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)";
|
||||
|
||||
async fn ssl_handshake<S: AsyncRead + AsyncWrite + Unpin>(
|
||||
ctx: &mut RequestMonitoring,
|
||||
raw_stream: S,
|
||||
tls_config: Arc<rustls::ServerConfig>,
|
||||
tls_server_end_point: TlsServerEndPoint,
|
||||
@@ -233,7 +232,10 @@ async fn ssl_handshake<S: AsyncRead + AsyncWrite + Unpin>(
|
||||
}
|
||||
|
||||
Ok(Stream::Tls {
|
||||
tls: Box::new(raw.upgrade(tls_config).await?),
|
||||
tls: Box::new(
|
||||
raw.upgrade(tls_config, !ctx.has_private_peer_addr())
|
||||
.await?,
|
||||
),
|
||||
tls_server_end_point,
|
||||
})
|
||||
}
|
||||
@@ -256,7 +258,7 @@ async fn handle_client(
|
||||
tls_server_end_point: TlsServerEndPoint,
|
||||
stream: impl AsyncRead + AsyncWrite + Unpin,
|
||||
) -> anyhow::Result<()> {
|
||||
let tls_stream = ssl_handshake(stream, tls_config, tls_server_end_point).await?;
|
||||
let mut tls_stream = ssl_handshake(&mut ctx, stream, tls_config, tls_server_end_point).await?;
|
||||
|
||||
// Cut off first part of the SNI domain
|
||||
// We receive required destination details in the format of
|
||||
@@ -273,18 +275,15 @@ async fn handle_client(
|
||||
|
||||
info!("destination: {}", destination);
|
||||
|
||||
let client = tokio::net::TcpStream::connect(destination).await?;
|
||||
|
||||
let metrics_aux: MetricsAuxInfo = MetricsAuxInfo {
|
||||
endpoint_id: (&EndpointId::from("")).into(),
|
||||
project_id: (&ProjectId::from("")).into(),
|
||||
branch_id: (&BranchId::from("")).into(),
|
||||
cold_start_info: proxy::console::messages::ColdStartInfo::Unknown,
|
||||
};
|
||||
let mut client = tokio::net::TcpStream::connect(destination).await?;
|
||||
|
||||
// doesn't yet matter as pg-sni-router doesn't report analytics logs
|
||||
ctx.set_success();
|
||||
ctx.log();
|
||||
|
||||
proxy::proxy::passthrough::proxy_pass(tls_stream, client, metrics_aux).await
|
||||
// Starting from here we only proxy the client's traffic.
|
||||
info!("performing the proxy pass...");
|
||||
let _ = copy_bidirectional_client_compute(&mut tls_stream, &mut client).await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -7,6 +7,7 @@ use aws_config::provider_config::ProviderConfig;
|
||||
use aws_config::web_identity_token::WebIdentityTokenCredentialsProvider;
|
||||
use futures::future::Either;
|
||||
use proxy::auth;
|
||||
use proxy::auth::backend::AuthRateLimiter;
|
||||
use proxy::auth::backend::MaybeOwned;
|
||||
use proxy::cancellation::CancelMap;
|
||||
use proxy::cancellation::CancellationHandler;
|
||||
@@ -20,10 +21,8 @@ use proxy::context::parquet::ParquetUploadArgs;
|
||||
use proxy::http;
|
||||
use proxy::http::health_server::AppMetrics;
|
||||
use proxy::metrics::Metrics;
|
||||
use proxy::rate_limiter::AuthRateLimiter;
|
||||
use proxy::rate_limiter::EndpointRateLimiter;
|
||||
use proxy::rate_limiter::RateBucketInfo;
|
||||
use proxy::rate_limiter::RateLimiterConfig;
|
||||
use proxy::redis::cancellation_publisher::RedisPublisherClient;
|
||||
use proxy::redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider;
|
||||
use proxy::redis::elasticache;
|
||||
@@ -43,6 +42,7 @@ use tokio::task::JoinSet;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::info;
|
||||
use tracing::warn;
|
||||
use tracing::Instrument;
|
||||
use utils::{project_build_tag, project_git_version, sentry_init::init_sentry};
|
||||
|
||||
project_git_version!(GIT_VERSION);
|
||||
@@ -132,14 +132,8 @@ struct ProxyCliArgs {
|
||||
#[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
|
||||
require_client_ip: bool,
|
||||
/// Disable dynamic rate limiter and store the metrics to ensure its production behaviour.
|
||||
#[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
|
||||
#[clap(long, default_value_t = true, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
|
||||
disable_dynamic_rate_limiter: bool,
|
||||
/// Rate limit algorithm. Makes sense only if `disable_dynamic_rate_limiter` is `false`.
|
||||
#[clap(value_enum, long, default_value_t = proxy::rate_limiter::RateLimitAlgorithm::Aimd)]
|
||||
rate_limit_algorithm: proxy::rate_limiter::RateLimitAlgorithm,
|
||||
/// Timeout for rate limiter. If it didn't manage to acquire a permit in this time, it will return an error.
|
||||
#[clap(long, default_value = "15s", value_parser = humantime::parse_duration)]
|
||||
rate_limiter_timeout: tokio::time::Duration,
|
||||
/// Endpoint rate limiter max number of requests per second.
|
||||
///
|
||||
/// Provided in the form '<Requests Per Second>@<Bucket Duration Size>'.
|
||||
@@ -152,14 +146,12 @@ struct ProxyCliArgs {
|
||||
/// Authentication rate limiter max number of hashes per second.
|
||||
#[clap(long, default_values_t = RateBucketInfo::DEFAULT_AUTH_SET)]
|
||||
auth_rate_limit: Vec<RateBucketInfo>,
|
||||
/// The IP subnet prefix length to use when deciding whether two IP addresses are treated as the same.
|
||||
#[clap(long, default_value_t = 64)]
|
||||
auth_rate_limit_ip_subnet: u8,
|
||||
/// Redis rate limiter max number of requests per second.
|
||||
#[clap(long, default_values_t = RateBucketInfo::DEFAULT_ENDPOINT_SET)]
|
||||
redis_rps_limit: Vec<RateBucketInfo>,
|
||||
/// Initial limit for dynamic rate limiter. Makes sense only if `rate_limit_algorithm` is *not* `None`.
|
||||
#[clap(long, default_value_t = 100)]
|
||||
initial_limit: usize,
|
||||
#[clap(flatten)]
|
||||
aimd_config: proxy::rate_limiter::AimdConfig,
|
||||
/// cache for `allowed_ips` (use `size=0` to disable)
|
||||
#[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)]
|
||||
allowed_ips_cache: String,
|
||||
@@ -208,6 +200,12 @@ struct ProxyCliArgs {
|
||||
/// Size of each event is no more than 400 bytes, so 2**22 is about 200MB before the compression.
|
||||
#[clap(long, default_value = "4194304")]
|
||||
metric_backup_collection_chunk_size: usize,
|
||||
/// Retry configuration for connecting to the compute node (key=value list, see the default value for the format)
|
||||
#[clap(long, default_value = config::RetryConfig::CONNECT_TO_COMPUTE_DEFAULT_VALUES)]
|
||||
connect_to_compute_retry: String,
|
||||
/// Retry configuration for the wake_compute request (key=value list, see the default value for the format)
|
||||
#[clap(long, default_value = config::RetryConfig::WAKE_COMPUTE_DEFAULT_VALUES)]
|
||||
wake_compute_retry: String,
|
||||
}
|
||||
|
||||
#[derive(clap::Args, Clone, Copy, Debug)]
|
||||
@@ -339,7 +337,6 @@ async fn main() -> anyhow::Result<()> {
|
||||
let proxy_listener = TcpListener::bind(proxy_address).await?;
|
||||
let cancellation_token = CancellationToken::new();
|
||||
|
||||
let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new(&config.endpoint_rps_limit));
|
||||
let cancel_map = CancelMap::default();
|
||||
|
||||
let redis_publisher = match ®ional_redis_client {
|
||||
@@ -365,7 +362,6 @@ async fn main() -> anyhow::Result<()> {
|
||||
config,
|
||||
proxy_listener,
|
||||
cancellation_token.clone(),
|
||||
endpoint_rate_limiter.clone(),
|
||||
cancellation_handler.clone(),
|
||||
));
|
||||
|
||||
@@ -380,7 +376,6 @@ async fn main() -> anyhow::Result<()> {
|
||||
config,
|
||||
serverless_listener,
|
||||
cancellation_token.clone(),
|
||||
endpoint_rate_limiter.clone(),
|
||||
cancellation_handler.clone(),
|
||||
));
|
||||
}
|
||||
@@ -427,7 +422,8 @@ async fn main() -> anyhow::Result<()> {
|
||||
if let Some(regional_redis_client) = regional_redis_client {
|
||||
let cache = api.caches.endpoints_cache.clone();
|
||||
let con = regional_redis_client;
|
||||
maintenance_tasks.spawn(async move { cache.do_read(con).await });
|
||||
let span = tracing::info_span!("endpoints_cache");
|
||||
maintenance_tasks.spawn(async move { cache.do_read(con).await }.instrument(span));
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -494,13 +490,9 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
|
||||
and metric-collection-interval must be specified"
|
||||
),
|
||||
};
|
||||
let rate_limiter_config = RateLimiterConfig {
|
||||
disable: args.disable_dynamic_rate_limiter,
|
||||
algorithm: args.rate_limit_algorithm,
|
||||
timeout: args.rate_limiter_timeout,
|
||||
initial_limit: args.initial_limit,
|
||||
aimd_config: Some(args.aimd_config),
|
||||
};
|
||||
if !args.disable_dynamic_rate_limiter {
|
||||
bail!("dynamic rate limiter should be disabled");
|
||||
}
|
||||
|
||||
let auth_backend = match &args.auth_backend {
|
||||
AuthBackend::Console => {
|
||||
@@ -542,9 +534,13 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
|
||||
tokio::spawn(locks.garbage_collect_worker());
|
||||
|
||||
let url = args.auth_endpoint.parse()?;
|
||||
let endpoint = http::Endpoint::new(url, http::new_client(rate_limiter_config));
|
||||
let endpoint = http::Endpoint::new(url, http::new_client());
|
||||
|
||||
let api = console::provider::neon::Api::new(endpoint, caches, locks);
|
||||
let mut endpoint_rps_limit = args.endpoint_rps_limit.clone();
|
||||
RateBucketInfo::validate(&mut endpoint_rps_limit)?;
|
||||
let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new(endpoint_rps_limit));
|
||||
let api =
|
||||
console::provider::neon::Api::new(endpoint, caches, locks, endpoint_rate_limiter);
|
||||
let api = console::provider::ConsoleBackend::Console(api);
|
||||
auth::BackendType::Console(MaybeOwned::Owned(api), ())
|
||||
}
|
||||
@@ -575,10 +571,9 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
|
||||
scram_protocol_timeout: args.scram_protocol_timeout,
|
||||
rate_limiter_enabled: args.auth_rate_limit_enabled,
|
||||
rate_limiter: AuthRateLimiter::new(args.auth_rate_limit.clone()),
|
||||
rate_limit_ip_subnet: args.auth_rate_limit_ip_subnet,
|
||||
};
|
||||
|
||||
let mut endpoint_rps_limit = args.endpoint_rps_limit.clone();
|
||||
RateBucketInfo::validate(&mut endpoint_rps_limit)?;
|
||||
let mut redis_rps_limit = args.redis_rps_limit.clone();
|
||||
RateBucketInfo::validate(&mut redis_rps_limit)?;
|
||||
|
||||
@@ -591,11 +586,14 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
|
||||
authentication_config,
|
||||
require_client_ip: args.require_client_ip,
|
||||
disable_ip_check_for_http: args.disable_ip_check_for_http,
|
||||
endpoint_rps_limit,
|
||||
redis_rps_limit,
|
||||
handshake_timeout: args.handshake_timeout,
|
||||
region: args.region.clone(),
|
||||
aws_region: args.aws_region.clone(),
|
||||
wake_compute_retry_config: config::RetryConfig::parse(&args.wake_compute_retry)?,
|
||||
connect_to_compute_retry_config: config::RetryConfig::parse(
|
||||
&args.connect_to_compute_retry,
|
||||
)?,
|
||||
}));
|
||||
|
||||
Ok(config)
|
||||
|
||||
19
proxy/src/cache/endpoints.rs
vendored
19
proxy/src/cache/endpoints.rs
vendored
@@ -13,6 +13,7 @@ use redis::{
|
||||
};
|
||||
use serde::Deserialize;
|
||||
use tokio::sync::Mutex;
|
||||
use tracing::info;
|
||||
|
||||
use crate::{
|
||||
config::EndpointCacheConfig,
|
||||
@@ -69,17 +70,14 @@ impl EndpointsCache {
|
||||
if !self.ready.load(Ordering::Acquire) {
|
||||
return true;
|
||||
}
|
||||
// If cache is disabled, just collect the metrics and return.
|
||||
if self.config.disable_cache {
|
||||
ctx.set_rejected(self.should_reject(endpoint));
|
||||
return true;
|
||||
}
|
||||
// If the limiter allows, we don't need to check the cache.
|
||||
if self.limiter.lock().await.check() {
|
||||
return true;
|
||||
}
|
||||
let rejected = self.should_reject(endpoint);
|
||||
ctx.set_rejected(rejected);
|
||||
info!(?rejected, "check endpoint is valid, disabled cache");
|
||||
// If cache is disabled, just collect the metrics and return or
|
||||
// If the limiter allows, we don't need to check the cache.
|
||||
if self.config.disable_cache || self.limiter.lock().await.check() {
|
||||
return true;
|
||||
}
|
||||
!rejected
|
||||
}
|
||||
fn should_reject(&self, endpoint: &EndpointId) -> bool {
|
||||
@@ -171,6 +169,9 @@ impl EndpointsCache {
|
||||
|
||||
if res.keys.is_empty() {
|
||||
if return_when_finish {
|
||||
if total != 0 {
|
||||
break;
|
||||
}
|
||||
anyhow::bail!(
|
||||
"Redis stream {} is empty, cannot be used to filter endpoints",
|
||||
self.config.stream_name
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use crate::{
|
||||
auth,
|
||||
rate_limiter::{AuthRateLimiter, RateBucketInfo},
|
||||
auth::{self, backend::AuthRateLimiter},
|
||||
rate_limiter::RateBucketInfo,
|
||||
serverless::GlobalConnPoolOptions,
|
||||
};
|
||||
use anyhow::{bail, ensure, Context, Ok};
|
||||
@@ -29,11 +29,12 @@ pub struct ProxyConfig {
|
||||
pub authentication_config: AuthenticationConfig,
|
||||
pub require_client_ip: bool,
|
||||
pub disable_ip_check_for_http: bool,
|
||||
pub endpoint_rps_limit: Vec<RateBucketInfo>,
|
||||
pub redis_rps_limit: Vec<RateBucketInfo>,
|
||||
pub region: String,
|
||||
pub handshake_timeout: Duration,
|
||||
pub aws_region: String,
|
||||
pub wake_compute_retry_config: RetryConfig,
|
||||
pub connect_to_compute_retry_config: RetryConfig,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
@@ -58,6 +59,7 @@ pub struct AuthenticationConfig {
|
||||
pub scram_protocol_timeout: tokio::time::Duration,
|
||||
pub rate_limiter_enabled: bool,
|
||||
pub rate_limiter: AuthRateLimiter,
|
||||
pub rate_limit_ip_subnet: u8,
|
||||
}
|
||||
|
||||
impl TlsConfig {
|
||||
@@ -517,6 +519,59 @@ impl FromStr for ProjectInfoCacheOptions {
|
||||
}
|
||||
}
|
||||
|
||||
/// This is a config for connect to compute and wake compute.
|
||||
#[derive(Clone, Copy, Debug)]
|
||||
pub struct RetryConfig {
|
||||
/// Number of times we should retry.
|
||||
pub max_retries: u32,
|
||||
/// Retry duration is base_delay * backoff_factor ^ n, where n starts at 0
|
||||
pub base_delay: tokio::time::Duration,
|
||||
/// Exponential base for retry wait duration
|
||||
pub backoff_factor: f64,
|
||||
}
|
||||
|
||||
impl RetryConfig {
|
||||
/// Default options for RetryConfig.
|
||||
|
||||
/// Total delay for 4 retries with 1s base delay and 2.0 backoff factor is 7s.
|
||||
pub const CONNECT_TO_COMPUTE_DEFAULT_VALUES: &'static str =
|
||||
"num_retries=4,base_retry_wait_duration=1s,retry_wait_exponent_base=2.0";
|
||||
/// Total delay for 4 retries with 1s base delay and 2.0 backoff factor is 7s.
|
||||
/// Cplane has timeout of 60s on each request.
|
||||
pub const WAKE_COMPUTE_DEFAULT_VALUES: &'static str =
|
||||
"num_retries=4,base_retry_wait_duration=1s,retry_wait_exponent_base=2.0";
|
||||
|
||||
/// Parse retry options passed via cmdline.
|
||||
/// Example: [`Self::CONNECT_TO_COMPUTE_DEFAULT_VALUES`].
|
||||
pub fn parse(options: &str) -> anyhow::Result<Self> {
|
||||
let mut num_retries = None;
|
||||
let mut base_retry_wait_duration = None;
|
||||
let mut retry_wait_exponent_base = None;
|
||||
|
||||
for option in options.split(',') {
|
||||
let (key, value) = option
|
||||
.split_once('=')
|
||||
.with_context(|| format!("bad key-value pair: {option}"))?;
|
||||
|
||||
match key {
|
||||
"num_retries" => num_retries = Some(value.parse()?),
|
||||
"base_retry_wait_duration" => {
|
||||
base_retry_wait_duration = Some(humantime::parse_duration(value)?)
|
||||
}
|
||||
"retry_wait_exponent_base" => retry_wait_exponent_base = Some(value.parse()?),
|
||||
unknown => bail!("unknown key: {unknown}"),
|
||||
}
|
||||
}
|
||||
|
||||
Ok(Self {
|
||||
max_retries: num_retries.context("missing `num_retries`")?,
|
||||
base_delay: base_retry_wait_duration.context("missing `base_retry_wait_duration`")?,
|
||||
backoff_factor: retry_wait_exponent_base
|
||||
.context("missing `retry_wait_exponent_base`")?,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// Helper for cmdline cache options parsing.
|
||||
pub struct WakeComputeLockOptions {
|
||||
/// The number of shards the lock map should have
|
||||
|
||||
@@ -208,6 +208,9 @@ pub mod errors {
|
||||
#[error(transparent)]
|
||||
ApiError(ApiError),
|
||||
|
||||
#[error("Too many connections attempts")]
|
||||
TooManyConnections,
|
||||
|
||||
#[error("Timeout waiting to acquire wake compute lock")]
|
||||
TimeoutError,
|
||||
}
|
||||
@@ -240,6 +243,8 @@ pub mod errors {
|
||||
// However, API might return a meaningful error.
|
||||
ApiError(e) => e.to_string_client(),
|
||||
|
||||
TooManyConnections => self.to_string(),
|
||||
|
||||
TimeoutError => "timeout while acquiring the compute resource lock".to_owned(),
|
||||
}
|
||||
}
|
||||
@@ -250,6 +255,7 @@ pub mod errors {
|
||||
match self {
|
||||
WakeComputeError::BadComputeAddress(_) => crate::error::ErrorKind::ControlPlane,
|
||||
WakeComputeError::ApiError(e) => e.get_error_kind(),
|
||||
WakeComputeError::TooManyConnections => crate::error::ErrorKind::RateLimit,
|
||||
WakeComputeError::TimeoutError => crate::error::ErrorKind::ServiceRateLimit,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -12,6 +12,7 @@ use crate::{
|
||||
console::messages::ColdStartInfo,
|
||||
http,
|
||||
metrics::{CacheOutcome, Metrics},
|
||||
rate_limiter::EndpointRateLimiter,
|
||||
scram, Normalize,
|
||||
};
|
||||
use crate::{cache::Cached, context::RequestMonitoring};
|
||||
@@ -25,6 +26,7 @@ pub struct Api {
|
||||
endpoint: http::Endpoint,
|
||||
pub caches: &'static ApiCaches,
|
||||
pub locks: &'static ApiLocks,
|
||||
pub endpoint_rate_limiter: Arc<EndpointRateLimiter>,
|
||||
jwt: String,
|
||||
}
|
||||
|
||||
@@ -34,6 +36,7 @@ impl Api {
|
||||
endpoint: http::Endpoint,
|
||||
caches: &'static ApiCaches,
|
||||
locks: &'static ApiLocks,
|
||||
endpoint_rate_limiter: Arc<EndpointRateLimiter>,
|
||||
) -> Self {
|
||||
let jwt: String = match std::env::var("NEON_PROXY_TO_CONTROLPLANE_TOKEN") {
|
||||
Ok(v) => v,
|
||||
@@ -43,6 +46,7 @@ impl Api {
|
||||
endpoint,
|
||||
caches,
|
||||
locks,
|
||||
endpoint_rate_limiter,
|
||||
jwt,
|
||||
}
|
||||
}
|
||||
@@ -277,6 +281,14 @@ impl super::Api for Api {
|
||||
return Ok(cached);
|
||||
}
|
||||
|
||||
// check rate limit
|
||||
if !self
|
||||
.endpoint_rate_limiter
|
||||
.check(user_info.endpoint.normalize().into(), 1)
|
||||
{
|
||||
return Err(WakeComputeError::TooManyConnections);
|
||||
}
|
||||
|
||||
let permit = self.locks.get_wake_compute_permit(&key).await?;
|
||||
|
||||
// after getting back a permit - it's possible the cache was filled
|
||||
|
||||
@@ -5,7 +5,7 @@ use once_cell::sync::OnceCell;
|
||||
use smol_str::SmolStr;
|
||||
use std::net::IpAddr;
|
||||
use tokio::sync::mpsc;
|
||||
use tracing::{field::display, info_span, Span};
|
||||
use tracing::{field::display, info, info_span, Span};
|
||||
use uuid::Uuid;
|
||||
|
||||
use crate::{
|
||||
@@ -51,7 +51,7 @@ pub struct RequestMonitoring {
|
||||
sender: Option<mpsc::UnboundedSender<RequestData>>,
|
||||
pub latency_timer: LatencyTimer,
|
||||
// Whether proxy decided that it's not a valid endpoint and rejected it before going to cplane.
|
||||
rejected: bool,
|
||||
rejected: Option<bool>,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
@@ -76,6 +76,7 @@ impl RequestMonitoring {
|
||||
?session_id,
|
||||
%peer_addr,
|
||||
ep = tracing::field::Empty,
|
||||
role = tracing::field::Empty,
|
||||
);
|
||||
|
||||
Self {
|
||||
@@ -95,7 +96,7 @@ impl RequestMonitoring {
|
||||
error_kind: None,
|
||||
auth_method: None,
|
||||
success: false,
|
||||
rejected: false,
|
||||
rejected: None,
|
||||
cold_start_info: ColdStartInfo::Unknown,
|
||||
|
||||
sender: LOG_CHAN.get().and_then(|tx| tx.upgrade()),
|
||||
@@ -117,7 +118,7 @@ impl RequestMonitoring {
|
||||
}
|
||||
|
||||
pub fn set_rejected(&mut self, rejected: bool) {
|
||||
self.rejected = rejected;
|
||||
self.rejected = Some(rejected);
|
||||
}
|
||||
|
||||
pub fn set_cold_start_info(&mut self, info: ColdStartInfo) {
|
||||
@@ -157,6 +158,7 @@ impl RequestMonitoring {
|
||||
}
|
||||
|
||||
pub fn set_user(&mut self, user: RoleName) {
|
||||
self.span.record("role", display(&user));
|
||||
self.user = Some(user);
|
||||
}
|
||||
|
||||
@@ -164,8 +166,18 @@ impl RequestMonitoring {
|
||||
self.auth_method = Some(auth_method);
|
||||
}
|
||||
|
||||
pub fn has_private_peer_addr(&self) -> bool {
|
||||
match self.peer_addr {
|
||||
IpAddr::V4(ip) => ip.is_private(),
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn set_error_kind(&mut self, kind: ErrorKind) {
|
||||
Metrics::get().proxy.errors_total.inc(kind);
|
||||
// Do not record errors from the private address to metrics.
|
||||
if !self.has_private_peer_addr() {
|
||||
Metrics::get().proxy.errors_total.inc(kind);
|
||||
}
|
||||
if let Some(ep) = &self.endpoint_id {
|
||||
let metric = &Metrics::get().proxy.endpoints_affected_by_errors;
|
||||
let label = metric.with_labels(kind);
|
||||
@@ -188,14 +200,28 @@ impl Drop for RequestMonitoring {
|
||||
} else {
|
||||
ConnectOutcome::Failed
|
||||
};
|
||||
Metrics::get()
|
||||
.proxy
|
||||
.invalid_endpoints_total
|
||||
.inc(InvalidEndpointsGroup {
|
||||
protocol: self.protocol,
|
||||
rejected: self.rejected.into(),
|
||||
outcome,
|
||||
});
|
||||
if let Some(rejected) = self.rejected {
|
||||
let ep = self
|
||||
.endpoint_id
|
||||
.as_ref()
|
||||
.map(|x| x.as_str())
|
||||
.unwrap_or_default();
|
||||
// This makes sense only if cache is disabled
|
||||
info!(
|
||||
?outcome,
|
||||
?rejected,
|
||||
?ep,
|
||||
"check endpoint is valid with outcome"
|
||||
);
|
||||
Metrics::get()
|
||||
.proxy
|
||||
.invalid_endpoints_total
|
||||
.inc(InvalidEndpointsGroup {
|
||||
protocol: self.protocol,
|
||||
rejected: rejected.into(),
|
||||
outcome,
|
||||
});
|
||||
}
|
||||
if let Some(tx) = self.sender.take() {
|
||||
let _: Result<(), _> = tx.send(RequestData::from(&*self));
|
||||
}
|
||||
|
||||
@@ -15,7 +15,6 @@ use tracing::trace;
|
||||
|
||||
use crate::{
|
||||
metrics::{ConsoleRequest, Metrics},
|
||||
rate_limiter,
|
||||
url::ApiUrl,
|
||||
};
|
||||
use reqwest_middleware::RequestBuilder;
|
||||
@@ -23,7 +22,7 @@ use reqwest_middleware::RequestBuilder;
|
||||
/// This is the preferred way to create new http clients,
|
||||
/// because it takes care of observability (OpenTelemetry).
|
||||
/// We deliberately don't want to replace this with a public static.
|
||||
pub fn new_client(rate_limiter_config: rate_limiter::RateLimiterConfig) -> ClientWithMiddleware {
|
||||
pub fn new_client() -> ClientWithMiddleware {
|
||||
let client = reqwest::ClientBuilder::new()
|
||||
.dns_resolver(Arc::new(GaiResolver::default()))
|
||||
.connection_verbose(true)
|
||||
@@ -32,7 +31,6 @@ pub fn new_client(rate_limiter_config: rate_limiter::RateLimiterConfig) -> Clien
|
||||
|
||||
reqwest_middleware::ClientBuilder::new(client)
|
||||
.with(reqwest_tracing::TracingMiddleware::default())
|
||||
.with(rate_limiter::Limiter::new(rate_limiter_config))
|
||||
.build()
|
||||
}
|
||||
|
||||
|
||||
@@ -4,8 +4,8 @@ use lasso::ThreadedRodeo;
|
||||
use measured::{
|
||||
label::StaticLabelSet,
|
||||
metric::{histogram::Thresholds, name::MetricName},
|
||||
Counter, CounterVec, FixedCardinalityLabel, Gauge, GaugeVec, Histogram, HistogramVec,
|
||||
LabelGroup, MetricGroup,
|
||||
Counter, CounterVec, FixedCardinalityLabel, Gauge, Histogram, HistogramVec, LabelGroup,
|
||||
MetricGroup,
|
||||
};
|
||||
use metrics::{CounterPairAssoc, CounterPairVec, HyperLogLog, HyperLogLogVec};
|
||||
|
||||
@@ -20,9 +20,6 @@ pub struct Metrics {
|
||||
|
||||
#[metric(namespace = "wake_compute_lock")]
|
||||
pub wake_compute_lock: ApiLockMetrics,
|
||||
|
||||
// the one metric not called proxy_....
|
||||
pub semaphore_control_plane_limit: GaugeVec<StaticLabelSet<RateLimit>>,
|
||||
}
|
||||
|
||||
impl Metrics {
|
||||
@@ -31,7 +28,6 @@ impl Metrics {
|
||||
SELF.get_or_init(|| Metrics {
|
||||
proxy: ProxyMetrics::default(),
|
||||
wake_compute_lock: ApiLockMetrics::new(),
|
||||
semaphore_control_plane_limit: GaugeVec::default(),
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -123,6 +119,10 @@ pub struct ProxyMetrics {
|
||||
|
||||
/// Number of invalid endpoints (per protocol, per rejected).
|
||||
pub invalid_endpoints_total: CounterVec<InvalidEndpointsSet>,
|
||||
|
||||
/// Number of retries (per outcome, per retry_type).
|
||||
#[metric(metadata = Thresholds::with_buckets([0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]))]
|
||||
pub retries_metric: HistogramVec<RetriesMetricSet, 9>,
|
||||
}
|
||||
|
||||
#[derive(MetricGroup)]
|
||||
@@ -286,13 +286,6 @@ pub enum LatencyExclusions {
|
||||
ClientAndCplane,
|
||||
}
|
||||
|
||||
#[derive(FixedCardinalityLabel, Copy, Clone)]
|
||||
#[label(singleton = "limit")]
|
||||
pub enum RateLimit {
|
||||
Actual,
|
||||
Expected,
|
||||
}
|
||||
|
||||
#[derive(FixedCardinalityLabel, Copy, Clone)]
|
||||
#[label(singleton = "kind")]
|
||||
pub enum SniKind {
|
||||
@@ -491,3 +484,16 @@ pub struct InvalidEndpointsGroup {
|
||||
pub rejected: Bool,
|
||||
pub outcome: ConnectOutcome,
|
||||
}
|
||||
|
||||
#[derive(LabelGroup)]
|
||||
#[label(set = RetriesMetricSet)]
|
||||
pub struct RetriesMetricGroup {
|
||||
pub outcome: ConnectOutcome,
|
||||
pub retry_type: RetryType,
|
||||
}
|
||||
|
||||
#[derive(FixedCardinalityLabel, Clone, Copy, Debug)]
|
||||
pub enum RetryType {
|
||||
WakeCompute,
|
||||
ConnectToCompute,
|
||||
}
|
||||
|
||||
@@ -7,6 +7,7 @@ pub mod handshake;
|
||||
pub mod passthrough;
|
||||
pub mod retry;
|
||||
pub mod wake_compute;
|
||||
pub use copy_bidirectional::copy_bidirectional_client_compute;
|
||||
|
||||
use crate::{
|
||||
auth,
|
||||
@@ -18,9 +19,8 @@ use crate::{
|
||||
metrics::{Metrics, NumClientConnectionsGuard},
|
||||
protocol2::WithClientIp,
|
||||
proxy::handshake::{handshake, HandshakeData},
|
||||
rate_limiter::EndpointRateLimiter,
|
||||
stream::{PqStream, Stream},
|
||||
EndpointCacheKey, Normalize,
|
||||
EndpointCacheKey,
|
||||
};
|
||||
use futures::TryFutureExt;
|
||||
use itertools::Itertools;
|
||||
@@ -60,7 +60,6 @@ pub async fn task_main(
|
||||
config: &'static ProxyConfig,
|
||||
listener: tokio::net::TcpListener,
|
||||
cancellation_token: CancellationToken,
|
||||
endpoint_rate_limiter: Arc<EndpointRateLimiter>,
|
||||
cancellation_handler: Arc<CancellationHandlerMain>,
|
||||
) -> anyhow::Result<()> {
|
||||
scopeguard::defer! {
|
||||
@@ -85,7 +84,6 @@ pub async fn task_main(
|
||||
|
||||
let session_id = uuid::Uuid::new_v4();
|
||||
let cancellation_handler = Arc::clone(&cancellation_handler);
|
||||
let endpoint_rate_limiter = endpoint_rate_limiter.clone();
|
||||
|
||||
tracing::info!(protocol = "tcp", %session_id, "accepted new TCP connection");
|
||||
|
||||
@@ -127,7 +125,6 @@ pub async fn task_main(
|
||||
cancellation_handler,
|
||||
socket,
|
||||
ClientMode::Tcp,
|
||||
endpoint_rate_limiter,
|
||||
conn_gauge,
|
||||
)
|
||||
.instrument(span.clone())
|
||||
@@ -241,7 +238,6 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
|
||||
cancellation_handler: Arc<CancellationHandlerMain>,
|
||||
stream: S,
|
||||
mode: ClientMode,
|
||||
endpoint_rate_limiter: Arc<EndpointRateLimiter>,
|
||||
conn_gauge: NumClientConnectionsGuard<'static>,
|
||||
) -> Result<Option<ProxyPassthrough<CancellationHandlerMainInternal, S>>, ClientRequestError> {
|
||||
info!(
|
||||
@@ -256,8 +252,9 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
|
||||
|
||||
let tls = config.tls_config.as_ref();
|
||||
|
||||
let record_handshake_error = !ctx.has_private_peer_addr();
|
||||
let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Client);
|
||||
let do_handshake = handshake(stream, mode.handshake_tls(tls));
|
||||
let do_handshake = handshake(stream, mode.handshake_tls(tls), record_handshake_error);
|
||||
let (mut stream, params) =
|
||||
match tokio::time::timeout(config.handshake_timeout, do_handshake).await?? {
|
||||
HandshakeData::Startup(stream, params) => (stream, params),
|
||||
@@ -286,15 +283,6 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
|
||||
Err(e) => stream.throw_error(e).await?,
|
||||
};
|
||||
|
||||
// check rate limit
|
||||
if let Some(ep) = user_info.get_endpoint() {
|
||||
if !endpoint_rate_limiter.check(ep.normalize(), 1) {
|
||||
return stream
|
||||
.throw_error(auth::AuthError::too_many_connections())
|
||||
.await?;
|
||||
}
|
||||
}
|
||||
|
||||
let user = user_info.get_user().to_owned();
|
||||
let user_info = match user_info
|
||||
.authenticate(
|
||||
@@ -320,6 +308,8 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
|
||||
&TcpMechanism { params: ¶ms },
|
||||
&user_info,
|
||||
mode.allow_self_signed_compute(config),
|
||||
config.wake_compute_retry_config,
|
||||
config.connect_to_compute_retry_config,
|
||||
)
|
||||
.or_else(|e| stream.throw_error(e))
|
||||
.await?;
|
||||
|
||||
@@ -1,10 +1,11 @@
|
||||
use crate::{
|
||||
auth::backend::ComputeCredentialKeys,
|
||||
compute::{self, PostgresConnection},
|
||||
config::RetryConfig,
|
||||
console::{self, errors::WakeComputeError, CachedNodeInfo, NodeInfo},
|
||||
context::RequestMonitoring,
|
||||
error::ReportableError,
|
||||
metrics::{ConnectionFailureKind, Metrics},
|
||||
metrics::{ConnectOutcome, ConnectionFailureKind, Metrics, RetriesMetricGroup, RetryType},
|
||||
proxy::{
|
||||
retry::{retry_after, ShouldRetry},
|
||||
wake_compute::wake_compute,
|
||||
@@ -93,19 +94,23 @@ pub async fn connect_to_compute<M: ConnectMechanism, B: ComputeConnectBackend>(
|
||||
mechanism: &M,
|
||||
user_info: &B,
|
||||
allow_self_signed_compute: bool,
|
||||
wake_compute_retry_config: RetryConfig,
|
||||
connect_to_compute_retry_config: RetryConfig,
|
||||
) -> Result<M::Connection, M::Error>
|
||||
where
|
||||
M::ConnectError: ShouldRetry + std::fmt::Debug,
|
||||
M::Error: From<WakeComputeError>,
|
||||
{
|
||||
let mut num_retries = 0;
|
||||
let mut node_info = wake_compute(&mut num_retries, ctx, user_info).await?;
|
||||
let mut node_info =
|
||||
wake_compute(&mut num_retries, ctx, user_info, wake_compute_retry_config).await?;
|
||||
if let Some(keys) = user_info.get_keys() {
|
||||
node_info.set_keys(keys);
|
||||
}
|
||||
node_info.allow_self_signed_compute = allow_self_signed_compute;
|
||||
// let mut node_info = credentials.get_node_info(ctx, user_info).await?;
|
||||
mechanism.update_connect_config(&mut node_info.config);
|
||||
let retry_type = RetryType::ConnectToCompute;
|
||||
|
||||
// try once
|
||||
let err = match mechanism
|
||||
@@ -114,6 +119,13 @@ where
|
||||
{
|
||||
Ok(res) => {
|
||||
ctx.latency_timer.success();
|
||||
Metrics::get().proxy.retries_metric.observe(
|
||||
RetriesMetricGroup {
|
||||
outcome: ConnectOutcome::Success,
|
||||
retry_type,
|
||||
},
|
||||
num_retries.into(),
|
||||
);
|
||||
return Ok(res);
|
||||
}
|
||||
Err(e) => e,
|
||||
@@ -124,7 +136,7 @@ where
|
||||
let node_info = if !node_info.cached() {
|
||||
// If we just received this from cplane and didn't get it from cache, we shouldn't retry.
|
||||
// Do not need to retrieve a new node_info, just return the old one.
|
||||
if !err.should_retry(num_retries) {
|
||||
if !err.should_retry(num_retries, connect_to_compute_retry_config) {
|
||||
return Err(err.into());
|
||||
}
|
||||
node_info
|
||||
@@ -132,7 +144,8 @@ where
|
||||
// if we failed to connect, it's likely that the compute node was suspended, wake a new compute node
|
||||
info!("compute node's state has likely changed; requesting a wake-up");
|
||||
let old_node_info = invalidate_cache(node_info);
|
||||
let mut node_info = wake_compute(&mut num_retries, ctx, user_info).await?;
|
||||
let mut node_info =
|
||||
wake_compute(&mut num_retries, ctx, user_info, wake_compute_retry_config).await?;
|
||||
node_info.reuse_settings(old_node_info);
|
||||
|
||||
mechanism.update_connect_config(&mut node_info.config);
|
||||
@@ -151,19 +164,34 @@ where
|
||||
{
|
||||
Ok(res) => {
|
||||
ctx.latency_timer.success();
|
||||
Metrics::get().proxy.retries_metric.observe(
|
||||
RetriesMetricGroup {
|
||||
outcome: ConnectOutcome::Success,
|
||||
retry_type,
|
||||
},
|
||||
num_retries.into(),
|
||||
);
|
||||
info!(?num_retries, "connected to compute node after");
|
||||
return Ok(res);
|
||||
}
|
||||
Err(e) => {
|
||||
let retriable = e.should_retry(num_retries);
|
||||
let retriable = e.should_retry(num_retries, connect_to_compute_retry_config);
|
||||
if !retriable {
|
||||
error!(error = ?e, num_retries, retriable, "couldn't connect to compute node");
|
||||
Metrics::get().proxy.retries_metric.observe(
|
||||
RetriesMetricGroup {
|
||||
outcome: ConnectOutcome::Failed,
|
||||
retry_type,
|
||||
},
|
||||
num_retries.into(),
|
||||
);
|
||||
return Err(e.into());
|
||||
}
|
||||
warn!(error = ?e, num_retries, retriable, "couldn't connect to compute node");
|
||||
}
|
||||
}
|
||||
|
||||
let wait_duration = retry_after(num_retries);
|
||||
let wait_duration = retry_after(num_retries, connect_to_compute_retry_config);
|
||||
num_retries += 1;
|
||||
|
||||
time::sleep(wait_duration).await;
|
||||
|
||||
@@ -41,7 +41,7 @@ where
|
||||
}
|
||||
|
||||
#[tracing::instrument(skip_all)]
|
||||
pub(super) async fn copy_bidirectional_client_compute<Client, Compute>(
|
||||
pub async fn copy_bidirectional_client_compute<Client, Compute>(
|
||||
client: &mut Client,
|
||||
compute: &mut Compute,
|
||||
) -> Result<(u64, u64), std::io::Error>
|
||||
|
||||
@@ -63,6 +63,7 @@ pub enum HandshakeData<S> {
|
||||
pub async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
|
||||
stream: S,
|
||||
mut tls: Option<&TlsConfig>,
|
||||
record_handshake_error: bool,
|
||||
) -> Result<HandshakeData<S>, HandshakeError> {
|
||||
// Client may try upgrading to each protocol only once
|
||||
let (mut tried_ssl, mut tried_gss) = (false, false);
|
||||
@@ -95,7 +96,9 @@ pub async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
|
||||
if !read_buf.is_empty() {
|
||||
return Err(HandshakeError::EarlyData);
|
||||
}
|
||||
let tls_stream = raw.upgrade(tls.to_server_config()).await?;
|
||||
let tls_stream = raw
|
||||
.upgrade(tls.to_server_config(), record_handshake_error)
|
||||
.await?;
|
||||
|
||||
let (_, tls_server_end_point) = tls
|
||||
.cert_resolver
|
||||
|
||||
@@ -1,18 +1,12 @@
|
||||
use crate::compute;
|
||||
use crate::{compute, config::RetryConfig};
|
||||
use std::{error::Error, io};
|
||||
use tokio::time;
|
||||
|
||||
/// Number of times we should retry the `/proxy_wake_compute` http request.
|
||||
/// Retry duration is BASE_RETRY_WAIT_DURATION * RETRY_WAIT_EXPONENT_BASE ^ n, where n starts at 0
|
||||
pub const NUM_RETRIES_CONNECT: u32 = 16;
|
||||
const BASE_RETRY_WAIT_DURATION: time::Duration = time::Duration::from_millis(25);
|
||||
const RETRY_WAIT_EXPONENT_BASE: f64 = std::f64::consts::SQRT_2;
|
||||
|
||||
pub trait ShouldRetry {
|
||||
fn could_retry(&self) -> bool;
|
||||
fn should_retry(&self, num_retries: u32) -> bool {
|
||||
fn should_retry(&self, num_retries: u32, config: RetryConfig) -> bool {
|
||||
match self {
|
||||
_ if num_retries >= NUM_RETRIES_CONNECT => false,
|
||||
_ if num_retries >= config.max_retries => false,
|
||||
err => err.could_retry(),
|
||||
}
|
||||
}
|
||||
@@ -63,6 +57,8 @@ impl ShouldRetry for compute::ConnectionError {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn retry_after(num_retries: u32) -> time::Duration {
|
||||
BASE_RETRY_WAIT_DURATION.mul_f64(RETRY_WAIT_EXPONENT_BASE.powi((num_retries as i32) - 1))
|
||||
pub fn retry_after(num_retries: u32, config: RetryConfig) -> time::Duration {
|
||||
config
|
||||
.base_delay
|
||||
.mul_f64(config.backoff_factor.powi((num_retries as i32) - 1))
|
||||
}
|
||||
|
||||
@@ -10,13 +10,13 @@ use super::*;
|
||||
use crate::auth::backend::{
|
||||
ComputeCredentialKeys, ComputeCredentials, ComputeUserInfo, MaybeOwned, TestBackend,
|
||||
};
|
||||
use crate::config::CertResolver;
|
||||
use crate::config::{CertResolver, RetryConfig};
|
||||
use crate::console::caches::NodeInfoCache;
|
||||
use crate::console::messages::MetricsAuxInfo;
|
||||
use crate::console::provider::{CachedAllowedIps, CachedRoleSecret, ConsoleBackend};
|
||||
use crate::console::{self, CachedNodeInfo, NodeInfo};
|
||||
use crate::error::ErrorKind;
|
||||
use crate::proxy::retry::{retry_after, NUM_RETRIES_CONNECT};
|
||||
use crate::proxy::retry::retry_after;
|
||||
use crate::{http, sasl, scram, BranchId, EndpointId, ProjectId};
|
||||
use anyhow::{bail, Context};
|
||||
use async_trait::async_trait;
|
||||
@@ -175,7 +175,7 @@ async fn dummy_proxy(
|
||||
auth: impl TestAuth + Send,
|
||||
) -> anyhow::Result<()> {
|
||||
let client = WithClientIp::new(client);
|
||||
let mut stream = match handshake(client, tls.as_ref()).await? {
|
||||
let mut stream = match handshake(client, tls.as_ref(), false).await? {
|
||||
HandshakeData::Startup(stream, _) => stream,
|
||||
HandshakeData::Cancel(_) => bail!("cancellation not supported"),
|
||||
};
|
||||
@@ -361,11 +361,15 @@ async fn scram_auth_mock() -> anyhow::Result<()> {
|
||||
#[test]
|
||||
fn connect_compute_total_wait() {
|
||||
let mut total_wait = tokio::time::Duration::ZERO;
|
||||
for num_retries in 1..NUM_RETRIES_CONNECT {
|
||||
total_wait += retry_after(num_retries);
|
||||
let config = RetryConfig {
|
||||
base_delay: Duration::from_secs(1),
|
||||
max_retries: 5,
|
||||
backoff_factor: 2.0,
|
||||
};
|
||||
for num_retries in 1..config.max_retries {
|
||||
total_wait += retry_after(num_retries, config);
|
||||
}
|
||||
assert!(total_wait < tokio::time::Duration::from_secs(12));
|
||||
assert!(total_wait > tokio::time::Duration::from_secs(10));
|
||||
assert!(f64::abs(total_wait.as_secs_f64() - 15.0) < 0.1);
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy, Debug)]
|
||||
@@ -549,7 +553,12 @@ async fn connect_to_compute_success() {
|
||||
let mut ctx = RequestMonitoring::test();
|
||||
let mechanism = TestConnectMechanism::new(vec![Wake, Connect]);
|
||||
let user_info = helper_create_connect_info(&mechanism);
|
||||
connect_to_compute(&mut ctx, &mechanism, &user_info, false)
|
||||
let config = RetryConfig {
|
||||
base_delay: Duration::from_secs(1),
|
||||
max_retries: 5,
|
||||
backoff_factor: 2.0,
|
||||
};
|
||||
connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config)
|
||||
.await
|
||||
.unwrap();
|
||||
mechanism.verify();
|
||||
@@ -562,7 +571,12 @@ async fn connect_to_compute_retry() {
|
||||
let mut ctx = RequestMonitoring::test();
|
||||
let mechanism = TestConnectMechanism::new(vec![Wake, Retry, Wake, Connect]);
|
||||
let user_info = helper_create_connect_info(&mechanism);
|
||||
connect_to_compute(&mut ctx, &mechanism, &user_info, false)
|
||||
let config = RetryConfig {
|
||||
base_delay: Duration::from_secs(1),
|
||||
max_retries: 5,
|
||||
backoff_factor: 2.0,
|
||||
};
|
||||
connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config)
|
||||
.await
|
||||
.unwrap();
|
||||
mechanism.verify();
|
||||
@@ -576,7 +590,12 @@ async fn connect_to_compute_non_retry_1() {
|
||||
let mut ctx = RequestMonitoring::test();
|
||||
let mechanism = TestConnectMechanism::new(vec![Wake, Retry, Wake, Fail]);
|
||||
let user_info = helper_create_connect_info(&mechanism);
|
||||
connect_to_compute(&mut ctx, &mechanism, &user_info, false)
|
||||
let config = RetryConfig {
|
||||
base_delay: Duration::from_secs(1),
|
||||
max_retries: 5,
|
||||
backoff_factor: 2.0,
|
||||
};
|
||||
connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config)
|
||||
.await
|
||||
.unwrap_err();
|
||||
mechanism.verify();
|
||||
@@ -590,7 +609,12 @@ async fn connect_to_compute_non_retry_2() {
|
||||
let mut ctx = RequestMonitoring::test();
|
||||
let mechanism = TestConnectMechanism::new(vec![Wake, Fail, Wake, Connect]);
|
||||
let user_info = helper_create_connect_info(&mechanism);
|
||||
connect_to_compute(&mut ctx, &mechanism, &user_info, false)
|
||||
let config = RetryConfig {
|
||||
base_delay: Duration::from_secs(1),
|
||||
max_retries: 5,
|
||||
backoff_factor: 2.0,
|
||||
};
|
||||
connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config)
|
||||
.await
|
||||
.unwrap();
|
||||
mechanism.verify();
|
||||
@@ -600,17 +624,32 @@ async fn connect_to_compute_non_retry_2() {
|
||||
#[tokio::test]
|
||||
async fn connect_to_compute_non_retry_3() {
|
||||
let _ = env_logger::try_init();
|
||||
assert_eq!(NUM_RETRIES_CONNECT, 16);
|
||||
tokio::time::pause();
|
||||
use ConnectAction::*;
|
||||
let mut ctx = RequestMonitoring::test();
|
||||
let mechanism = TestConnectMechanism::new(vec![
|
||||
Wake, Retry, Wake, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry,
|
||||
Retry, Retry, Retry, Retry, Retry, /* the 17th time */ Retry,
|
||||
]);
|
||||
let mechanism =
|
||||
TestConnectMechanism::new(vec![Wake, Retry, Wake, Retry, Retry, Retry, Retry, Retry]);
|
||||
let user_info = helper_create_connect_info(&mechanism);
|
||||
connect_to_compute(&mut ctx, &mechanism, &user_info, false)
|
||||
.await
|
||||
.unwrap_err();
|
||||
let wake_compute_retry_config = RetryConfig {
|
||||
base_delay: Duration::from_secs(1),
|
||||
max_retries: 1,
|
||||
backoff_factor: 2.0,
|
||||
};
|
||||
let connect_to_compute_retry_config = RetryConfig {
|
||||
base_delay: Duration::from_secs(1),
|
||||
max_retries: 5,
|
||||
backoff_factor: 2.0,
|
||||
};
|
||||
connect_to_compute(
|
||||
&mut ctx,
|
||||
&mechanism,
|
||||
&user_info,
|
||||
false,
|
||||
wake_compute_retry_config,
|
||||
connect_to_compute_retry_config,
|
||||
)
|
||||
.await
|
||||
.unwrap_err();
|
||||
mechanism.verify();
|
||||
}
|
||||
|
||||
@@ -622,7 +661,12 @@ async fn wake_retry() {
|
||||
let mut ctx = RequestMonitoring::test();
|
||||
let mechanism = TestConnectMechanism::new(vec![WakeRetry, Wake, Connect]);
|
||||
let user_info = helper_create_connect_info(&mechanism);
|
||||
connect_to_compute(&mut ctx, &mechanism, &user_info, false)
|
||||
let config = RetryConfig {
|
||||
base_delay: Duration::from_secs(1),
|
||||
max_retries: 5,
|
||||
backoff_factor: 2.0,
|
||||
};
|
||||
connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config)
|
||||
.await
|
||||
.unwrap();
|
||||
mechanism.verify();
|
||||
@@ -636,7 +680,12 @@ async fn wake_non_retry() {
|
||||
let mut ctx = RequestMonitoring::test();
|
||||
let mechanism = TestConnectMechanism::new(vec![WakeRetry, WakeFail]);
|
||||
let user_info = helper_create_connect_info(&mechanism);
|
||||
connect_to_compute(&mut ctx, &mechanism, &user_info, false)
|
||||
let config = RetryConfig {
|
||||
base_delay: Duration::from_secs(1),
|
||||
max_retries: 5,
|
||||
backoff_factor: 2.0,
|
||||
};
|
||||
connect_to_compute(&mut ctx, &mechanism, &user_info, false, config, config)
|
||||
.await
|
||||
.unwrap_err();
|
||||
mechanism.verify();
|
||||
|
||||
@@ -34,7 +34,10 @@ async fn proxy_mitm(
|
||||
tokio::spawn(async move {
|
||||
// begin handshake with end_server
|
||||
let end_server = connect_tls(server2, client_config2.make_tls_connect().unwrap()).await;
|
||||
let (end_client, startup) = match handshake(client1, Some(&server_config1)).await.unwrap() {
|
||||
let (end_client, startup) = match handshake(client1, Some(&server_config1), false)
|
||||
.await
|
||||
.unwrap()
|
||||
{
|
||||
HandshakeData::Startup(stream, params) => (stream, params),
|
||||
HandshakeData::Cancel(_) => panic!("cancellation not supported"),
|
||||
};
|
||||
|
||||
@@ -1,10 +1,14 @@
|
||||
use crate::config::RetryConfig;
|
||||
use crate::console::{errors::WakeComputeError, provider::CachedNodeInfo};
|
||||
use crate::context::RequestMonitoring;
|
||||
use crate::metrics::{ConnectionFailuresBreakdownGroup, Metrics, WakeupFailureKind};
|
||||
use crate::metrics::{
|
||||
ConnectOutcome, ConnectionFailuresBreakdownGroup, Metrics, RetriesMetricGroup, RetryType,
|
||||
WakeupFailureKind,
|
||||
};
|
||||
use crate::proxy::retry::retry_after;
|
||||
use hyper::StatusCode;
|
||||
use std::ops::ControlFlow;
|
||||
use tracing::{error, warn};
|
||||
use tracing::{error, info, warn};
|
||||
|
||||
use super::connect_compute::ComputeConnectBackend;
|
||||
use super::retry::ShouldRetry;
|
||||
@@ -13,23 +17,42 @@ pub async fn wake_compute<B: ComputeConnectBackend>(
|
||||
num_retries: &mut u32,
|
||||
ctx: &mut RequestMonitoring,
|
||||
api: &B,
|
||||
config: RetryConfig,
|
||||
) -> Result<CachedNodeInfo, WakeComputeError> {
|
||||
let retry_type = RetryType::WakeCompute;
|
||||
loop {
|
||||
let wake_res = api.wake_compute(ctx).await;
|
||||
match handle_try_wake(wake_res, *num_retries) {
|
||||
match handle_try_wake(wake_res, *num_retries, config) {
|
||||
Err(e) => {
|
||||
error!(error = ?e, num_retries, retriable = false, "couldn't wake compute node");
|
||||
report_error(&e, false);
|
||||
Metrics::get().proxy.retries_metric.observe(
|
||||
RetriesMetricGroup {
|
||||
outcome: ConnectOutcome::Failed,
|
||||
retry_type,
|
||||
},
|
||||
(*num_retries).into(),
|
||||
);
|
||||
return Err(e);
|
||||
}
|
||||
Ok(ControlFlow::Continue(e)) => {
|
||||
warn!(error = ?e, num_retries, retriable = true, "couldn't wake compute node");
|
||||
report_error(&e, true);
|
||||
}
|
||||
Ok(ControlFlow::Break(n)) => return Ok(n),
|
||||
Ok(ControlFlow::Break(n)) => {
|
||||
Metrics::get().proxy.retries_metric.observe(
|
||||
RetriesMetricGroup {
|
||||
outcome: ConnectOutcome::Success,
|
||||
retry_type,
|
||||
},
|
||||
(*num_retries).into(),
|
||||
);
|
||||
info!(?num_retries, "compute node woken up after");
|
||||
return Ok(n);
|
||||
}
|
||||
}
|
||||
|
||||
let wait_duration = retry_after(*num_retries);
|
||||
let wait_duration = retry_after(*num_retries, config);
|
||||
*num_retries += 1;
|
||||
tokio::time::sleep(wait_duration).await;
|
||||
}
|
||||
@@ -42,10 +65,11 @@ pub async fn wake_compute<B: ComputeConnectBackend>(
|
||||
pub fn handle_try_wake(
|
||||
result: Result<CachedNodeInfo, WakeComputeError>,
|
||||
num_retries: u32,
|
||||
config: RetryConfig,
|
||||
) -> Result<ControlFlow<CachedNodeInfo, WakeComputeError>, WakeComputeError> {
|
||||
match result {
|
||||
Err(err) => match &err {
|
||||
WakeComputeError::ApiError(api) if api.should_retry(num_retries) => {
|
||||
WakeComputeError::ApiError(api) if api.should_retry(num_retries, config) => {
|
||||
Ok(ControlFlow::Continue(err))
|
||||
}
|
||||
_ => Err(err),
|
||||
@@ -90,6 +114,7 @@ fn report_error(e: &WakeComputeError, retry: bool) {
|
||||
WakeComputeError::ApiError(ApiError::Console { .. }) => {
|
||||
WakeupFailureKind::ApiConsoleOtherError
|
||||
}
|
||||
WakeComputeError::TooManyConnections => WakeupFailureKind::ApiConsoleLocked,
|
||||
WakeComputeError::TimeoutError => WakeupFailureKind::TimeoutError,
|
||||
};
|
||||
Metrics::get()
|
||||
|
||||
@@ -1,7 +1,2 @@
|
||||
mod aimd;
|
||||
mod limit_algorithm;
|
||||
mod limiter;
|
||||
pub use aimd::Aimd;
|
||||
pub use limit_algorithm::{AimdConfig, Fixed, RateLimitAlgorithm, RateLimiterConfig};
|
||||
pub use limiter::Limiter;
|
||||
pub use limiter::{AuthRateLimiter, EndpointRateLimiter, GlobalRateLimiter, RateBucketInfo};
|
||||
pub use limiter::{BucketRateLimiter, EndpointRateLimiter, GlobalRateLimiter, RateBucketInfo};
|
||||
|
||||
@@ -1,166 +0,0 @@
|
||||
use std::usize;
|
||||
|
||||
use async_trait::async_trait;
|
||||
|
||||
use super::limit_algorithm::{AimdConfig, LimitAlgorithm, Sample};
|
||||
|
||||
use super::limiter::Outcome;
|
||||
|
||||
/// Loss-based congestion avoidance.
|
||||
///
|
||||
/// Additive-increase, multiplicative decrease.
|
||||
///
|
||||
/// Adds available concurrency when:
|
||||
/// 1. no load-based errors are observed, and
|
||||
/// 2. the utilisation of the current limit is high.
|
||||
///
|
||||
/// Reduces available concurrency by a factor when load-based errors are detected.
|
||||
pub struct Aimd {
|
||||
min_limit: usize,
|
||||
max_limit: usize,
|
||||
decrease_factor: f32,
|
||||
increase_by: usize,
|
||||
min_utilisation_threshold: f32,
|
||||
}
|
||||
|
||||
impl Aimd {
|
||||
pub fn new(config: AimdConfig) -> Self {
|
||||
Self {
|
||||
min_limit: config.aimd_min_limit,
|
||||
max_limit: config.aimd_max_limit,
|
||||
decrease_factor: config.aimd_decrease_factor,
|
||||
increase_by: config.aimd_increase_by,
|
||||
min_utilisation_threshold: config.aimd_min_utilisation_threshold,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl LimitAlgorithm for Aimd {
|
||||
async fn update(&mut self, old_limit: usize, sample: Sample) -> usize {
|
||||
use Outcome::*;
|
||||
match sample.outcome {
|
||||
Success => {
|
||||
let utilisation = sample.in_flight as f32 / old_limit as f32;
|
||||
|
||||
if utilisation > self.min_utilisation_threshold {
|
||||
let limit = old_limit + self.increase_by;
|
||||
limit.clamp(self.min_limit, self.max_limit)
|
||||
} else {
|
||||
old_limit
|
||||
}
|
||||
}
|
||||
Overload => {
|
||||
let limit = old_limit as f32 * self.decrease_factor;
|
||||
|
||||
// Floor instead of round, so the limit reduces even with small numbers.
|
||||
// E.g. round(2 * 0.9) = 2, but floor(2 * 0.9) = 1
|
||||
let limit = limit.floor() as usize;
|
||||
|
||||
limit.clamp(self.min_limit, self.max_limit)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::sync::Arc;
|
||||
|
||||
use tokio::sync::Notify;
|
||||
|
||||
use super::*;
|
||||
|
||||
use crate::rate_limiter::{Limiter, RateLimiterConfig};
|
||||
|
||||
#[tokio::test]
|
||||
async fn should_decrease_limit_on_overload() {
|
||||
let config = RateLimiterConfig {
|
||||
initial_limit: 10,
|
||||
aimd_config: Some(AimdConfig {
|
||||
aimd_decrease_factor: 0.5,
|
||||
..Default::default()
|
||||
}),
|
||||
disable: false,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let release_notifier = Arc::new(Notify::new());
|
||||
|
||||
let limiter = Limiter::new(config).with_release_notifier(release_notifier.clone());
|
||||
|
||||
let token = limiter.try_acquire().unwrap();
|
||||
limiter.release(token, Some(Outcome::Overload)).await;
|
||||
release_notifier.notified().await;
|
||||
assert_eq!(limiter.state().limit(), 5, "overload: decrease");
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn should_increase_limit_on_success_when_using_gt_util_threshold() {
|
||||
let config = RateLimiterConfig {
|
||||
initial_limit: 4,
|
||||
aimd_config: Some(AimdConfig {
|
||||
aimd_decrease_factor: 0.5,
|
||||
aimd_min_utilisation_threshold: 0.5,
|
||||
aimd_increase_by: 1,
|
||||
..Default::default()
|
||||
}),
|
||||
disable: false,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let limiter = Limiter::new(config);
|
||||
|
||||
let token = limiter.try_acquire().unwrap();
|
||||
let _token = limiter.try_acquire().unwrap();
|
||||
let _token = limiter.try_acquire().unwrap();
|
||||
|
||||
limiter.release(token, Some(Outcome::Success)).await;
|
||||
assert_eq!(limiter.state().limit(), 5, "success: increase");
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn should_not_change_limit_on_success_when_using_lt_util_threshold() {
|
||||
let config = RateLimiterConfig {
|
||||
initial_limit: 4,
|
||||
aimd_config: Some(AimdConfig {
|
||||
aimd_decrease_factor: 0.5,
|
||||
aimd_min_utilisation_threshold: 0.5,
|
||||
..Default::default()
|
||||
}),
|
||||
disable: false,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let limiter = Limiter::new(config);
|
||||
|
||||
let token = limiter.try_acquire().unwrap();
|
||||
|
||||
limiter.release(token, Some(Outcome::Success)).await;
|
||||
assert_eq!(
|
||||
limiter.state().limit(),
|
||||
4,
|
||||
"success: ignore when < half limit"
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn should_not_change_limit_when_no_outcome() {
|
||||
let config = RateLimiterConfig {
|
||||
initial_limit: 10,
|
||||
aimd_config: Some(AimdConfig {
|
||||
aimd_decrease_factor: 0.5,
|
||||
aimd_min_utilisation_threshold: 0.5,
|
||||
..Default::default()
|
||||
}),
|
||||
disable: false,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let limiter = Limiter::new(config);
|
||||
|
||||
let token = limiter.try_acquire().unwrap();
|
||||
limiter.release(token, None).await;
|
||||
assert_eq!(limiter.state().limit(), 10, "ignore");
|
||||
}
|
||||
}
|
||||
@@ -1,98 +0,0 @@
|
||||
//! Algorithms for controlling concurrency limits.
|
||||
use async_trait::async_trait;
|
||||
use std::time::Duration;
|
||||
|
||||
use super::{limiter::Outcome, Aimd};
|
||||
|
||||
/// An algorithm for controlling a concurrency limit.
|
||||
#[async_trait]
|
||||
pub trait LimitAlgorithm: Send + Sync + 'static {
|
||||
/// Update the concurrency limit in response to a new job completion.
|
||||
async fn update(&mut self, old_limit: usize, sample: Sample) -> usize;
|
||||
}
|
||||
|
||||
/// The result of a job (or jobs), including the [Outcome] (loss) and latency (delay).
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub struct Sample {
|
||||
pub(crate) latency: Duration,
|
||||
/// Jobs in flight when the sample was taken.
|
||||
pub(crate) in_flight: usize,
|
||||
pub(crate) outcome: Outcome,
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy, Debug, Default, clap::ValueEnum)]
|
||||
pub enum RateLimitAlgorithm {
|
||||
Fixed,
|
||||
#[default]
|
||||
Aimd,
|
||||
}
|
||||
|
||||
pub struct Fixed;
|
||||
|
||||
#[async_trait]
|
||||
impl LimitAlgorithm for Fixed {
|
||||
async fn update(&mut self, old_limit: usize, _sample: Sample) -> usize {
|
||||
old_limit
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy, Debug)]
|
||||
pub struct RateLimiterConfig {
|
||||
pub disable: bool,
|
||||
pub algorithm: RateLimitAlgorithm,
|
||||
pub timeout: Duration,
|
||||
pub initial_limit: usize,
|
||||
pub aimd_config: Option<AimdConfig>,
|
||||
}
|
||||
|
||||
impl RateLimiterConfig {
|
||||
pub fn create_rate_limit_algorithm(self) -> Box<dyn LimitAlgorithm> {
|
||||
match self.algorithm {
|
||||
RateLimitAlgorithm::Fixed => Box::new(Fixed),
|
||||
RateLimitAlgorithm::Aimd => Box::new(Aimd::new(self.aimd_config.unwrap())), // For aimd algorithm config is mandatory.
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for RateLimiterConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
disable: true,
|
||||
algorithm: RateLimitAlgorithm::Aimd,
|
||||
timeout: Duration::from_secs(1),
|
||||
initial_limit: 100,
|
||||
aimd_config: Some(AimdConfig::default()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(clap::Parser, Clone, Copy, Debug)]
|
||||
pub struct AimdConfig {
|
||||
/// Minimum limit for AIMD algorithm. Makes sense only if `rate_limit_algorithm` is `Aimd`.
|
||||
#[clap(long, default_value_t = 1)]
|
||||
pub aimd_min_limit: usize,
|
||||
/// Maximum limit for AIMD algorithm. Makes sense only if `rate_limit_algorithm` is `Aimd`.
|
||||
#[clap(long, default_value_t = 1500)]
|
||||
pub aimd_max_limit: usize,
|
||||
/// Increase AIMD increase by value in case of success. Makes sense only if `rate_limit_algorithm` is `Aimd`.
|
||||
#[clap(long, default_value_t = 10)]
|
||||
pub aimd_increase_by: usize,
|
||||
/// Decrease AIMD decrease by value in case of timout/429. Makes sense only if `rate_limit_algorithm` is `Aimd`.
|
||||
#[clap(long, default_value_t = 0.9)]
|
||||
pub aimd_decrease_factor: f32,
|
||||
/// A threshold below which the limit won't be increased. Makes sense only if `rate_limit_algorithm` is `Aimd`.
|
||||
#[clap(long, default_value_t = 0.8)]
|
||||
pub aimd_min_utilisation_threshold: f32,
|
||||
}
|
||||
|
||||
impl Default for AimdConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
aimd_min_limit: 1,
|
||||
aimd_max_limit: 1500,
|
||||
aimd_increase_by: 10,
|
||||
aimd_decrease_factor: 0.9,
|
||||
aimd_min_utilisation_threshold: 0.8,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -2,10 +2,9 @@ use std::{
|
||||
borrow::Cow,
|
||||
collections::hash_map::RandomState,
|
||||
hash::{BuildHasher, Hash},
|
||||
net::IpAddr,
|
||||
sync::{
|
||||
atomic::{AtomicUsize, Ordering},
|
||||
Arc, Mutex,
|
||||
Mutex,
|
||||
},
|
||||
};
|
||||
|
||||
@@ -13,22 +12,10 @@ use anyhow::bail;
|
||||
use dashmap::DashMap;
|
||||
use itertools::Itertools;
|
||||
use rand::{rngs::StdRng, Rng, SeedableRng};
|
||||
use tokio::sync::{Mutex as AsyncMutex, Semaphore, SemaphorePermit};
|
||||
use tokio::time::{timeout, Duration, Instant};
|
||||
use tokio::time::{Duration, Instant};
|
||||
use tracing::info;
|
||||
|
||||
use crate::{
|
||||
intern::EndpointIdInt,
|
||||
{
|
||||
metrics::{Metrics, RateLimit},
|
||||
EndpointId,
|
||||
},
|
||||
};
|
||||
|
||||
use super::{
|
||||
limit_algorithm::{LimitAlgorithm, Sample},
|
||||
RateLimiterConfig,
|
||||
};
|
||||
use crate::intern::EndpointIdInt;
|
||||
|
||||
pub struct GlobalRateLimiter {
|
||||
data: Vec<RateBucket>,
|
||||
@@ -74,15 +61,7 @@ impl GlobalRateLimiter {
|
||||
// Purposefully ignore user name and database name as clients can reconnect
|
||||
// with different names, so we'll end up sending some http requests to
|
||||
// the control plane.
|
||||
//
|
||||
// We also may save quite a lot of CPU (I think) by bailing out right after we
|
||||
// saw SNI, before doing TLS handshake. User-side error messages in that case
|
||||
// do not look very nice (`SSL SYSCALL error: Undefined error: 0`), so for now
|
||||
// I went with a more expensive way that yields user-friendlier error messages.
|
||||
pub type EndpointRateLimiter = BucketRateLimiter<EndpointId, StdRng, RandomState>;
|
||||
|
||||
// This can't be just per IP because that would limit some PaaS that share IP addresses
|
||||
pub type AuthRateLimiter = BucketRateLimiter<(EndpointIdInt, IpAddr), StdRng, RandomState>;
|
||||
pub type EndpointRateLimiter = BucketRateLimiter<EndpointIdInt, StdRng, RandomState>;
|
||||
|
||||
pub struct BucketRateLimiter<Key, Rand = StdRng, Hasher = RandomState> {
|
||||
map: DashMap<Key, Vec<RateBucket>, Hasher>,
|
||||
@@ -155,19 +134,6 @@ impl RateBucketInfo {
|
||||
Self::new(100, Duration::from_secs(600)),
|
||||
];
|
||||
|
||||
/// All of these are per endpoint-ip pair.
|
||||
/// Context: 4096 rounds of pbkdf2 take about 1ms of cpu time to execute (1 milli-cpu-second or 1mcpus).
|
||||
///
|
||||
/// First bucket: 300mcpus total per endpoint-ip pair
|
||||
/// * 1228800 requests per second with 1 hash rounds. (endpoint rate limiter will catch this first)
|
||||
/// * 300 requests per second with 4096 hash rounds.
|
||||
/// * 2 requests per second with 600000 hash rounds.
|
||||
pub const DEFAULT_AUTH_SET: [Self; 3] = [
|
||||
Self::new(300 * 4096, Duration::from_secs(1)),
|
||||
Self::new(200 * 4096, Duration::from_secs(60)),
|
||||
Self::new(100 * 4096, Duration::from_secs(600)),
|
||||
];
|
||||
|
||||
pub fn validate(info: &mut [Self]) -> anyhow::Result<()> {
|
||||
info.sort_unstable_by_key(|info| info.interval);
|
||||
let invalid = info
|
||||
@@ -265,423 +231,16 @@ impl<K: Hash + Eq, R: Rng, S: BuildHasher + Clone> BucketRateLimiter<K, R, S> {
|
||||
}
|
||||
}
|
||||
|
||||
/// Limits the number of concurrent jobs.
|
||||
///
|
||||
/// Concurrency is limited through the use of [Token]s. Acquire a token to run a job, and release the
|
||||
/// token once the job is finished.
|
||||
///
|
||||
/// The limit will be automatically adjusted based on observed latency (delay) and/or failures
|
||||
/// caused by overload (loss).
|
||||
pub struct Limiter {
|
||||
limit_algo: AsyncMutex<Box<dyn LimitAlgorithm>>,
|
||||
semaphore: std::sync::Arc<Semaphore>,
|
||||
config: RateLimiterConfig,
|
||||
|
||||
// ONLY WRITE WHEN LIMIT_ALGO IS LOCKED
|
||||
limits: AtomicUsize,
|
||||
|
||||
// ONLY USE ATOMIC ADD/SUB
|
||||
in_flight: Arc<AtomicUsize>,
|
||||
|
||||
#[cfg(test)]
|
||||
notifier: Option<std::sync::Arc<tokio::sync::Notify>>,
|
||||
}
|
||||
|
||||
/// A concurrency token, required to run a job.
|
||||
///
|
||||
/// Release the token back to the [Limiter] after the job is complete.
|
||||
#[derive(Debug)]
|
||||
pub struct Token<'t> {
|
||||
permit: Option<tokio::sync::SemaphorePermit<'t>>,
|
||||
start: Instant,
|
||||
in_flight: Arc<AtomicUsize>,
|
||||
}
|
||||
|
||||
/// A snapshot of the state of the [Limiter].
|
||||
///
|
||||
/// Not guaranteed to be consistent under high concurrency.
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub struct LimiterState {
|
||||
limit: usize,
|
||||
in_flight: usize,
|
||||
}
|
||||
|
||||
/// Whether a job succeeded or failed as a result of congestion/overload.
|
||||
///
|
||||
/// Errors not considered to be caused by overload should be ignored.
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||
pub enum Outcome {
|
||||
/// The job succeeded, or failed in a way unrelated to overload.
|
||||
Success,
|
||||
/// The job failed because of overload, e.g. it timed out or an explicit backpressure signal
|
||||
/// was observed.
|
||||
Overload,
|
||||
}
|
||||
|
||||
impl Outcome {
|
||||
fn from_reqwest_error(error: &reqwest_middleware::Error) -> Self {
|
||||
match error {
|
||||
reqwest_middleware::Error::Middleware(_) => Outcome::Success,
|
||||
reqwest_middleware::Error::Reqwest(e) => {
|
||||
if let Some(status) = e.status() {
|
||||
if status.is_server_error()
|
||||
|| reqwest::StatusCode::TOO_MANY_REQUESTS.as_u16() == status
|
||||
{
|
||||
Outcome::Overload
|
||||
} else {
|
||||
Outcome::Success
|
||||
}
|
||||
} else {
|
||||
Outcome::Success
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
fn from_reqwest_response(response: &reqwest::Response) -> Self {
|
||||
if response.status().is_server_error()
|
||||
|| response.status() == reqwest::StatusCode::TOO_MANY_REQUESTS
|
||||
{
|
||||
Outcome::Overload
|
||||
} else {
|
||||
Outcome::Success
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Limiter {
|
||||
/// Create a limiter with a given limit control algorithm.
|
||||
pub fn new(config: RateLimiterConfig) -> Self {
|
||||
assert!(config.initial_limit > 0);
|
||||
Self {
|
||||
limit_algo: AsyncMutex::new(config.create_rate_limit_algorithm()),
|
||||
semaphore: Arc::new(Semaphore::new(config.initial_limit)),
|
||||
config,
|
||||
limits: AtomicUsize::new(config.initial_limit),
|
||||
in_flight: Arc::new(AtomicUsize::new(0)),
|
||||
#[cfg(test)]
|
||||
notifier: None,
|
||||
}
|
||||
}
|
||||
// pub fn new(limit_algorithm: T, timeout: Duration, initial_limit: usize) -> Self {
|
||||
// assert!(initial_limit > 0);
|
||||
|
||||
// Self {
|
||||
// limit_algo: AsyncMutex::new(limit_algorithm),
|
||||
// semaphore: Arc::new(Semaphore::new(initial_limit)),
|
||||
// timeout,
|
||||
// limits: AtomicUsize::new(initial_limit),
|
||||
// in_flight: Arc::new(AtomicUsize::new(0)),
|
||||
// #[cfg(test)]
|
||||
// notifier: None,
|
||||
// }
|
||||
// }
|
||||
|
||||
/// In some cases [Token]s are acquired asynchronously when updating the limit.
|
||||
#[cfg(test)]
|
||||
pub fn with_release_notifier(mut self, n: std::sync::Arc<tokio::sync::Notify>) -> Self {
|
||||
self.notifier = Some(n);
|
||||
self
|
||||
}
|
||||
|
||||
/// Try to immediately acquire a concurrency [Token].
|
||||
///
|
||||
/// Returns `None` if there are none available.
|
||||
pub fn try_acquire(&self) -> Option<Token> {
|
||||
let result = if self.config.disable {
|
||||
// If the rate limiter is disabled, we can always acquire a token.
|
||||
Some(Token::new(None, self.in_flight.clone()))
|
||||
} else {
|
||||
self.semaphore
|
||||
.try_acquire()
|
||||
.map(|permit| Token::new(Some(permit), self.in_flight.clone()))
|
||||
.ok()
|
||||
};
|
||||
if result.is_some() {
|
||||
self.in_flight.fetch_add(1, Ordering::AcqRel);
|
||||
}
|
||||
result
|
||||
}
|
||||
|
||||
/// Try to acquire a concurrency [Token], waiting for `duration` if there are none available.
|
||||
///
|
||||
/// Returns `None` if there are none available after `duration`.
|
||||
pub async fn acquire_timeout(&self, duration: Duration) -> Option<Token<'_>> {
|
||||
info!("acquiring token: {:?}", self.semaphore.available_permits());
|
||||
let result = if self.config.disable {
|
||||
// If the rate limiter is disabled, we can always acquire a token.
|
||||
Some(Token::new(None, self.in_flight.clone()))
|
||||
} else {
|
||||
match timeout(duration, self.semaphore.acquire()).await {
|
||||
Ok(maybe_permit) => maybe_permit
|
||||
.map(|permit| Token::new(Some(permit), self.in_flight.clone()))
|
||||
.ok(),
|
||||
Err(_) => None,
|
||||
}
|
||||
};
|
||||
if result.is_some() {
|
||||
self.in_flight.fetch_add(1, Ordering::AcqRel);
|
||||
}
|
||||
result
|
||||
}
|
||||
|
||||
/// Return the concurrency [Token], along with the outcome of the job.
|
||||
///
|
||||
/// The [Outcome] of the job, and the time taken to perform it, may be used
|
||||
/// to update the concurrency limit.
|
||||
///
|
||||
/// Set the outcome to `None` to ignore the job.
|
||||
pub async fn release(&self, mut token: Token<'_>, outcome: Option<Outcome>) {
|
||||
tracing::info!("outcome is {:?}", outcome);
|
||||
let in_flight = self.in_flight.load(Ordering::Acquire);
|
||||
let old_limit = self.limits.load(Ordering::Acquire);
|
||||
let available = if self.config.disable {
|
||||
0 // This is not used in the algorithm and can be anything. If the config disable it makes sense to set it to 0.
|
||||
} else {
|
||||
self.semaphore.available_permits()
|
||||
};
|
||||
let total = in_flight + available;
|
||||
|
||||
let mut algo = self.limit_algo.lock().await;
|
||||
|
||||
let new_limit = if let Some(outcome) = outcome {
|
||||
let sample = Sample {
|
||||
latency: token.start.elapsed(),
|
||||
in_flight,
|
||||
outcome,
|
||||
};
|
||||
algo.update(old_limit, sample).await
|
||||
} else {
|
||||
old_limit
|
||||
};
|
||||
tracing::info!("new limit is {}", new_limit);
|
||||
let actual_limit = if new_limit < total {
|
||||
token.forget();
|
||||
total.saturating_sub(1)
|
||||
} else {
|
||||
if !self.config.disable {
|
||||
self.semaphore.add_permits(new_limit.saturating_sub(total));
|
||||
}
|
||||
new_limit
|
||||
};
|
||||
let metric = &Metrics::get().semaphore_control_plane_limit;
|
||||
metric.set(RateLimit::Expected, new_limit as i64);
|
||||
metric.set(RateLimit::Actual, actual_limit as i64);
|
||||
self.limits.store(new_limit, Ordering::Release);
|
||||
#[cfg(test)]
|
||||
if let Some(n) = &self.notifier {
|
||||
n.notify_one();
|
||||
}
|
||||
}
|
||||
|
||||
/// The current state of the limiter.
|
||||
pub fn state(&self) -> LimiterState {
|
||||
let limit = self.limits.load(Ordering::Relaxed);
|
||||
let in_flight = self.in_flight.load(Ordering::Relaxed);
|
||||
LimiterState { limit, in_flight }
|
||||
}
|
||||
}
|
||||
|
||||
impl<'t> Token<'t> {
|
||||
fn new(permit: Option<SemaphorePermit<'t>>, in_flight: Arc<AtomicUsize>) -> Self {
|
||||
Self {
|
||||
permit,
|
||||
start: Instant::now(),
|
||||
in_flight,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn forget(&mut self) {
|
||||
if let Some(permit) = self.permit.take() {
|
||||
permit.forget();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for Token<'_> {
|
||||
fn drop(&mut self) {
|
||||
self.in_flight.fetch_sub(1, Ordering::AcqRel);
|
||||
}
|
||||
}
|
||||
|
||||
impl LimiterState {
|
||||
/// The current concurrency limit.
|
||||
pub fn limit(&self) -> usize {
|
||||
self.limit
|
||||
}
|
||||
/// The number of jobs in flight.
|
||||
pub fn in_flight(&self) -> usize {
|
||||
self.in_flight
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl reqwest_middleware::Middleware for Limiter {
|
||||
async fn handle(
|
||||
&self,
|
||||
req: reqwest::Request,
|
||||
extensions: &mut task_local_extensions::Extensions,
|
||||
next: reqwest_middleware::Next<'_>,
|
||||
) -> reqwest_middleware::Result<reqwest::Response> {
|
||||
let timer = Metrics::get()
|
||||
.proxy
|
||||
.control_plane_token_acquire_seconds
|
||||
.start_timer();
|
||||
let token = self
|
||||
.acquire_timeout(self.config.timeout)
|
||||
.await
|
||||
.ok_or_else(|| {
|
||||
reqwest_middleware::Error::Middleware(
|
||||
// TODO: Should we map it into user facing errors?
|
||||
crate::console::errors::ApiError::Console {
|
||||
status: crate::http::StatusCode::TOO_MANY_REQUESTS,
|
||||
text: "Too many requests".into(),
|
||||
}
|
||||
.into(),
|
||||
)
|
||||
})?;
|
||||
let duration = timer.observe();
|
||||
info!(
|
||||
?duration,
|
||||
"waiting for token to connect to the control plane"
|
||||
);
|
||||
|
||||
match next.run(req, extensions).await {
|
||||
Ok(response) => {
|
||||
self.release(token, Some(Outcome::from_reqwest_response(&response)))
|
||||
.await;
|
||||
Ok(response)
|
||||
}
|
||||
Err(e) => {
|
||||
self.release(token, Some(Outcome::from_reqwest_error(&e)))
|
||||
.await;
|
||||
Err(e)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::{hash::BuildHasherDefault, pin::pin, task::Context, time::Duration};
|
||||
use std::{hash::BuildHasherDefault, time::Duration};
|
||||
|
||||
use futures::{task::noop_waker_ref, Future};
|
||||
use rand::SeedableRng;
|
||||
use rustc_hash::FxHasher;
|
||||
use tokio::time;
|
||||
|
||||
use super::{BucketRateLimiter, EndpointRateLimiter, Limiter, Outcome};
|
||||
use crate::{
|
||||
rate_limiter::{RateBucketInfo, RateLimitAlgorithm},
|
||||
EndpointId,
|
||||
};
|
||||
|
||||
#[tokio::test]
|
||||
async fn it_works() {
|
||||
let config = super::RateLimiterConfig {
|
||||
algorithm: RateLimitAlgorithm::Fixed,
|
||||
timeout: Duration::from_secs(1),
|
||||
initial_limit: 10,
|
||||
disable: false,
|
||||
..Default::default()
|
||||
};
|
||||
let limiter = Limiter::new(config);
|
||||
|
||||
let token = limiter.try_acquire().unwrap();
|
||||
|
||||
limiter.release(token, Some(Outcome::Success)).await;
|
||||
|
||||
assert_eq!(limiter.state().limit(), 10);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn is_fair() {
|
||||
let config = super::RateLimiterConfig {
|
||||
algorithm: RateLimitAlgorithm::Fixed,
|
||||
timeout: Duration::from_secs(1),
|
||||
initial_limit: 1,
|
||||
disable: false,
|
||||
..Default::default()
|
||||
};
|
||||
let limiter = Limiter::new(config);
|
||||
|
||||
// === TOKEN 1 ===
|
||||
let token1 = limiter.try_acquire().unwrap();
|
||||
|
||||
let mut token2_fut = pin!(limiter.acquire_timeout(Duration::from_secs(1)));
|
||||
assert!(
|
||||
token2_fut
|
||||
.as_mut()
|
||||
.poll(&mut Context::from_waker(noop_waker_ref()))
|
||||
.is_pending(),
|
||||
"token is acquired by token1"
|
||||
);
|
||||
|
||||
let mut token3_fut = pin!(limiter.acquire_timeout(Duration::from_secs(1)));
|
||||
assert!(
|
||||
token3_fut
|
||||
.as_mut()
|
||||
.poll(&mut Context::from_waker(noop_waker_ref()))
|
||||
.is_pending(),
|
||||
"token is acquired by token1"
|
||||
);
|
||||
|
||||
limiter.release(token1, Some(Outcome::Success)).await;
|
||||
// === END TOKEN 1 ===
|
||||
|
||||
// === TOKEN 2 ===
|
||||
assert!(
|
||||
limiter.try_acquire().is_none(),
|
||||
"token is acquired by token2"
|
||||
);
|
||||
|
||||
assert!(
|
||||
token3_fut
|
||||
.as_mut()
|
||||
.poll(&mut Context::from_waker(noop_waker_ref()))
|
||||
.is_pending(),
|
||||
"token is acquired by token2"
|
||||
);
|
||||
|
||||
let token2 = token2_fut.await.unwrap();
|
||||
|
||||
limiter.release(token2, Some(Outcome::Success)).await;
|
||||
// === END TOKEN 2 ===
|
||||
|
||||
// === TOKEN 3 ===
|
||||
assert!(
|
||||
limiter.try_acquire().is_none(),
|
||||
"token is acquired by token3"
|
||||
);
|
||||
|
||||
let token3 = token3_fut.await.unwrap();
|
||||
limiter.release(token3, Some(Outcome::Success)).await;
|
||||
// === END TOKEN 3 ===
|
||||
|
||||
// === TOKEN 4 ===
|
||||
let token4 = limiter.try_acquire().unwrap();
|
||||
limiter.release(token4, Some(Outcome::Success)).await;
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn disable() {
|
||||
let config = super::RateLimiterConfig {
|
||||
algorithm: RateLimitAlgorithm::Fixed,
|
||||
timeout: Duration::from_secs(1),
|
||||
initial_limit: 1,
|
||||
disable: true,
|
||||
..Default::default()
|
||||
};
|
||||
let limiter = Limiter::new(config);
|
||||
|
||||
// === TOKEN 1 ===
|
||||
let token1 = limiter.try_acquire().unwrap();
|
||||
let token2 = limiter.try_acquire().unwrap();
|
||||
let state = limiter.state();
|
||||
assert_eq!(state.limit(), 1);
|
||||
assert_eq!(state.in_flight(), 2); // For disabled limiter, it's expected.
|
||||
limiter.release(token1, None).await;
|
||||
limiter.release(token2, None).await;
|
||||
}
|
||||
use super::{BucketRateLimiter, EndpointRateLimiter};
|
||||
use crate::{intern::EndpointIdInt, rate_limiter::RateBucketInfo, EndpointId};
|
||||
|
||||
#[test]
|
||||
fn rate_bucket_rpi() {
|
||||
@@ -731,39 +290,40 @@ mod tests {
|
||||
let limiter = EndpointRateLimiter::new(rates);
|
||||
|
||||
let endpoint = EndpointId::from("ep-my-endpoint-1234");
|
||||
let endpoint = EndpointIdInt::from(endpoint);
|
||||
|
||||
time::pause();
|
||||
|
||||
for _ in 0..100 {
|
||||
assert!(limiter.check(endpoint.clone(), 1));
|
||||
assert!(limiter.check(endpoint, 1));
|
||||
}
|
||||
// more connections fail
|
||||
assert!(!limiter.check(endpoint.clone(), 1));
|
||||
assert!(!limiter.check(endpoint, 1));
|
||||
|
||||
// fail even after 500ms as it's in the same bucket
|
||||
time::advance(time::Duration::from_millis(500)).await;
|
||||
assert!(!limiter.check(endpoint.clone(), 1));
|
||||
assert!(!limiter.check(endpoint, 1));
|
||||
|
||||
// after a full 1s, 100 requests are allowed again
|
||||
time::advance(time::Duration::from_millis(500)).await;
|
||||
for _ in 1..6 {
|
||||
for _ in 0..50 {
|
||||
assert!(limiter.check(endpoint.clone(), 2));
|
||||
assert!(limiter.check(endpoint, 2));
|
||||
}
|
||||
time::advance(time::Duration::from_millis(1000)).await;
|
||||
}
|
||||
|
||||
// more connections after 600 will exceed the 20rps@30s limit
|
||||
assert!(!limiter.check(endpoint.clone(), 1));
|
||||
assert!(!limiter.check(endpoint, 1));
|
||||
|
||||
// will still fail before the 30 second limit
|
||||
time::advance(time::Duration::from_millis(30_000 - 6_000 - 1)).await;
|
||||
assert!(!limiter.check(endpoint.clone(), 1));
|
||||
assert!(!limiter.check(endpoint, 1));
|
||||
|
||||
// after the full 30 seconds, 100 requests are allowed again
|
||||
time::advance(time::Duration::from_millis(1)).await;
|
||||
for _ in 0..100 {
|
||||
assert!(limiter.check(endpoint.clone(), 1));
|
||||
assert!(limiter.check(endpoint, 1));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -783,31 +343,4 @@ mod tests {
|
||||
}
|
||||
assert!(limiter.map.len() < 150_000);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_default_auth_set() {
|
||||
// these values used to exceed u32::MAX
|
||||
assert_eq!(
|
||||
RateBucketInfo::DEFAULT_AUTH_SET,
|
||||
[
|
||||
RateBucketInfo {
|
||||
interval: Duration::from_secs(1),
|
||||
max_rpi: 300 * 4096,
|
||||
},
|
||||
RateBucketInfo {
|
||||
interval: Duration::from_secs(60),
|
||||
max_rpi: 200 * 4096 * 60,
|
||||
},
|
||||
RateBucketInfo {
|
||||
interval: Duration::from_secs(600),
|
||||
max_rpi: 100 * 4096 * 600,
|
||||
}
|
||||
]
|
||||
);
|
||||
|
||||
for x in RateBucketInfo::DEFAULT_AUTH_SET {
|
||||
let y = x.to_string().parse().unwrap();
|
||||
assert_eq!(x, y);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -77,10 +77,14 @@ impl ConnectionWithCredentialsProvider {
|
||||
}
|
||||
}
|
||||
|
||||
async fn ping(con: &mut MultiplexedConnection) -> RedisResult<()> {
|
||||
redis::cmd("PING").query_async(con).await
|
||||
}
|
||||
|
||||
pub async fn connect(&mut self) -> anyhow::Result<()> {
|
||||
let _guard = self.mutex.lock().await;
|
||||
if let Some(con) = self.con.as_mut() {
|
||||
match redis::cmd("PING").query_async(con).await {
|
||||
match Self::ping(con).await {
|
||||
Ok(()) => {
|
||||
return Ok(());
|
||||
}
|
||||
@@ -96,7 +100,7 @@ impl ConnectionWithCredentialsProvider {
|
||||
if let Some(f) = self.refresh_token_task.take() {
|
||||
f.abort()
|
||||
}
|
||||
let con = self
|
||||
let mut con = self
|
||||
.get_client()
|
||||
.await?
|
||||
.get_multiplexed_tokio_connection()
|
||||
@@ -109,6 +113,14 @@ impl ConnectionWithCredentialsProvider {
|
||||
});
|
||||
self.refresh_token_task = Some(f);
|
||||
}
|
||||
match Self::ping(&mut con).await {
|
||||
Ok(()) => {
|
||||
info!("Connection succesfully established");
|
||||
}
|
||||
Err(e) => {
|
||||
error!("Connection is broken. Error during PING: {e:?}");
|
||||
}
|
||||
}
|
||||
self.con = Some(con);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -35,7 +35,6 @@ use crate::context::RequestMonitoring;
|
||||
use crate::metrics::Metrics;
|
||||
use crate::protocol2::WithClientIp;
|
||||
use crate::proxy::run_until_cancelled;
|
||||
use crate::rate_limiter::EndpointRateLimiter;
|
||||
use crate::serverless::backend::PoolingBackend;
|
||||
use crate::serverless::http_util::{api_error_into_response, json_response};
|
||||
|
||||
@@ -53,7 +52,6 @@ pub async fn task_main(
|
||||
config: &'static ProxyConfig,
|
||||
ws_listener: TcpListener,
|
||||
cancellation_token: CancellationToken,
|
||||
endpoint_rate_limiter: Arc<EndpointRateLimiter>,
|
||||
cancellation_handler: Arc<CancellationHandlerMain>,
|
||||
) -> anyhow::Result<()> {
|
||||
scopeguard::defer! {
|
||||
@@ -117,7 +115,6 @@ pub async fn task_main(
|
||||
backend.clone(),
|
||||
connections.clone(),
|
||||
cancellation_handler.clone(),
|
||||
endpoint_rate_limiter.clone(),
|
||||
cancellation_token.clone(),
|
||||
server.clone(),
|
||||
tls_acceptor.clone(),
|
||||
@@ -147,7 +144,6 @@ async fn connection_handler(
|
||||
backend: Arc<PoolingBackend>,
|
||||
connections: TaskTracker,
|
||||
cancellation_handler: Arc<CancellationHandlerMain>,
|
||||
endpoint_rate_limiter: Arc<EndpointRateLimiter>,
|
||||
cancellation_token: CancellationToken,
|
||||
server: Builder<TokioExecutor>,
|
||||
tls_acceptor: TlsAcceptor,
|
||||
@@ -172,6 +168,10 @@ async fn connection_handler(
|
||||
};
|
||||
|
||||
let peer_addr = peer.unwrap_or(peer_addr).ip();
|
||||
let has_private_peer_addr = match peer_addr {
|
||||
IpAddr::V4(ip) => ip.is_private(),
|
||||
_ => false,
|
||||
};
|
||||
info!(?session_id, %peer_addr, "accepted new TCP connection");
|
||||
|
||||
// try upgrade to TLS, but with a timeout.
|
||||
@@ -182,13 +182,17 @@ async fn connection_handler(
|
||||
}
|
||||
// The handshake failed
|
||||
Ok(Err(e)) => {
|
||||
Metrics::get().proxy.tls_handshake_failures.inc();
|
||||
if !has_private_peer_addr {
|
||||
Metrics::get().proxy.tls_handshake_failures.inc();
|
||||
}
|
||||
warn!(?session_id, %peer_addr, "failed to accept TLS connection: {e:?}");
|
||||
return;
|
||||
}
|
||||
// The handshake timed out
|
||||
Err(e) => {
|
||||
Metrics::get().proxy.tls_handshake_failures.inc();
|
||||
if !has_private_peer_addr {
|
||||
Metrics::get().proxy.tls_handshake_failures.inc();
|
||||
}
|
||||
warn!(?session_id, %peer_addr, "failed to accept TLS connection: {e:?}");
|
||||
return;
|
||||
}
|
||||
@@ -223,7 +227,6 @@ async fn connection_handler(
|
||||
cancellation_handler.clone(),
|
||||
session_id,
|
||||
peer_addr,
|
||||
endpoint_rate_limiter.clone(),
|
||||
http_request_token,
|
||||
)
|
||||
.in_current_span()
|
||||
@@ -262,7 +265,6 @@ async fn request_handler(
|
||||
cancellation_handler: Arc<CancellationHandlerMain>,
|
||||
session_id: uuid::Uuid,
|
||||
peer_addr: IpAddr,
|
||||
endpoint_rate_limiter: Arc<EndpointRateLimiter>,
|
||||
// used to cancel in-flight HTTP requests. not used to cancel websockets
|
||||
http_cancellation_token: CancellationToken,
|
||||
) -> Result<Response<Full<Bytes>>, ApiError> {
|
||||
@@ -290,15 +292,9 @@ async fn request_handler(
|
||||
|
||||
ws_connections.spawn(
|
||||
async move {
|
||||
if let Err(e) = websocket::serve_websocket(
|
||||
config,
|
||||
ctx,
|
||||
websocket,
|
||||
cancellation_handler,
|
||||
host,
|
||||
endpoint_rate_limiter,
|
||||
)
|
||||
.await
|
||||
if let Err(e) =
|
||||
websocket::serve_websocket(config, ctx, websocket, cancellation_handler, host)
|
||||
.await
|
||||
{
|
||||
error!("error in websocket connection: {e:#}");
|
||||
}
|
||||
|
||||
@@ -6,7 +6,7 @@ use tracing::{field::display, info};
|
||||
use crate::{
|
||||
auth::{backend::ComputeCredentials, check_peer_addr_is_in_list, AuthError},
|
||||
compute,
|
||||
config::ProxyConfig,
|
||||
config::{AuthenticationConfig, ProxyConfig},
|
||||
console::{
|
||||
errors::{GetAuthInfoError, WakeComputeError},
|
||||
CachedNodeInfo,
|
||||
@@ -27,6 +27,7 @@ impl PoolingBackend {
|
||||
pub async fn authenticate(
|
||||
&self,
|
||||
ctx: &mut RequestMonitoring,
|
||||
config: &AuthenticationConfig,
|
||||
conn_info: &ConnInfo,
|
||||
) -> Result<ComputeCredentials, AuthError> {
|
||||
let user_info = conn_info.user_info.clone();
|
||||
@@ -43,6 +44,7 @@ impl PoolingBackend {
|
||||
let secret = match cached_secret.value.clone() {
|
||||
Some(secret) => self.config.authentication_config.check_rate_limit(
|
||||
ctx,
|
||||
config,
|
||||
secret,
|
||||
&user_info.endpoint,
|
||||
true,
|
||||
@@ -106,6 +108,8 @@ impl PoolingBackend {
|
||||
},
|
||||
&backend,
|
||||
false, // do not allow self signed compute for http flow
|
||||
self.config.wake_compute_retry_config,
|
||||
self.config.connect_to_compute_retry_config,
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
@@ -541,7 +541,9 @@ async fn handle_inner(
|
||||
.map_err(SqlOverHttpError::from);
|
||||
|
||||
let authenticate_and_connect = async {
|
||||
let keys = backend.authenticate(ctx, &conn_info).await?;
|
||||
let keys = backend
|
||||
.authenticate(ctx, &config.authentication_config, &conn_info)
|
||||
.await?;
|
||||
let client = backend
|
||||
.connect_to_compute(ctx, conn_info, keys, !allow_pool)
|
||||
.await?;
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user