mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-25 17:10:38 +00:00
Compare commits
20 Commits
skyzh/key-
...
problame/w
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
8b10701a0f | ||
|
|
b937432a04 | ||
|
|
a6d414d6e6 | ||
|
|
0f7f743f37 | ||
|
|
d5708e7435 | ||
|
|
fd49005cb3 | ||
|
|
3023de156e | ||
|
|
e49e931bc4 | ||
|
|
13b9135d4e | ||
|
|
41bb1e42b8 | ||
|
|
cb4b40f9c1 | ||
|
|
9e567d9814 | ||
|
|
1c012958c7 | ||
|
|
e5c50bb12b | ||
|
|
926662eb7c | ||
|
|
3366cd34ba | ||
|
|
2d5a8462c8 | ||
|
|
110282ee7e | ||
|
|
f752c40f58 | ||
|
|
83cdbbb89a |
2
.github/workflows/build_and_test.yml
vendored
2
.github/workflows/build_and_test.yml
vendored
@@ -1133,8 +1133,6 @@ jobs:
|
||||
-f deployPreprodRegion=true
|
||||
|
||||
gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main \
|
||||
-f deployPgSniRouter=false \
|
||||
-f deployProxy=false \
|
||||
-f deployStorage=true \
|
||||
-f deployStorageBroker=true \
|
||||
-f deployStorageController=true \
|
||||
|
||||
@@ -28,7 +28,9 @@ jobs:
|
||||
- name: Get build-tools image tag for the current commit
|
||||
id: get-build-tools-tag
|
||||
env:
|
||||
COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
|
||||
# Usually, for COMMIT_SHA, we use `github.event.pull_request.head.sha || github.sha`, but here, even for PRs,
|
||||
# we want to use `github.sha` i.e. point to a phantom merge commit to determine the image tag correctly.
|
||||
COMMIT_SHA: ${{ github.sha }}
|
||||
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
run: |
|
||||
LAST_BUILD_TOOLS_SHA=$(
|
||||
|
||||
@@ -818,9 +818,15 @@ impl ComputeNode {
|
||||
Client::connect(zenith_admin_connstr.as_str(), NoTls)
|
||||
.context("broken cloud_admin credential: tried connecting with cloud_admin but could not authenticate, and zenith_admin does not work either")?;
|
||||
// Disable forwarding so that users don't get a cloud_admin role
|
||||
client.simple_query("SET neon.forward_ddl = false")?;
|
||||
client.simple_query("CREATE USER cloud_admin WITH SUPERUSER")?;
|
||||
client.simple_query("GRANT zenith_admin TO cloud_admin")?;
|
||||
|
||||
let mut func = || {
|
||||
client.simple_query("SET neon.forward_ddl = false")?;
|
||||
client.simple_query("CREATE USER cloud_admin WITH SUPERUSER")?;
|
||||
client.simple_query("GRANT zenith_admin TO cloud_admin")?;
|
||||
Ok::<_, anyhow::Error>(())
|
||||
};
|
||||
func().context("apply_config setup cloud_admin")?;
|
||||
|
||||
drop(client);
|
||||
|
||||
// reconnect with connstring with expected name
|
||||
@@ -832,24 +838,29 @@ impl ComputeNode {
|
||||
};
|
||||
|
||||
// Disable DDL forwarding because control plane already knows about these roles/databases.
|
||||
client.simple_query("SET neon.forward_ddl = false")?;
|
||||
client
|
||||
.simple_query("SET neon.forward_ddl = false")
|
||||
.context("apply_config SET neon.forward_ddl = false")?;
|
||||
|
||||
// Proceed with post-startup configuration. Note, that order of operations is important.
|
||||
let spec = &compute_state.pspec.as_ref().expect("spec must be set").spec;
|
||||
create_neon_superuser(spec, &mut client)?;
|
||||
cleanup_instance(&mut client)?;
|
||||
handle_roles(spec, &mut client)?;
|
||||
handle_databases(spec, &mut client)?;
|
||||
handle_role_deletions(spec, connstr.as_str(), &mut client)?;
|
||||
create_neon_superuser(spec, &mut client).context("apply_config create_neon_superuser")?;
|
||||
cleanup_instance(&mut client).context("apply_config cleanup_instance")?;
|
||||
handle_roles(spec, &mut client).context("apply_config handle_roles")?;
|
||||
handle_databases(spec, &mut client).context("apply_config handle_databases")?;
|
||||
handle_role_deletions(spec, connstr.as_str(), &mut client)
|
||||
.context("apply_config handle_role_deletions")?;
|
||||
handle_grants(
|
||||
spec,
|
||||
&mut client,
|
||||
connstr.as_str(),
|
||||
self.has_feature(ComputeFeature::AnonExtension),
|
||||
)?;
|
||||
handle_extensions(spec, &mut client)?;
|
||||
handle_extension_neon(&mut client)?;
|
||||
create_availability_check_data(&mut client)?;
|
||||
)
|
||||
.context("apply_config handle_grants")?;
|
||||
handle_extensions(spec, &mut client).context("apply_config handle_extensions")?;
|
||||
handle_extension_neon(&mut client).context("apply_config handle_extension_neon")?;
|
||||
create_availability_check_data(&mut client)
|
||||
.context("apply_config create_availability_check_data")?;
|
||||
|
||||
// 'Close' connection
|
||||
drop(client);
|
||||
@@ -857,7 +868,7 @@ impl ComputeNode {
|
||||
// Run migrations separately to not hold up cold starts
|
||||
thread::spawn(move || {
|
||||
let mut client = Client::connect(connstr.as_str(), NoTls)?;
|
||||
handle_migrations(&mut client)
|
||||
handle_migrations(&mut client).context("apply_config handle_migrations")
|
||||
});
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -2,7 +2,7 @@ use std::fs::File;
|
||||
use std::path::Path;
|
||||
use std::str::FromStr;
|
||||
|
||||
use anyhow::{anyhow, bail, Result};
|
||||
use anyhow::{anyhow, bail, Context, Result};
|
||||
use postgres::config::Config;
|
||||
use postgres::{Client, NoTls};
|
||||
use reqwest::StatusCode;
|
||||
@@ -698,7 +698,8 @@ pub fn handle_grants(
|
||||
|
||||
// it is important to run this after all grants
|
||||
if enable_anon_extension {
|
||||
handle_extension_anon(spec, &db.owner, &mut db_client, false)?;
|
||||
handle_extension_anon(spec, &db.owner, &mut db_client, false)
|
||||
.context("handle_grants handle_extension_anon")?;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -813,28 +814,36 @@ $$;"#,
|
||||
// Add new migrations below.
|
||||
];
|
||||
|
||||
let mut query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
|
||||
client.simple_query(query)?;
|
||||
let mut func = || {
|
||||
let query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
|
||||
client.simple_query(query)?;
|
||||
|
||||
query = "CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)";
|
||||
client.simple_query(query)?;
|
||||
let query = "CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)";
|
||||
client.simple_query(query)?;
|
||||
|
||||
query = "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING";
|
||||
client.simple_query(query)?;
|
||||
let query = "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING";
|
||||
client.simple_query(query)?;
|
||||
|
||||
query = "ALTER SCHEMA neon_migration OWNER TO cloud_admin";
|
||||
client.simple_query(query)?;
|
||||
let query = "ALTER SCHEMA neon_migration OWNER TO cloud_admin";
|
||||
client.simple_query(query)?;
|
||||
|
||||
query = "REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC";
|
||||
client.simple_query(query)?;
|
||||
let query = "REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC";
|
||||
client.simple_query(query)?;
|
||||
Ok::<_, anyhow::Error>(())
|
||||
};
|
||||
func().context("handle_migrations prepare")?;
|
||||
|
||||
query = "SELECT id FROM neon_migration.migration_id";
|
||||
let row = client.query_one(query, &[])?;
|
||||
let query = "SELECT id FROM neon_migration.migration_id";
|
||||
let row = client
|
||||
.query_one(query, &[])
|
||||
.context("handle_migrations get migration_id")?;
|
||||
let mut current_migration: usize = row.get::<&str, i64>("id") as usize;
|
||||
let starting_migration_id = current_migration;
|
||||
|
||||
query = "BEGIN";
|
||||
client.simple_query(query)?;
|
||||
let query = "BEGIN";
|
||||
client
|
||||
.simple_query(query)
|
||||
.context("handle_migrations begin")?;
|
||||
|
||||
while current_migration < migrations.len() {
|
||||
let migration = &migrations[current_migration];
|
||||
@@ -842,7 +851,9 @@ $$;"#,
|
||||
info!("Skip migration id={}", current_migration);
|
||||
} else {
|
||||
info!("Running migration:\n{}\n", migration);
|
||||
client.simple_query(migration)?;
|
||||
client.simple_query(migration).with_context(|| {
|
||||
format!("handle_migrations current_migration={}", current_migration)
|
||||
})?;
|
||||
}
|
||||
current_migration += 1;
|
||||
}
|
||||
@@ -850,10 +861,14 @@ $$;"#,
|
||||
"UPDATE neon_migration.migration_id SET id={}",
|
||||
migrations.len()
|
||||
);
|
||||
client.simple_query(&setval)?;
|
||||
client
|
||||
.simple_query(&setval)
|
||||
.context("handle_migrations update id")?;
|
||||
|
||||
query = "COMMIT";
|
||||
client.simple_query(query)?;
|
||||
let query = "COMMIT";
|
||||
client
|
||||
.simple_query(query)
|
||||
.context("handle_migrations commit")?;
|
||||
|
||||
info!(
|
||||
"Ran {} migrations",
|
||||
|
||||
@@ -1417,6 +1417,7 @@ fn cli() -> Command {
|
||||
.subcommand(
|
||||
Command::new("timeline")
|
||||
.about("Manage timelines")
|
||||
.arg_required_else_help(true)
|
||||
.subcommand(Command::new("list")
|
||||
.about("List all timelines, available to this pageserver")
|
||||
.arg(tenant_id_arg.clone()))
|
||||
|
||||
@@ -156,6 +156,7 @@ pub struct SafekeeperConf {
|
||||
pub remote_storage: Option<String>,
|
||||
pub backup_threads: Option<u32>,
|
||||
pub auth_enabled: bool,
|
||||
pub listen_addr: Option<String>,
|
||||
}
|
||||
|
||||
impl Default for SafekeeperConf {
|
||||
@@ -169,6 +170,7 @@ impl Default for SafekeeperConf {
|
||||
remote_storage: None,
|
||||
backup_threads: None,
|
||||
auth_enabled: false,
|
||||
listen_addr: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -70,24 +70,31 @@ pub struct SafekeeperNode {
|
||||
pub pg_connection_config: PgConnectionConfig,
|
||||
pub env: LocalEnv,
|
||||
pub http_client: reqwest::Client,
|
||||
pub listen_addr: String,
|
||||
pub http_base_url: String,
|
||||
}
|
||||
|
||||
impl SafekeeperNode {
|
||||
pub fn from_env(env: &LocalEnv, conf: &SafekeeperConf) -> SafekeeperNode {
|
||||
let listen_addr = if let Some(ref listen_addr) = conf.listen_addr {
|
||||
listen_addr.clone()
|
||||
} else {
|
||||
"127.0.0.1".to_string()
|
||||
};
|
||||
SafekeeperNode {
|
||||
id: conf.id,
|
||||
conf: conf.clone(),
|
||||
pg_connection_config: Self::safekeeper_connection_config(conf.pg_port),
|
||||
pg_connection_config: Self::safekeeper_connection_config(&listen_addr, conf.pg_port),
|
||||
env: env.clone(),
|
||||
http_client: reqwest::Client::new(),
|
||||
http_base_url: format!("http://127.0.0.1:{}/v1", conf.http_port),
|
||||
http_base_url: format!("http://{}:{}/v1", listen_addr, conf.http_port),
|
||||
listen_addr,
|
||||
}
|
||||
}
|
||||
|
||||
/// Construct libpq connection string for connecting to this safekeeper.
|
||||
fn safekeeper_connection_config(port: u16) -> PgConnectionConfig {
|
||||
PgConnectionConfig::new_host_port(url::Host::parse("127.0.0.1").unwrap(), port)
|
||||
fn safekeeper_connection_config(addr: &str, port: u16) -> PgConnectionConfig {
|
||||
PgConnectionConfig::new_host_port(url::Host::parse(addr).unwrap(), port)
|
||||
}
|
||||
|
||||
pub fn datadir_path_by_id(env: &LocalEnv, sk_id: NodeId) -> PathBuf {
|
||||
@@ -111,8 +118,8 @@ impl SafekeeperNode {
|
||||
);
|
||||
io::stdout().flush().unwrap();
|
||||
|
||||
let listen_pg = format!("127.0.0.1:{}", self.conf.pg_port);
|
||||
let listen_http = format!("127.0.0.1:{}", self.conf.http_port);
|
||||
let listen_pg = format!("{}:{}", self.listen_addr, self.conf.pg_port);
|
||||
let listen_http = format!("{}:{}", self.listen_addr, self.conf.http_port);
|
||||
let id = self.id;
|
||||
let datadir = self.datadir_path();
|
||||
|
||||
@@ -139,7 +146,7 @@ impl SafekeeperNode {
|
||||
availability_zone,
|
||||
];
|
||||
if let Some(pg_tenant_only_port) = self.conf.pg_tenant_only_port {
|
||||
let listen_pg_tenant_only = format!("127.0.0.1:{}", pg_tenant_only_port);
|
||||
let listen_pg_tenant_only = format!("{}:{}", self.listen_addr, pg_tenant_only_port);
|
||||
args.extend(["--listen-pg-tenant-only".to_owned(), listen_pg_tenant_only]);
|
||||
}
|
||||
if !self.conf.sync {
|
||||
|
||||
@@ -747,10 +747,18 @@ pub struct TimelineGcRequest {
|
||||
pub gc_horizon: Option<u64>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct WalRedoManagerProcessStatus {
|
||||
pub pid: u32,
|
||||
/// The strum-generated `into::<&'static str>()` for `pageserver::walredo::ProcessKind`.
|
||||
/// `ProcessKind` are a transitory thing, so, they have no enum representation in `pageserver_api`.
|
||||
pub kind: Cow<'static, str>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct WalRedoManagerStatus {
|
||||
pub last_redo_at: Option<chrono::DateTime<chrono::Utc>>,
|
||||
pub pid: Option<u32>,
|
||||
pub process: Option<WalRedoManagerProcessStatus>,
|
||||
}
|
||||
|
||||
/// The progress of a secondary tenant is mostly useful when doing a long running download: e.g. initiating
|
||||
|
||||
@@ -8,12 +8,89 @@ use hex::FromHex;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use utils::id::TenantId;
|
||||
|
||||
/// See docs/rfcs/031-sharding-static.md for an overview of sharding.
|
||||
///
|
||||
/// This module contains a variety of types used to represent the concept of sharding
|
||||
/// a Neon tenant across multiple physical shards. Since there are quite a few of these,
|
||||
/// we provide an summary here.
|
||||
///
|
||||
/// Types used to describe shards:
|
||||
/// - [`ShardCount`] describes how many shards make up a tenant, plus the magic `unsharded` value
|
||||
/// which identifies a tenant which is not shard-aware. This means its storage paths do not include
|
||||
/// a shard suffix.
|
||||
/// - [`ShardNumber`] is simply the zero-based index of a shard within a tenant.
|
||||
/// - [`ShardIndex`] is the 2-tuple of `ShardCount` and `ShardNumber`, it's just like a `TenantShardId`
|
||||
/// without the tenant ID. This is useful for things that are implicitly scoped to a particular
|
||||
/// tenant, such as layer files.
|
||||
/// - [`ShardIdentity`]` is the full description of a particular shard's parameters, in sufficient
|
||||
/// detail to convert a [`Key`] to a [`ShardNumber`] when deciding where to write/read.
|
||||
/// - The [`ShardSlug`] is a terse formatter for ShardCount and ShardNumber, written as
|
||||
/// four hex digits. An unsharded tenant is `0000`.
|
||||
/// - [`TenantShardId`] is the unique ID of a particular shard within a particular tenant
|
||||
///
|
||||
/// Types used to describe the parameters for data distribution in a sharded tenant:
|
||||
/// - [`ShardStripeSize`] controls how long contiguous runs of [`Key`]s (stripes) are when distributed across
|
||||
/// multiple shards. Its value is given in 8kiB pages.
|
||||
/// - [`ShardLayout`] describes the data distribution scheme, and at time of writing is
|
||||
/// always zero: this is provided for future upgrades that might introduce different
|
||||
/// data distribution schemes.
|
||||
///
|
||||
/// Examples:
|
||||
/// - A legacy unsharded tenant has one shard with ShardCount(0), ShardNumber(0), and its slug is 0000
|
||||
/// - A single sharded tenant has one shard with ShardCount(1), ShardNumber(0), and its slug is 0001
|
||||
/// - In a tenant with 4 shards, each shard has ShardCount(N), ShardNumber(i) where i in 0..N-1 (inclusive),
|
||||
/// and their slugs are 0004, 0104, 0204, and 0304.
|
||||
|
||||
#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
|
||||
pub struct ShardNumber(pub u8);
|
||||
|
||||
#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
|
||||
pub struct ShardCount(u8);
|
||||
|
||||
/// Combination of ShardNumber and ShardCount. For use within the context of a particular tenant,
|
||||
/// when we need to know which shard we're dealing with, but do not need to know the full
|
||||
/// ShardIdentity (because we won't be doing any page->shard mapping), and do not need to know
|
||||
/// the fully qualified TenantShardId.
|
||||
#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
|
||||
pub struct ShardIndex {
|
||||
pub shard_number: ShardNumber,
|
||||
pub shard_count: ShardCount,
|
||||
}
|
||||
|
||||
/// The ShardIdentity contains enough information to map a [`Key`] to a [`ShardNumber`],
|
||||
/// and to check whether that [`ShardNumber`] is the same as the current shard.
|
||||
#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
|
||||
pub struct ShardIdentity {
|
||||
pub number: ShardNumber,
|
||||
pub count: ShardCount,
|
||||
pub stripe_size: ShardStripeSize,
|
||||
layout: ShardLayout,
|
||||
}
|
||||
|
||||
/// Formatting helper, for generating the `shard_id` label in traces.
|
||||
struct ShardSlug<'a>(&'a TenantShardId);
|
||||
|
||||
/// TenantShardId globally identifies a particular shard in a particular tenant.
|
||||
///
|
||||
/// These are written as `<TenantId>-<ShardSlug>`, for example:
|
||||
/// # The second shard in a two-shard tenant
|
||||
/// 072f1291a5310026820b2fe4b2968934-0102
|
||||
///
|
||||
/// If the `ShardCount` is _unsharded_, the `TenantShardId` is written without
|
||||
/// a shard suffix and is equivalent to the encoding of a `TenantId`: this enables
|
||||
/// an unsharded [`TenantShardId`] to be used interchangably with a [`TenantId`].
|
||||
///
|
||||
/// The human-readable encoding of an unsharded TenantShardId, such as used in API URLs,
|
||||
/// is both forward and backward compatible with TenantId: a legacy TenantId can be
|
||||
/// decoded as a TenantShardId, and when re-encoded it will be parseable
|
||||
/// as a TenantId.
|
||||
#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
|
||||
pub struct TenantShardId {
|
||||
pub tenant_id: TenantId,
|
||||
pub shard_number: ShardNumber,
|
||||
pub shard_count: ShardCount,
|
||||
}
|
||||
|
||||
impl ShardCount {
|
||||
pub const MAX: Self = Self(u8::MAX);
|
||||
|
||||
@@ -38,6 +115,7 @@ impl ShardCount {
|
||||
self.0
|
||||
}
|
||||
|
||||
///
|
||||
pub fn is_unsharded(&self) -> bool {
|
||||
self.0 == 0
|
||||
}
|
||||
@@ -53,33 +131,6 @@ impl ShardNumber {
|
||||
pub const MAX: Self = Self(u8::MAX);
|
||||
}
|
||||
|
||||
/// TenantShardId identify the units of work for the Pageserver.
|
||||
///
|
||||
/// These are written as `<tenant_id>-<shard number><shard-count>`, for example:
|
||||
///
|
||||
/// # The second shard in a two-shard tenant
|
||||
/// 072f1291a5310026820b2fe4b2968934-0102
|
||||
///
|
||||
/// Historically, tenants could not have multiple shards, and were identified
|
||||
/// by TenantId. To support this, TenantShardId has a special legacy
|
||||
/// mode where `shard_count` is equal to zero: this represents a single-sharded
|
||||
/// tenant which should be written as a TenantId with no suffix.
|
||||
///
|
||||
/// The human-readable encoding of TenantShardId, such as used in API URLs,
|
||||
/// is both forward and backward compatible: a legacy TenantId can be
|
||||
/// decoded as a TenantShardId, and when re-encoded it will be parseable
|
||||
/// as a TenantId.
|
||||
///
|
||||
/// Note that the binary encoding is _not_ backward compatible, because
|
||||
/// at the time sharding is introduced, there are no existing binary structures
|
||||
/// containing TenantId that we need to handle.
|
||||
#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
|
||||
pub struct TenantShardId {
|
||||
pub tenant_id: TenantId,
|
||||
pub shard_number: ShardNumber,
|
||||
pub shard_count: ShardCount,
|
||||
}
|
||||
|
||||
impl TenantShardId {
|
||||
pub fn unsharded(tenant_id: TenantId) -> Self {
|
||||
Self {
|
||||
@@ -111,10 +162,13 @@ impl TenantShardId {
|
||||
}
|
||||
|
||||
/// Convenience for code that has special behavior on the 0th shard.
|
||||
pub fn is_zero(&self) -> bool {
|
||||
pub fn is_shard_zero(&self) -> bool {
|
||||
self.shard_number == ShardNumber(0)
|
||||
}
|
||||
|
||||
/// The "unsharded" value is distinct from simply having a single shard: it represents
|
||||
/// a tenant which is not shard-aware at all, and whose storage paths will not include
|
||||
/// a shard suffix.
|
||||
pub fn is_unsharded(&self) -> bool {
|
||||
self.shard_number == ShardNumber(0) && self.shard_count.is_unsharded()
|
||||
}
|
||||
@@ -150,9 +204,6 @@ impl TenantShardId {
|
||||
}
|
||||
}
|
||||
|
||||
/// Formatting helper
|
||||
struct ShardSlug<'a>(&'a TenantShardId);
|
||||
|
||||
impl<'a> std::fmt::Display for ShardSlug<'a> {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
write!(
|
||||
@@ -222,16 +273,6 @@ impl From<[u8; 18]> for TenantShardId {
|
||||
}
|
||||
}
|
||||
|
||||
/// For use within the context of a particular tenant, when we need to know which
|
||||
/// shard we're dealing with, but do not need to know the full ShardIdentity (because
|
||||
/// we won't be doing any page->shard mapping), and do not need to know the fully qualified
|
||||
/// TenantShardId.
|
||||
#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
|
||||
pub struct ShardIndex {
|
||||
pub shard_number: ShardNumber,
|
||||
pub shard_count: ShardCount,
|
||||
}
|
||||
|
||||
impl ShardIndex {
|
||||
pub fn new(number: ShardNumber, count: ShardCount) -> Self {
|
||||
Self {
|
||||
@@ -246,6 +287,9 @@ impl ShardIndex {
|
||||
}
|
||||
}
|
||||
|
||||
/// The "unsharded" value is distinct from simply having a single shard: it represents
|
||||
/// a tenant which is not shard-aware at all, and whose storage paths will not include
|
||||
/// a shard suffix.
|
||||
pub fn is_unsharded(&self) -> bool {
|
||||
self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0)
|
||||
}
|
||||
@@ -313,6 +357,8 @@ impl Serialize for TenantShardId {
|
||||
if serializer.is_human_readable() {
|
||||
serializer.collect_str(self)
|
||||
} else {
|
||||
// Note: while human encoding of [`TenantShardId`] is backward and forward
|
||||
// compatible, this binary encoding is not.
|
||||
let mut packed: [u8; 18] = [0; 18];
|
||||
packed[0..16].clone_from_slice(&self.tenant_id.as_arr());
|
||||
packed[16] = self.shard_number.0;
|
||||
@@ -390,16 +436,6 @@ const LAYOUT_BROKEN: ShardLayout = ShardLayout(255);
|
||||
/// Default stripe size in pages: 256MiB divided by 8kiB page size.
|
||||
const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8);
|
||||
|
||||
/// The ShardIdentity contains the information needed for one member of map
|
||||
/// to resolve a key to a shard, and then check whether that shard is ==self.
|
||||
#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
|
||||
pub struct ShardIdentity {
|
||||
pub number: ShardNumber,
|
||||
pub count: ShardCount,
|
||||
pub stripe_size: ShardStripeSize,
|
||||
layout: ShardLayout,
|
||||
}
|
||||
|
||||
#[derive(thiserror::Error, Debug, PartialEq, Eq)]
|
||||
pub enum ShardConfigError {
|
||||
#[error("Invalid shard count")]
|
||||
@@ -439,6 +475,9 @@ impl ShardIdentity {
|
||||
}
|
||||
}
|
||||
|
||||
/// The "unsharded" value is distinct from simply having a single shard: it represents
|
||||
/// a tenant which is not shard-aware at all, and whose storage paths will not include
|
||||
/// a shard suffix.
|
||||
pub fn is_unsharded(&self) -> bool {
|
||||
self.number == ShardNumber(0) && self.count == ShardCount(0)
|
||||
}
|
||||
@@ -487,6 +526,8 @@ impl ShardIdentity {
|
||||
}
|
||||
|
||||
/// Return true if the key should be ingested by this shard
|
||||
///
|
||||
/// Shards must ingest _at least_ keys which return true from this check.
|
||||
pub fn is_key_local(&self, key: &Key) -> bool {
|
||||
assert!(!self.is_broken());
|
||||
if self.count < ShardCount(2) || (key_is_shard0(key) && self.number == ShardNumber(0)) {
|
||||
@@ -497,7 +538,9 @@ impl ShardIdentity {
|
||||
}
|
||||
|
||||
/// Return true if the key should be discarded if found in this shard's
|
||||
/// data store, e.g. during compaction after a split
|
||||
/// data store, e.g. during compaction after a split.
|
||||
///
|
||||
/// Shards _may_ drop keys which return false here, but are not obliged to.
|
||||
pub fn is_key_disposable(&self, key: &Key) -> bool {
|
||||
if key_is_shard0(key) {
|
||||
// Q: Why can't we dispose of shard0 content if we're not shard 0?
|
||||
@@ -523,7 +566,7 @@ impl ShardIdentity {
|
||||
|
||||
/// Convenience for checking if this identity is the 0th shard in a tenant,
|
||||
/// for special cases on shard 0 such as ingesting relation sizes.
|
||||
pub fn is_zero(&self) -> bool {
|
||||
pub fn is_shard_zero(&self) -> bool {
|
||||
self.number == ShardNumber(0)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -92,6 +92,8 @@ pub mod zstd;
|
||||
|
||||
pub mod env;
|
||||
|
||||
pub mod poison;
|
||||
|
||||
/// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
|
||||
///
|
||||
/// we have several cases:
|
||||
|
||||
121
libs/utils/src/poison.rs
Normal file
121
libs/utils/src/poison.rs
Normal file
@@ -0,0 +1,121 @@
|
||||
//! Protect a piece of state from reuse after it is left in an inconsistent state.
|
||||
//!
|
||||
//! # Example
|
||||
//!
|
||||
//! ```
|
||||
//! # tokio_test::block_on(async {
|
||||
//! use utils::poison::Poison;
|
||||
//! use std::time::Duration;
|
||||
//!
|
||||
//! struct State {
|
||||
//! clean: bool,
|
||||
//! }
|
||||
//! let state = tokio::sync::Mutex::new(Poison::new("mystate", State { clean: true }));
|
||||
//!
|
||||
//! let mut mutex_guard = state.lock().await;
|
||||
//! let mut poison_guard = mutex_guard.check_and_arm()?;
|
||||
//! let state = poison_guard.data_mut();
|
||||
//! state.clean = false;
|
||||
//! // If we get cancelled at this await point, subsequent check_and_arm() calls will fail.
|
||||
//! tokio::time::sleep(Duration::from_secs(10)).await;
|
||||
//! state.clean = true;
|
||||
//! poison_guard.disarm();
|
||||
//! # Ok::<(), utils::poison::Error>(())
|
||||
//! # });
|
||||
//! ```
|
||||
|
||||
use tracing::warn;
|
||||
|
||||
pub struct Poison<T> {
|
||||
what: &'static str,
|
||||
state: State,
|
||||
data: T,
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy)]
|
||||
enum State {
|
||||
Clean,
|
||||
Armed,
|
||||
Poisoned { at: chrono::DateTime<chrono::Utc> },
|
||||
}
|
||||
|
||||
impl<T> Poison<T> {
|
||||
/// We log `what` `warning!` level if the [`Guard`] gets dropped without being [`Guard::disarm`]ed.
|
||||
pub fn new(what: &'static str, data: T) -> Self {
|
||||
Self {
|
||||
what,
|
||||
state: State::Clean,
|
||||
data,
|
||||
}
|
||||
}
|
||||
|
||||
/// Check for poisoning and return a [`Guard`] that provides access to the wrapped state.
|
||||
pub fn check_and_arm(&mut self) -> Result<Guard<T>, Error> {
|
||||
match self.state {
|
||||
State::Clean => {
|
||||
self.state = State::Armed;
|
||||
Ok(Guard(self))
|
||||
}
|
||||
State::Armed => unreachable!("transient state"),
|
||||
State::Poisoned { at } => Err(Error::Poisoned {
|
||||
what: self.what,
|
||||
at,
|
||||
}),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Use [`Self::data`] and [`Self::data_mut`] to access the wrapped state.
|
||||
/// Once modifications are done, use [`Self::disarm`].
|
||||
/// If [`Guard`] gets dropped instead of calling [`Self::disarm`], the state is poisoned
|
||||
/// and subsequent calls to [`Poison::check_and_arm`] will fail with an error.
|
||||
pub struct Guard<'a, T>(&'a mut Poison<T>);
|
||||
|
||||
impl<'a, T> Guard<'a, T> {
|
||||
pub fn data(&self) -> &T {
|
||||
&self.0.data
|
||||
}
|
||||
pub fn data_mut(&mut self) -> &mut T {
|
||||
&mut self.0.data
|
||||
}
|
||||
|
||||
pub fn disarm(self) {
|
||||
match self.0.state {
|
||||
State::Clean => unreachable!("we set it to Armed in check_and_arm()"),
|
||||
State::Armed => {
|
||||
self.0.state = State::Clean;
|
||||
}
|
||||
State::Poisoned { at } => {
|
||||
unreachable!("we fail check_and_arm() if it's in that state: {at}")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, T> Drop for Guard<'a, T> {
|
||||
fn drop(&mut self) {
|
||||
match self.0.state {
|
||||
State::Clean => {
|
||||
// set by disarm()
|
||||
}
|
||||
State::Armed => {
|
||||
// still armed => poison it
|
||||
let at = chrono::Utc::now();
|
||||
self.0.state = State::Poisoned { at };
|
||||
warn!(at=?at, "poisoning {}", self.0.what);
|
||||
}
|
||||
State::Poisoned { at } => {
|
||||
unreachable!("we fail check_and_arm() if it's in that state: {at}")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
pub enum Error {
|
||||
#[error("poisoned at {at}: {what}")]
|
||||
Poisoned {
|
||||
what: &'static str,
|
||||
at: chrono::DateTime<chrono::Utc>,
|
||||
},
|
||||
}
|
||||
@@ -27,30 +27,50 @@
|
||||
//!
|
||||
//! # Reference Numbers
|
||||
//!
|
||||
//! 2024-04-04 on i3en.3xlarge
|
||||
//! 2024-04-15 on i3en.3xlarge
|
||||
//!
|
||||
//! ```text
|
||||
//! short/1 time: [25.925 µs 26.060 µs 26.209 µs]
|
||||
//! short/2 time: [31.277 µs 31.483 µs 31.722 µs]
|
||||
//! short/4 time: [45.496 µs 45.831 µs 46.182 µs]
|
||||
//! short/8 time: [84.298 µs 84.920 µs 85.566 µs]
|
||||
//! short/16 time: [185.04 µs 186.41 µs 187.88 µs]
|
||||
//! short/32 time: [385.01 µs 386.77 µs 388.70 µs]
|
||||
//! short/64 time: [770.24 µs 773.04 µs 776.04 µs]
|
||||
//! short/128 time: [1.5017 ms 1.5064 ms 1.5113 ms]
|
||||
//! medium/1 time: [106.65 µs 107.20 µs 107.85 µs]
|
||||
//! medium/2 time: [153.28 µs 154.24 µs 155.56 µs]
|
||||
//! medium/4 time: [325.67 µs 327.01 µs 328.71 µs]
|
||||
//! medium/8 time: [646.82 µs 650.17 µs 653.91 µs]
|
||||
//! medium/16 time: [1.2645 ms 1.2701 ms 1.2762 ms]
|
||||
//! medium/32 time: [2.4409 ms 2.4550 ms 2.4692 ms]
|
||||
//! medium/64 time: [4.6814 ms 4.7114 ms 4.7408 ms]
|
||||
//! medium/128 time: [8.7790 ms 8.9037 ms 9.0282 ms]
|
||||
//! async-short/1 time: [24.584 µs 24.737 µs 24.922 µs]
|
||||
//! async-short/2 time: [33.479 µs 33.660 µs 33.888 µs]
|
||||
//! async-short/4 time: [42.713 µs 43.046 µs 43.440 µs]
|
||||
//! async-short/8 time: [71.814 µs 72.478 µs 73.240 µs]
|
||||
//! async-short/16 time: [132.73 µs 134.45 µs 136.22 µs]
|
||||
//! async-short/32 time: [258.31 µs 260.73 µs 263.27 µs]
|
||||
//! async-short/64 time: [511.61 µs 514.44 µs 517.51 µs]
|
||||
//! async-short/128 time: [992.64 µs 998.23 µs 1.0042 ms]
|
||||
//! async-medium/1 time: [110.11 µs 110.50 µs 110.96 µs]
|
||||
//! async-medium/2 time: [153.06 µs 153.85 µs 154.99 µs]
|
||||
//! async-medium/4 time: [317.51 µs 319.92 µs 322.85 µs]
|
||||
//! async-medium/8 time: [638.30 µs 644.68 µs 652.12 µs]
|
||||
//! async-medium/16 time: [1.2651 ms 1.2773 ms 1.2914 ms]
|
||||
//! async-medium/32 time: [2.5117 ms 2.5410 ms 2.5720 ms]
|
||||
//! async-medium/64 time: [4.8088 ms 4.8555 ms 4.9047 ms]
|
||||
//! async-medium/128 time: [8.8311 ms 8.9849 ms 9.1263 ms]
|
||||
//! sync-short/1 time: [25.503 µs 25.626 µs 25.771 µs]
|
||||
//! sync-short/2 time: [30.850 µs 31.013 µs 31.208 µs]
|
||||
//! sync-short/4 time: [45.543 µs 45.856 µs 46.193 µs]
|
||||
//! sync-short/8 time: [84.114 µs 84.639 µs 85.220 µs]
|
||||
//! sync-short/16 time: [185.22 µs 186.15 µs 187.13 µs]
|
||||
//! sync-short/32 time: [377.43 µs 378.87 µs 380.46 µs]
|
||||
//! sync-short/64 time: [756.49 µs 759.04 µs 761.70 µs]
|
||||
//! sync-short/128 time: [1.4825 ms 1.4874 ms 1.4923 ms]
|
||||
//! sync-medium/1 time: [105.66 µs 106.01 µs 106.43 µs]
|
||||
//! sync-medium/2 time: [153.10 µs 153.84 µs 154.72 µs]
|
||||
//! sync-medium/4 time: [327.13 µs 329.44 µs 332.27 µs]
|
||||
//! sync-medium/8 time: [654.26 µs 658.73 µs 663.63 µs]
|
||||
//! sync-medium/16 time: [1.2682 ms 1.2748 ms 1.2816 ms]
|
||||
//! sync-medium/32 time: [2.4456 ms 2.4595 ms 2.4731 ms]
|
||||
//! sync-medium/64 time: [4.6523 ms 4.6890 ms 4.7256 ms]
|
||||
//! sync-medium/128 time: [8.7215 ms 8.8323 ms 8.9344 ms]
|
||||
//! ```
|
||||
|
||||
use bytes::{Buf, Bytes};
|
||||
use criterion::{BenchmarkId, Criterion};
|
||||
use pageserver::{config::PageServerConf, walrecord::NeonWalRecord, walredo::PostgresRedoManager};
|
||||
use pageserver::{
|
||||
config::PageServerConf,
|
||||
walrecord::NeonWalRecord,
|
||||
walredo::{PostgresRedoManager, ProcessKind},
|
||||
};
|
||||
use pageserver_api::{key::Key, shard::TenantShardId};
|
||||
use std::{
|
||||
sync::Arc,
|
||||
@@ -60,33 +80,39 @@ use tokio::{sync::Barrier, task::JoinSet};
|
||||
use utils::{id::TenantId, lsn::Lsn};
|
||||
|
||||
fn bench(c: &mut Criterion) {
|
||||
{
|
||||
let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
|
||||
for nclients in nclients {
|
||||
let mut group = c.benchmark_group("short");
|
||||
group.bench_with_input(
|
||||
BenchmarkId::from_parameter(nclients),
|
||||
&nclients,
|
||||
|b, nclients| {
|
||||
let redo_work = Arc::new(Request::short_input());
|
||||
b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients));
|
||||
},
|
||||
);
|
||||
for process_kind in &[ProcessKind::Async, ProcessKind::Sync] {
|
||||
{
|
||||
let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
|
||||
for nclients in nclients {
|
||||
let mut group = c.benchmark_group(format!("{process_kind}-short"));
|
||||
group.bench_with_input(
|
||||
BenchmarkId::from_parameter(nclients),
|
||||
&nclients,
|
||||
|b, nclients| {
|
||||
let redo_work = Arc::new(Request::short_input());
|
||||
b.iter_custom(|iters| {
|
||||
bench_impl(*process_kind, Arc::clone(&redo_work), iters, *nclients)
|
||||
});
|
||||
},
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
|
||||
for nclients in nclients {
|
||||
let mut group = c.benchmark_group("medium");
|
||||
group.bench_with_input(
|
||||
BenchmarkId::from_parameter(nclients),
|
||||
&nclients,
|
||||
|b, nclients| {
|
||||
let redo_work = Arc::new(Request::medium_input());
|
||||
b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients));
|
||||
},
|
||||
);
|
||||
{
|
||||
let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
|
||||
for nclients in nclients {
|
||||
let mut group = c.benchmark_group(format!("{process_kind}-medium"));
|
||||
group.bench_with_input(
|
||||
BenchmarkId::from_parameter(nclients),
|
||||
&nclients,
|
||||
|b, nclients| {
|
||||
let redo_work = Arc::new(Request::medium_input());
|
||||
b.iter_custom(|iters| {
|
||||
bench_impl(*process_kind, Arc::clone(&redo_work), iters, *nclients)
|
||||
});
|
||||
},
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -94,10 +120,16 @@ criterion::criterion_group!(benches, bench);
|
||||
criterion::criterion_main!(benches);
|
||||
|
||||
// Returns the sum of each client's wall-clock time spent executing their share of the n_redos.
|
||||
fn bench_impl(redo_work: Arc<Request>, n_redos: u64, nclients: u64) -> Duration {
|
||||
fn bench_impl(
|
||||
process_kind: ProcessKind,
|
||||
redo_work: Arc<Request>,
|
||||
n_redos: u64,
|
||||
nclients: u64,
|
||||
) -> Duration {
|
||||
let repo_dir = camino_tempfile::tempdir_in(env!("CARGO_TARGET_TMPDIR")).unwrap();
|
||||
|
||||
let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf());
|
||||
let mut conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf());
|
||||
conf.walredo_process_kind = process_kind;
|
||||
let conf = Box::leak(Box::new(conf));
|
||||
let tenant_shard_id = TenantShardId::unsharded(TenantId::generate());
|
||||
|
||||
@@ -113,25 +145,40 @@ fn bench_impl(redo_work: Arc<Request>, n_redos: u64, nclients: u64) -> Duration
|
||||
let manager = PostgresRedoManager::new(conf, tenant_shard_id);
|
||||
let manager = Arc::new(manager);
|
||||
|
||||
// divide the amount of work equally among the clients.
|
||||
let nredos_per_client = n_redos / nclients;
|
||||
for _ in 0..nclients {
|
||||
rt.block_on(async {
|
||||
tasks.spawn(client(
|
||||
Arc::clone(&manager),
|
||||
Arc::clone(&start),
|
||||
Arc::clone(&redo_work),
|
||||
// divide the amount of work equally among the clients
|
||||
n_redos / nclients,
|
||||
nredos_per_client,
|
||||
))
|
||||
});
|
||||
}
|
||||
|
||||
rt.block_on(async move {
|
||||
let mut total_wallclock_time = std::time::Duration::from_millis(0);
|
||||
let elapsed = rt.block_on(async move {
|
||||
let mut total_wallclock_time = Duration::ZERO;
|
||||
while let Some(res) = tasks.join_next().await {
|
||||
total_wallclock_time += res.unwrap();
|
||||
}
|
||||
total_wallclock_time
|
||||
})
|
||||
});
|
||||
|
||||
// consistency check to ensure process kind setting worked
|
||||
if nredos_per_client > 0 {
|
||||
assert_eq!(
|
||||
manager
|
||||
.status()
|
||||
.process
|
||||
.map(|p| p.kind)
|
||||
.expect("the benchmark work causes a walredo process to be spawned"),
|
||||
std::borrow::Cow::Borrowed(process_kind.into())
|
||||
);
|
||||
}
|
||||
|
||||
elapsed
|
||||
}
|
||||
|
||||
async fn client(
|
||||
|
||||
@@ -285,6 +285,7 @@ fn start_pageserver(
|
||||
))
|
||||
.unwrap();
|
||||
pageserver::preinitialize_metrics();
|
||||
pageserver::metrics::wal_redo::set_process_kind_metric(conf.walredo_process_kind);
|
||||
|
||||
// If any failpoints were set from FAILPOINTS environment variable,
|
||||
// print them to the log for debugging purposes
|
||||
|
||||
@@ -97,6 +97,8 @@ pub mod defaults {
|
||||
|
||||
pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0;
|
||||
|
||||
pub const DEFAULT_WALREDO_PROCESS_KIND: &str = "sync";
|
||||
|
||||
///
|
||||
/// Default built-in configuration file.
|
||||
///
|
||||
@@ -140,6 +142,8 @@ pub mod defaults {
|
||||
|
||||
#validate_vectored_get = '{DEFAULT_VALIDATE_VECTORED_GET}'
|
||||
|
||||
#walredo_process_kind = '{DEFAULT_WALREDO_PROCESS_KIND}'
|
||||
|
||||
[tenant_config]
|
||||
#checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
|
||||
#checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
|
||||
@@ -290,6 +294,8 @@ pub struct PageServerConf {
|
||||
///
|
||||
/// Setting this to zero disables limits on total ephemeral layer size.
|
||||
pub ephemeral_bytes_per_memory_kb: usize,
|
||||
|
||||
pub walredo_process_kind: crate::walredo::ProcessKind,
|
||||
}
|
||||
|
||||
/// We do not want to store this in a PageServerConf because the latter may be logged
|
||||
@@ -413,6 +419,8 @@ struct PageServerConfigBuilder {
|
||||
validate_vectored_get: BuilderValue<bool>,
|
||||
|
||||
ephemeral_bytes_per_memory_kb: BuilderValue<usize>,
|
||||
|
||||
walredo_process_kind: BuilderValue<crate::walredo::ProcessKind>,
|
||||
}
|
||||
|
||||
impl PageServerConfigBuilder {
|
||||
@@ -500,6 +508,8 @@ impl PageServerConfigBuilder {
|
||||
)),
|
||||
validate_vectored_get: Set(DEFAULT_VALIDATE_VECTORED_GET),
|
||||
ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),
|
||||
|
||||
walredo_process_kind: Set(DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap()),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -683,6 +693,10 @@ impl PageServerConfigBuilder {
|
||||
self.ephemeral_bytes_per_memory_kb = BuilderValue::Set(value);
|
||||
}
|
||||
|
||||
pub fn get_walredo_process_kind(&mut self, value: crate::walredo::ProcessKind) {
|
||||
self.walredo_process_kind = BuilderValue::Set(value);
|
||||
}
|
||||
|
||||
pub fn build(self) -> anyhow::Result<PageServerConf> {
|
||||
let default = Self::default_values();
|
||||
|
||||
@@ -739,6 +753,7 @@ impl PageServerConfigBuilder {
|
||||
max_vectored_read_bytes,
|
||||
validate_vectored_get,
|
||||
ephemeral_bytes_per_memory_kb,
|
||||
walredo_process_kind,
|
||||
}
|
||||
CUSTOM LOGIC
|
||||
{
|
||||
@@ -1032,6 +1047,9 @@ impl PageServerConf {
|
||||
"ephemeral_bytes_per_memory_kb" => {
|
||||
builder.get_ephemeral_bytes_per_memory_kb(parse_toml_u64("ephemeral_bytes_per_memory_kb", item)? as usize)
|
||||
}
|
||||
"walredo_process_kind" => {
|
||||
builder.get_walredo_process_kind(parse_toml_from_str("walredo_process_kind", item)?)
|
||||
}
|
||||
_ => bail!("unrecognized pageserver option '{key}'"),
|
||||
}
|
||||
}
|
||||
@@ -1114,6 +1132,7 @@ impl PageServerConf {
|
||||
),
|
||||
validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
|
||||
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
|
||||
walredo_process_kind: defaults::DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap(),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1351,7 +1370,8 @@ background_task_maximum_delay = '334 s'
|
||||
.expect("Invalid default constant")
|
||||
),
|
||||
validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
|
||||
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB
|
||||
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
|
||||
walredo_process_kind: defaults::DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap(),
|
||||
},
|
||||
"Correct defaults should be used when no config values are provided"
|
||||
);
|
||||
@@ -1423,7 +1443,8 @@ background_task_maximum_delay = '334 s'
|
||||
.expect("Invalid default constant")
|
||||
),
|
||||
validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
|
||||
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB
|
||||
ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
|
||||
walredo_process_kind: defaults::DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap(),
|
||||
},
|
||||
"Should be able to parse all basic config values correctly"
|
||||
);
|
||||
|
||||
@@ -304,7 +304,7 @@ async fn calculate_synthetic_size_worker(
|
||||
continue;
|
||||
}
|
||||
|
||||
if !tenant_shard_id.is_zero() {
|
||||
if !tenant_shard_id.is_shard_zero() {
|
||||
// We only send consumption metrics from shard 0, so don't waste time calculating
|
||||
// synthetic size on other shards.
|
||||
continue;
|
||||
|
||||
@@ -199,7 +199,7 @@ pub(super) async fn collect_all_metrics(
|
||||
};
|
||||
|
||||
let tenants = futures::stream::iter(tenants).filter_map(|(id, state, _)| async move {
|
||||
if state != TenantState::Active || !id.is_zero() {
|
||||
if state != TenantState::Active || !id.is_shard_zero() {
|
||||
None
|
||||
} else {
|
||||
tenant_manager
|
||||
|
||||
@@ -58,24 +58,6 @@ paths:
|
||||
responses:
|
||||
"200":
|
||||
description: The reload completed successfully.
|
||||
"401":
|
||||
description: Unauthorized Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/UnauthorizedError"
|
||||
"403":
|
||||
description: Forbidden Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
"500":
|
||||
description: Generic operation error (also hits if no keys were found)
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
|
||||
/v1/tenant/{tenant_id}:
|
||||
parameters:
|
||||
@@ -93,62 +75,14 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/TenantInfo"
|
||||
"400":
|
||||
description: Error when no tenant id found in path or no timeline id
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"401":
|
||||
description: Unauthorized Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/UnauthorizedError"
|
||||
"403":
|
||||
description: Forbidden Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"503":
|
||||
description: Temporarily unavailable, please retry.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
delete:
|
||||
description: |
|
||||
Attempts to delete specified tenant. 500, 503 and 409 errors should be retried until 404 is retrieved.
|
||||
404 means that deletion successfully finished"
|
||||
responses:
|
||||
"400":
|
||||
description: Error when no tenant id found in path
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"401":
|
||||
description: Unauthorized Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/UnauthorizedError"
|
||||
"403":
|
||||
description: Forbidden Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
"404":
|
||||
description: Tenant not found
|
||||
description: Tenant not found. This is the success path.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
@@ -165,18 +99,6 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/PreconditionFailedError"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"503":
|
||||
description: Temporarily unavailable, please retry.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
/v1/tenant/{tenant_id}/time_travel_remote_storage:
|
||||
parameters:
|
||||
@@ -206,36 +128,6 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
type: string
|
||||
"400":
|
||||
description: Error when no tenant id found in path or invalid timestamp
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"401":
|
||||
description: Unauthorized Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/UnauthorizedError"
|
||||
"403":
|
||||
description: Forbidden Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"503":
|
||||
description: Temporarily unavailable, please retry.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
/v1/tenant/{tenant_id}/timeline:
|
||||
parameters:
|
||||
@@ -255,36 +147,6 @@ paths:
|
||||
type: array
|
||||
items:
|
||||
$ref: "#/components/schemas/TimelineInfo"
|
||||
"400":
|
||||
description: Error when no tenant id found in path
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"401":
|
||||
description: Unauthorized Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/UnauthorizedError"
|
||||
"403":
|
||||
description: Forbidden Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"503":
|
||||
description: Temporarily unavailable, please retry.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
|
||||
/v1/tenant/{tenant_id}/timeline/{timeline_id}:
|
||||
@@ -309,60 +171,12 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/TimelineInfo"
|
||||
"400":
|
||||
description: Error when no tenant id found in path or no timeline id
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"401":
|
||||
description: Unauthorized Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/UnauthorizedError"
|
||||
"403":
|
||||
description: Forbidden Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"503":
|
||||
description: Temporarily unavailable, please retry.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
delete:
|
||||
description: "Attempts to delete specified timeline. 500 and 409 errors should be retried"
|
||||
responses:
|
||||
"400":
|
||||
description: Error when no tenant id found in path or no timeline id
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"401":
|
||||
description: Unauthorized Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/UnauthorizedError"
|
||||
"403":
|
||||
description: Forbidden Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
"404":
|
||||
description: Timeline not found
|
||||
description: Timeline not found. This is the success path.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
@@ -379,18 +193,6 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/PreconditionFailedError"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"503":
|
||||
description: Temporarily unavailable, please retry.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
/v1/tenant/{tenant_id}/timeline/{timeline_id}/get_timestamp_of_lsn:
|
||||
parameters:
|
||||
@@ -423,36 +225,6 @@ paths:
|
||||
schema:
|
||||
type: string
|
||||
format: date-time
|
||||
"400":
|
||||
description: Error when no tenant id found in path, no timeline id or invalid timestamp
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"401":
|
||||
description: Unauthorized Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/UnauthorizedError"
|
||||
"403":
|
||||
description: Forbidden Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
"404":
|
||||
description: Timeline not found, or there is no timestamp information for the given lsn
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/NotFoundError"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
|
||||
/v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp:
|
||||
parameters:
|
||||
@@ -484,36 +256,6 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/LsnByTimestampResponse"
|
||||
"400":
|
||||
description: Error when no tenant id found in path, no timeline id or invalid timestamp
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"401":
|
||||
description: Unauthorized Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/UnauthorizedError"
|
||||
"403":
|
||||
description: Forbidden Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"503":
|
||||
description: Temporarily unavailable, please retry.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
/v1/tenant/{tenant_id}/timeline/{timeline_id}/do_gc:
|
||||
parameters:
|
||||
@@ -537,36 +279,6 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
type: string
|
||||
"400":
|
||||
description: Error when no tenant id found in path, no timeline id or invalid timestamp
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"401":
|
||||
description: Unauthorized Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/UnauthorizedError"
|
||||
"403":
|
||||
description: Forbidden Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"503":
|
||||
description: Temporarily unavailable, please retry.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
/v1/tenant/{tenant_shard_id}/location_config:
|
||||
parameters:
|
||||
- name: tenant_shard_id
|
||||
@@ -628,24 +340,6 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/TenantLocationConfigResponse"
|
||||
"503":
|
||||
description: Tenant's state cannot be changed right now. Wait a few seconds and retry.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"401":
|
||||
description: Unauthorized Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/UnauthorizedError"
|
||||
"403":
|
||||
description: Forbidden Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
"409":
|
||||
description: |
|
||||
The tenant is already known to Pageserver in some way,
|
||||
@@ -662,12 +356,6 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ConflictError"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
/v1/tenant/{tenant_id}/ignore:
|
||||
parameters:
|
||||
- name: tenant_id
|
||||
@@ -684,36 +372,6 @@ paths:
|
||||
responses:
|
||||
"200":
|
||||
description: Tenant ignored
|
||||
"400":
|
||||
description: Error when no tenant id found in path parameters
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"401":
|
||||
description: Unauthorized Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/UnauthorizedError"
|
||||
"403":
|
||||
description: Forbidden Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"503":
|
||||
description: Temporarily unavailable, please retry.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
|
||||
/v1/tenant/{tenant_id}/load:
|
||||
@@ -740,36 +398,6 @@ paths:
|
||||
responses:
|
||||
"202":
|
||||
description: Tenant scheduled to load successfully
|
||||
"400":
|
||||
description: Error when no tenant id found in path parameters
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"401":
|
||||
description: Unauthorized Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/UnauthorizedError"
|
||||
"403":
|
||||
description: Forbidden Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"503":
|
||||
description: Temporarily unavailable, please retry.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
/v1/tenant/{tenant_id}/{timeline_id}/preserve_initdb_archive:
|
||||
parameters:
|
||||
@@ -790,37 +418,6 @@ paths:
|
||||
responses:
|
||||
"202":
|
||||
description: Tenant scheduled to load successfully
|
||||
"404":
|
||||
description: No tenant or timeline found for the specified ids
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"401":
|
||||
description: Unauthorized Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/UnauthorizedError"
|
||||
"403":
|
||||
description: Forbidden Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"503":
|
||||
description: Temporarily unavailable, please retry.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
|
||||
/v1/tenant/{tenant_id}/synthetic_size:
|
||||
parameters:
|
||||
@@ -839,31 +436,8 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/SyntheticSizeResponse"
|
||||
"401":
|
||||
description: Unauthorized Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/UnauthorizedError"
|
||||
"403":
|
||||
description: Forbidden Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"503":
|
||||
description: Temporarily unavailable, please retry.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
# This route has no handler. TODO: remove?
|
||||
/v1/tenant/{tenant_id}/size:
|
||||
parameters:
|
||||
- name: tenant_id
|
||||
@@ -945,18 +519,6 @@ paths:
|
||||
responses:
|
||||
"200":
|
||||
description: Success
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"503":
|
||||
description: Temporarily unavailable, please retry.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
/v1/tenant/{tenant_shard_id}/secondary/download:
|
||||
parameters:
|
||||
@@ -987,20 +549,6 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/SecondaryProgress"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"503":
|
||||
description: Temporarily unavailable, please retry.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
|
||||
|
||||
/v1/tenant/{tenant_id}/timeline/:
|
||||
parameters:
|
||||
@@ -1043,24 +591,6 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/TimelineInfo"
|
||||
"400":
|
||||
description: Malformed timeline create request
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"401":
|
||||
description: Unauthorized Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/UnauthorizedError"
|
||||
"403":
|
||||
description: Forbidden Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
"406":
|
||||
description: Permanently unsatisfiable request, don't retry.
|
||||
content:
|
||||
@@ -1079,18 +609,6 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"503":
|
||||
description: Temporarily unavailable, please retry.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
/v1/tenant/:
|
||||
get:
|
||||
@@ -1104,30 +622,6 @@ paths:
|
||||
type: array
|
||||
items:
|
||||
$ref: "#/components/schemas/TenantInfo"
|
||||
"401":
|
||||
description: Unauthorized Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/UnauthorizedError"
|
||||
"403":
|
||||
description: Forbidden Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"503":
|
||||
description: Temporarily unavailable, please retry.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
post:
|
||||
description: |
|
||||
@@ -1148,43 +642,12 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
type: string
|
||||
"400":
|
||||
description: Malformed tenant create request
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"401":
|
||||
description: Unauthorized Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/UnauthorizedError"
|
||||
"403":
|
||||
description: Forbidden Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
"409":
|
||||
description: Tenant already exists, creation skipped
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ConflictError"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"503":
|
||||
description: Temporarily unavailable, please retry.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
|
||||
/v1/tenant/config:
|
||||
put:
|
||||
@@ -1206,36 +669,6 @@ paths:
|
||||
type: array
|
||||
items:
|
||||
$ref: "#/components/schemas/TenantInfo"
|
||||
"400":
|
||||
description: Malformed tenant config request
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"401":
|
||||
description: Unauthorized Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/UnauthorizedError"
|
||||
"403":
|
||||
description: Forbidden Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"503":
|
||||
description: Temporarily unavailable, please retry.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
/v1/tenant/{tenant_id}/config/:
|
||||
parameters:
|
||||
@@ -1255,42 +688,6 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/TenantConfigResponse"
|
||||
"400":
|
||||
description: Malformed get tenanant config request
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"401":
|
||||
description: Unauthorized Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/UnauthorizedError"
|
||||
"403":
|
||||
description: Forbidden Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
"404":
|
||||
description: Tenand or timeline were not found
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/NotFoundError"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"503":
|
||||
description: Temporarily unavailable, please retry.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ServiceUnavailableError"
|
||||
|
||||
/v1/utilization:
|
||||
get:
|
||||
@@ -1304,12 +701,6 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/PageserverUtilization"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
|
||||
components:
|
||||
securitySchemes:
|
||||
|
||||
@@ -457,8 +457,12 @@ async fn reload_auth_validation_keys_handler(
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
Err(e) => {
|
||||
let err_msg = "Error reloading public keys";
|
||||
warn!("Error reloading public keys from {key_path:?}: {e:}");
|
||||
json_response(StatusCode::INTERNAL_SERVER_ERROR, ())
|
||||
json_response(
|
||||
StatusCode::INTERNAL_SERVER_ERROR,
|
||||
HttpErrorBody::from_msg(err_msg.to_string()),
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -696,7 +700,7 @@ async fn get_lsn_by_timestamp_handler(
|
||||
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||
let state = get_state(&request);
|
||||
|
||||
if !tenant_shard_id.is_zero() {
|
||||
if !tenant_shard_id.is_shard_zero() {
|
||||
// Requires SLRU contents, which are only stored on shard zero
|
||||
return Err(ApiError::BadRequest(anyhow!(
|
||||
"Size calculations are only available on shard zero"
|
||||
@@ -747,7 +751,7 @@ async fn get_timestamp_of_lsn_handler(
|
||||
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||
let state = get_state(&request);
|
||||
|
||||
if !tenant_shard_id.is_zero() {
|
||||
if !tenant_shard_id.is_shard_zero() {
|
||||
// Requires SLRU contents, which are only stored on shard zero
|
||||
return Err(ApiError::BadRequest(anyhow!(
|
||||
"Size calculations are only available on shard zero"
|
||||
@@ -772,7 +776,9 @@ async fn get_timestamp_of_lsn_handler(
|
||||
let time = format_rfc3339(postgres_ffi::from_pg_timestamp(time)).to_string();
|
||||
json_response(StatusCode::OK, time)
|
||||
}
|
||||
None => json_response(StatusCode::NOT_FOUND, ()),
|
||||
None => Err(ApiError::NotFound(
|
||||
anyhow::anyhow!("Timestamp for lsn {} not found", lsn).into(),
|
||||
)),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1086,7 +1092,7 @@ async fn tenant_size_handler(
|
||||
let headers = request.headers();
|
||||
let state = get_state(&request);
|
||||
|
||||
if !tenant_shard_id.is_zero() {
|
||||
if !tenant_shard_id.is_shard_zero() {
|
||||
return Err(ApiError::BadRequest(anyhow!(
|
||||
"Size calculations are only available on shard zero"
|
||||
)));
|
||||
|
||||
@@ -1819,6 +1819,29 @@ impl Default for WalRedoProcessCounters {
|
||||
pub(crate) static WAL_REDO_PROCESS_COUNTERS: Lazy<WalRedoProcessCounters> =
|
||||
Lazy::new(WalRedoProcessCounters::default);
|
||||
|
||||
#[cfg(not(test))]
|
||||
pub mod wal_redo {
|
||||
use super::*;
|
||||
|
||||
static PROCESS_KIND: Lazy<std::sync::Mutex<UIntGaugeVec>> = Lazy::new(|| {
|
||||
std::sync::Mutex::new(
|
||||
register_uint_gauge_vec!(
|
||||
"pageserver_wal_redo_process_kind",
|
||||
"The configured process kind for walredo",
|
||||
&["kind"],
|
||||
)
|
||||
.unwrap(),
|
||||
)
|
||||
});
|
||||
|
||||
pub fn set_process_kind_metric(kind: crate::walredo::ProcessKind) {
|
||||
// use guard to avoid races around the next two steps
|
||||
let guard = PROCESS_KIND.lock().unwrap();
|
||||
guard.reset();
|
||||
guard.with_label_values(&[&format!("{kind}")]).set(1);
|
||||
}
|
||||
}
|
||||
|
||||
/// Similar to `prometheus::HistogramTimer` but does not record on drop.
|
||||
pub(crate) struct StorageTimeMetricsTimer {
|
||||
metrics: StorageTimeMetrics,
|
||||
@@ -2089,7 +2112,7 @@ impl TimelineMetrics {
|
||||
|
||||
pub(crate) fn remove_tenant_metrics(tenant_shard_id: &TenantShardId) {
|
||||
// Only shard zero deals in synthetic sizes
|
||||
if tenant_shard_id.is_zero() {
|
||||
if tenant_shard_id.is_shard_zero() {
|
||||
let tid = tenant_shard_id.tenant_id.to_string();
|
||||
let _ = TENANT_SYNTHETIC_SIZE_METRIC.remove_label_values(&[&tid]);
|
||||
}
|
||||
|
||||
@@ -386,7 +386,7 @@ impl WalRedoManager {
|
||||
|
||||
pub(crate) fn status(&self) -> Option<WalRedoManagerStatus> {
|
||||
match self {
|
||||
WalRedoManager::Prod(m) => m.status(),
|
||||
WalRedoManager::Prod(m) => Some(m.status()),
|
||||
#[cfg(test)]
|
||||
WalRedoManager::Test(_) => None,
|
||||
}
|
||||
@@ -3190,7 +3190,7 @@ impl Tenant {
|
||||
run_initdb(self.conf, &pgdata_path, pg_version, &self.cancel).await?;
|
||||
|
||||
// Upload the created data dir to S3
|
||||
if self.tenant_shard_id().is_zero() {
|
||||
if self.tenant_shard_id().is_shard_zero() {
|
||||
self.upload_initdb(&timelines_path, &pgdata_path, &timeline_id)
|
||||
.await?;
|
||||
}
|
||||
@@ -3437,7 +3437,7 @@ impl Tenant {
|
||||
.store(size, Ordering::Relaxed);
|
||||
|
||||
// Only shard zero should be calculating synthetic sizes
|
||||
debug_assert!(self.shard_identity.is_zero());
|
||||
debug_assert!(self.shard_identity.is_shard_zero());
|
||||
|
||||
TENANT_SYNTHETIC_SIZE_METRIC
|
||||
.get_metric_with_label_values(&[&self.tenant_shard_id.tenant_id.to_string()])
|
||||
|
||||
@@ -121,7 +121,7 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
|
||||
self.offset
|
||||
}
|
||||
|
||||
const CAPACITY: usize = if BUFFERED { PAGE_SZ } else { 0 };
|
||||
const CAPACITY: usize = if BUFFERED { 64 * 1024 } else { 0 };
|
||||
|
||||
/// Writes the given buffer directly to the underlying `VirtualFile`.
|
||||
/// You need to make sure that the internal buffer is empty, otherwise
|
||||
|
||||
@@ -436,6 +436,11 @@ impl DeleteTenantFlow {
|
||||
.await
|
||||
}
|
||||
|
||||
/// Check whether background deletion of this tenant is currently in progress
|
||||
pub(crate) fn is_in_progress(tenant: &Tenant) -> bool {
|
||||
tenant.delete_progress.try_lock().is_err()
|
||||
}
|
||||
|
||||
async fn prepare(
|
||||
tenant: &Arc<Tenant>,
|
||||
) -> Result<tokio::sync::OwnedMutexGuard<Self>, DeleteTenantError> {
|
||||
|
||||
@@ -3,36 +3,27 @@
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::RequestContext;
|
||||
use crate::page_cache::{self, PAGE_SZ};
|
||||
use crate::page_cache;
|
||||
use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReader};
|
||||
use crate::virtual_file::{self, VirtualFile};
|
||||
use bytes::BytesMut;
|
||||
use camino::Utf8PathBuf;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use std::cmp::min;
|
||||
|
||||
use std::io::{self, ErrorKind};
|
||||
use std::ops::DerefMut;
|
||||
use std::io;
|
||||
use std::sync::atomic::AtomicU64;
|
||||
use tracing::*;
|
||||
use utils::id::TimelineId;
|
||||
|
||||
pub struct EphemeralFile {
|
||||
page_cache_file_id: page_cache::FileId,
|
||||
|
||||
_tenant_shard_id: TenantShardId,
|
||||
_timeline_id: TimelineId,
|
||||
file: VirtualFile,
|
||||
len: u64,
|
||||
/// An ephemeral file is append-only.
|
||||
/// We keep the last page, which can still be modified, in [`Self::mutable_tail`].
|
||||
/// The other pages, which can no longer be modified, are accessed through the page cache.
|
||||
///
|
||||
/// None <=> IO is ongoing.
|
||||
/// Size is fixed to PAGE_SZ at creation time and must not be changed.
|
||||
mutable_tail: Option<BytesMut>,
|
||||
|
||||
rw: page_caching::RW,
|
||||
}
|
||||
|
||||
mod page_caching;
|
||||
mod zero_padded_buffer;
|
||||
mod zero_padded_read_write;
|
||||
|
||||
impl EphemeralFile {
|
||||
pub async fn create(
|
||||
conf: &PageServerConf,
|
||||
@@ -59,21 +50,18 @@ impl EphemeralFile {
|
||||
.await?;
|
||||
|
||||
Ok(EphemeralFile {
|
||||
page_cache_file_id: page_cache::next_file_id(),
|
||||
_tenant_shard_id: tenant_shard_id,
|
||||
_timeline_id: timeline_id,
|
||||
file,
|
||||
len: 0,
|
||||
mutable_tail: Some(BytesMut::zeroed(PAGE_SZ)),
|
||||
rw: page_caching::RW::new(file),
|
||||
})
|
||||
}
|
||||
|
||||
pub(crate) fn len(&self) -> u64 {
|
||||
self.len
|
||||
self.rw.bytes_written()
|
||||
}
|
||||
|
||||
pub(crate) fn id(&self) -> page_cache::FileId {
|
||||
self.page_cache_file_id
|
||||
pub(crate) fn page_cache_file_id(&self) -> page_cache::FileId {
|
||||
self.rw.page_cache_file_id()
|
||||
}
|
||||
|
||||
pub(crate) async fn read_blk(
|
||||
@@ -81,182 +69,30 @@ impl EphemeralFile {
|
||||
blknum: u32,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<BlockLease, io::Error> {
|
||||
let flushed_blknums = 0..self.len / PAGE_SZ as u64;
|
||||
if flushed_blknums.contains(&(blknum as u64)) {
|
||||
let cache = page_cache::get();
|
||||
match cache
|
||||
.read_immutable_buf(self.page_cache_file_id, blknum, ctx)
|
||||
.await
|
||||
.map_err(|e| {
|
||||
std::io::Error::new(
|
||||
std::io::ErrorKind::Other,
|
||||
// order path before error because error is anyhow::Error => might have many contexts
|
||||
format!(
|
||||
"ephemeral file: read immutable page #{}: {}: {:#}",
|
||||
blknum, self.file.path, e,
|
||||
),
|
||||
)
|
||||
})? {
|
||||
page_cache::ReadBufResult::Found(guard) => {
|
||||
return Ok(BlockLease::PageReadGuard(guard))
|
||||
}
|
||||
page_cache::ReadBufResult::NotFound(write_guard) => {
|
||||
let write_guard = self
|
||||
.file
|
||||
.read_exact_at_page(write_guard, blknum as u64 * PAGE_SZ as u64)
|
||||
.await?;
|
||||
let read_guard = write_guard.mark_valid();
|
||||
return Ok(BlockLease::PageReadGuard(read_guard));
|
||||
}
|
||||
};
|
||||
} else {
|
||||
debug_assert_eq!(blknum as u64, self.len / PAGE_SZ as u64);
|
||||
Ok(BlockLease::EphemeralFileMutableTail(
|
||||
self.mutable_tail
|
||||
.as_deref()
|
||||
.expect("we're not doing IO, it must be Some()")
|
||||
.try_into()
|
||||
.expect("we ensure that it's always PAGE_SZ"),
|
||||
))
|
||||
}
|
||||
self.rw.read_blk(blknum, ctx).await
|
||||
}
|
||||
|
||||
pub(crate) async fn write_blob(
|
||||
&mut self,
|
||||
srcbuf: &[u8],
|
||||
ctx: &RequestContext,
|
||||
_ctx: &RequestContext,
|
||||
) -> Result<u64, io::Error> {
|
||||
struct Writer<'a> {
|
||||
ephemeral_file: &'a mut EphemeralFile,
|
||||
/// The block to which the next [`push_bytes`] will write.
|
||||
blknum: u32,
|
||||
/// The offset inside the block identified by [`blknum`] to which [`push_bytes`] will write.
|
||||
off: usize,
|
||||
}
|
||||
impl<'a> Writer<'a> {
|
||||
fn new(ephemeral_file: &'a mut EphemeralFile) -> io::Result<Writer<'a>> {
|
||||
Ok(Writer {
|
||||
blknum: (ephemeral_file.len / PAGE_SZ as u64) as u32,
|
||||
off: (ephemeral_file.len % PAGE_SZ as u64) as usize,
|
||||
ephemeral_file,
|
||||
})
|
||||
}
|
||||
#[inline(always)]
|
||||
async fn push_bytes(
|
||||
&mut self,
|
||||
src: &[u8],
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), io::Error> {
|
||||
let mut src_remaining = src;
|
||||
while !src_remaining.is_empty() {
|
||||
let dst_remaining = &mut self
|
||||
.ephemeral_file
|
||||
.mutable_tail
|
||||
.as_deref_mut()
|
||||
.expect("IO is not yet ongoing")[self.off..];
|
||||
let n = min(dst_remaining.len(), src_remaining.len());
|
||||
dst_remaining[..n].copy_from_slice(&src_remaining[..n]);
|
||||
self.off += n;
|
||||
src_remaining = &src_remaining[n..];
|
||||
if self.off == PAGE_SZ {
|
||||
let mutable_tail = std::mem::take(&mut self.ephemeral_file.mutable_tail)
|
||||
.expect("IO is not yet ongoing");
|
||||
let (mutable_tail, res) = self
|
||||
.ephemeral_file
|
||||
.file
|
||||
.write_all_at(mutable_tail, self.blknum as u64 * PAGE_SZ as u64)
|
||||
.await;
|
||||
// TODO: If we panic before we can put the mutable_tail back, subsequent calls will fail.
|
||||
// I.e., the IO isn't retryable if we panic.
|
||||
self.ephemeral_file.mutable_tail = Some(mutable_tail);
|
||||
match res {
|
||||
Ok(_) => {
|
||||
// Pre-warm the page cache with what we just wrote.
|
||||
// This isn't necessary for coherency/correctness, but it's how we've always done it.
|
||||
let cache = page_cache::get();
|
||||
match cache
|
||||
.read_immutable_buf(
|
||||
self.ephemeral_file.page_cache_file_id,
|
||||
self.blknum,
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(page_cache::ReadBufResult::Found(_guard)) => {
|
||||
// This function takes &mut self, so, it shouldn't be possible to reach this point.
|
||||
unreachable!("we just wrote blknum {} and this function takes &mut self, so, no concurrent read_blk is possible", self.blknum);
|
||||
}
|
||||
Ok(page_cache::ReadBufResult::NotFound(mut write_guard)) => {
|
||||
let buf: &mut [u8] = write_guard.deref_mut();
|
||||
debug_assert_eq!(buf.len(), PAGE_SZ);
|
||||
buf.copy_from_slice(
|
||||
self.ephemeral_file
|
||||
.mutable_tail
|
||||
.as_deref()
|
||||
.expect("IO is not ongoing"),
|
||||
);
|
||||
let _ = write_guard.mark_valid();
|
||||
// pre-warm successful
|
||||
}
|
||||
Err(e) => {
|
||||
error!("ephemeral_file write_blob failed to get immutable buf to pre-warm page cache: {e:?}");
|
||||
// fail gracefully, it's not the end of the world if we can't pre-warm the cache here
|
||||
}
|
||||
}
|
||||
// Zero the buffer for re-use.
|
||||
// Zeroing is critical for correcntess because the write_blob code below
|
||||
// and similarly read_blk expect zeroed pages.
|
||||
self.ephemeral_file
|
||||
.mutable_tail
|
||||
.as_deref_mut()
|
||||
.expect("IO is not ongoing")
|
||||
.fill(0);
|
||||
// This block is done, move to next one.
|
||||
self.blknum += 1;
|
||||
self.off = 0;
|
||||
}
|
||||
Err(e) => {
|
||||
return Err(std::io::Error::new(
|
||||
ErrorKind::Other,
|
||||
// order error before path because path is long and error is short
|
||||
format!(
|
||||
"ephemeral_file: write_blob: write-back full tail blk #{}: {:#}: {}",
|
||||
self.blknum,
|
||||
e,
|
||||
self.ephemeral_file.file.path,
|
||||
),
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
let pos = self.len;
|
||||
let mut writer = Writer::new(self)?;
|
||||
let pos = self.rw.bytes_written();
|
||||
|
||||
// Write the length field
|
||||
if srcbuf.len() < 0x80 {
|
||||
// short one-byte length header
|
||||
let len_buf = [srcbuf.len() as u8];
|
||||
writer.push_bytes(&len_buf, ctx).await?;
|
||||
|
||||
self.rw.write_all_borrowed(&len_buf).await?;
|
||||
} else {
|
||||
let mut len_buf = u32::to_be_bytes(srcbuf.len() as u32);
|
||||
len_buf[0] |= 0x80;
|
||||
writer.push_bytes(&len_buf, ctx).await?;
|
||||
self.rw.write_all_borrowed(&len_buf).await?;
|
||||
}
|
||||
|
||||
// Write the payload
|
||||
writer.push_bytes(srcbuf, ctx).await?;
|
||||
|
||||
if srcbuf.len() < 0x80 {
|
||||
self.len += 1;
|
||||
} else {
|
||||
self.len += 4;
|
||||
}
|
||||
self.len += srcbuf.len() as u64;
|
||||
self.rw.write_all_borrowed(srcbuf).await?;
|
||||
|
||||
Ok(pos)
|
||||
}
|
||||
@@ -271,28 +107,6 @@ pub fn is_ephemeral_file(filename: &str) -> bool {
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for EphemeralFile {
|
||||
fn drop(&mut self) {
|
||||
// There might still be pages in the [`crate::page_cache`] for this file.
|
||||
// We leave them there, [`crate::page_cache::PageCache::find_victim`] will evict them when needed.
|
||||
|
||||
// unlink the file
|
||||
let res = std::fs::remove_file(&self.file.path);
|
||||
if let Err(e) = res {
|
||||
if e.kind() != std::io::ErrorKind::NotFound {
|
||||
// just never log the not found errors, we cannot do anything for them; on detach
|
||||
// the tenant directory is already gone.
|
||||
//
|
||||
// not found files might also be related to https://github.com/neondatabase/neon/issues/2442
|
||||
error!(
|
||||
"could not remove ephemeral file '{}': {}",
|
||||
self.file.path, e
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl BlockReader for EphemeralFile {
|
||||
fn block_cursor(&self) -> super::block_io::BlockCursor<'_> {
|
||||
BlockCursor::new(super::block_io::BlockReaderRef::EphemeralFile(self))
|
||||
|
||||
105
pageserver/src/tenant/ephemeral_file/page_caching.rs
Normal file
105
pageserver/src/tenant/ephemeral_file/page_caching.rs
Normal file
@@ -0,0 +1,105 @@
|
||||
//! Wrapper around [`super::zero_padded_read_write::RW`] that uses the
|
||||
//! [`crate::page_cache`] to serve reads that need to go to the underlying [`VirtualFile`].
|
||||
|
||||
use crate::context::RequestContext;
|
||||
use crate::page_cache::{self, PAGE_SZ};
|
||||
use crate::tenant::block_io::BlockLease;
|
||||
use crate::virtual_file::VirtualFile;
|
||||
|
||||
use std::io;
|
||||
use tracing::*;
|
||||
|
||||
use super::zero_padded_read_write;
|
||||
|
||||
/// See module-level comment.
|
||||
pub struct RW {
|
||||
page_cache_file_id: page_cache::FileId,
|
||||
rw: super::zero_padded_read_write::RW,
|
||||
}
|
||||
|
||||
impl RW {
|
||||
pub fn new(file: VirtualFile) -> Self {
|
||||
Self {
|
||||
page_cache_file_id: page_cache::next_file_id(),
|
||||
rw: super::zero_padded_read_write::RW::new(file),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn page_cache_file_id(&self) -> page_cache::FileId {
|
||||
self.page_cache_file_id
|
||||
}
|
||||
|
||||
pub(crate) async fn write_all_borrowed(&mut self, srcbuf: &[u8]) -> Result<usize, io::Error> {
|
||||
// It doesn't make sense to proactively fill the page cache on the Pageserver write path
|
||||
// because Compute is unlikely to access recently written data.
|
||||
self.rw.write_all_borrowed(srcbuf).await
|
||||
}
|
||||
|
||||
pub(crate) fn bytes_written(&self) -> u64 {
|
||||
self.rw.bytes_written()
|
||||
}
|
||||
|
||||
pub(crate) async fn read_blk(
|
||||
&self,
|
||||
blknum: u32,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<BlockLease, io::Error> {
|
||||
match self.rw.read_blk(blknum).await? {
|
||||
zero_padded_read_write::ReadResult::NeedsReadFromVirtualFile { virtual_file } => {
|
||||
let cache = page_cache::get();
|
||||
match cache
|
||||
.read_immutable_buf(self.page_cache_file_id, blknum, ctx)
|
||||
.await
|
||||
.map_err(|e| {
|
||||
std::io::Error::new(
|
||||
std::io::ErrorKind::Other,
|
||||
// order path before error because error is anyhow::Error => might have many contexts
|
||||
format!(
|
||||
"ephemeral file: read immutable page #{}: {}: {:#}",
|
||||
blknum,
|
||||
self.rw.as_inner_virtual_file().path,
|
||||
e,
|
||||
),
|
||||
)
|
||||
})? {
|
||||
page_cache::ReadBufResult::Found(guard) => {
|
||||
return Ok(BlockLease::PageReadGuard(guard))
|
||||
}
|
||||
page_cache::ReadBufResult::NotFound(write_guard) => {
|
||||
let write_guard = virtual_file
|
||||
.read_exact_at_page(write_guard, blknum as u64 * PAGE_SZ as u64)
|
||||
.await?;
|
||||
let read_guard = write_guard.mark_valid();
|
||||
return Ok(BlockLease::PageReadGuard(read_guard));
|
||||
}
|
||||
}
|
||||
}
|
||||
zero_padded_read_write::ReadResult::ServedFromZeroPaddedMutableTail { buffer } => {
|
||||
Ok(BlockLease::EphemeralFileMutableTail(buffer))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for RW {
|
||||
fn drop(&mut self) {
|
||||
// There might still be pages in the [`crate::page_cache`] for this file.
|
||||
// We leave them there, [`crate::page_cache::PageCache::find_victim`] will evict them when needed.
|
||||
|
||||
// unlink the file
|
||||
let res = std::fs::remove_file(&self.rw.as_inner_virtual_file().path);
|
||||
if let Err(e) = res {
|
||||
if e.kind() != std::io::ErrorKind::NotFound {
|
||||
// just never log the not found errors, we cannot do anything for them; on detach
|
||||
// the tenant directory is already gone.
|
||||
//
|
||||
// not found files might also be related to https://github.com/neondatabase/neon/issues/2442
|
||||
error!(
|
||||
"could not remove ephemeral file '{}': {}",
|
||||
self.rw.as_inner_virtual_file().path,
|
||||
e
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
124
pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs
Normal file
124
pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs
Normal file
@@ -0,0 +1,124 @@
|
||||
//! The heart of how [`super::EphemeralFile`] does its reads and writes.
|
||||
//!
|
||||
//! # Writes
|
||||
//!
|
||||
//! [`super::EphemeralFile`] writes small, borrowed buffers using [`RW::write_all_borrowed`].
|
||||
//! The [`RW`] batches these into [`RW::TAIL_SZ`] bigger writes, using [`owned_buffers_io::write::BufferedWriter`].
|
||||
//!
|
||||
//! # Reads
|
||||
//!
|
||||
//! [`super::EphemeralFile`] always reads full [`PAGE_SZ`]ed blocks using [`RW::read_blk`].
|
||||
//!
|
||||
//! The [`RW`] serves these reads either from the buffered writer's in-memory buffer
|
||||
//! or redirects the caller to read from the underlying [`VirtualFile`]` if they have already
|
||||
//! been flushed.
|
||||
//!
|
||||
//! The current caller is [`super::page_caching::RW`]. In case it gets redirected to read from
|
||||
//! [`VirtualFile`], it consults the [`crate::page_cache`] first.
|
||||
|
||||
mod zero_padded_buffer;
|
||||
|
||||
use crate::{
|
||||
page_cache::PAGE_SZ,
|
||||
tenant::ephemeral_file::zero_padded_buffer,
|
||||
virtual_file::{
|
||||
owned_buffers_io::{self, write::Buffer},
|
||||
VirtualFile,
|
||||
},
|
||||
};
|
||||
|
||||
/// See module-level comment.
|
||||
pub struct RW {
|
||||
buffered_writer: owned_buffers_io::write::BufferedWriter<
|
||||
zero_padded_buffer::Buf<{ Self::TAIL_SZ }>,
|
||||
owned_buffers_io::util::size_tracking_writer::Writer<VirtualFile>,
|
||||
>,
|
||||
}
|
||||
|
||||
pub enum ReadResult<'a> {
|
||||
NeedsReadFromVirtualFile { virtual_file: &'a VirtualFile },
|
||||
ServedFromZeroPaddedMutableTail { buffer: &'a [u8; PAGE_SZ] },
|
||||
}
|
||||
|
||||
impl RW {
|
||||
const TAIL_SZ: usize = 64 * 1024;
|
||||
|
||||
pub fn new(file: VirtualFile) -> Self {
|
||||
let bytes_flushed_tracker = owned_buffers_io::util::size_tracking_writer::Writer::new(file);
|
||||
let buffered_writer = owned_buffers_io::write::BufferedWriter::new(
|
||||
bytes_flushed_tracker,
|
||||
zero_padded_buffer::Buf::default(),
|
||||
);
|
||||
Self { buffered_writer }
|
||||
}
|
||||
|
||||
pub(crate) fn as_inner_virtual_file(&self) -> &VirtualFile {
|
||||
self.buffered_writer.as_inner().as_inner()
|
||||
}
|
||||
|
||||
pub async fn write_all_borrowed(&mut self, buf: &[u8]) -> std::io::Result<usize> {
|
||||
self.buffered_writer.write_buffered_borrowed(buf).await
|
||||
}
|
||||
|
||||
pub fn bytes_written(&self) -> u64 {
|
||||
let flushed_offset = self.buffered_writer.as_inner().bytes_written();
|
||||
let buffer: &zero_padded_buffer::Buf<{ Self::TAIL_SZ }> =
|
||||
self.buffered_writer.inspect_buffer();
|
||||
flushed_offset + u64::try_from(buffer.pending()).unwrap()
|
||||
}
|
||||
|
||||
pub(crate) async fn read_blk(&self, blknum: u32) -> Result<ReadResult, std::io::Error> {
|
||||
let flushed_offset = self.buffered_writer.as_inner().bytes_written();
|
||||
let buffer: &zero_padded_buffer::Buf<{ Self::TAIL_SZ }> =
|
||||
self.buffered_writer.inspect_buffer();
|
||||
let buffered_offset = flushed_offset + u64::try_from(buffer.pending()).unwrap();
|
||||
let read_offset = (blknum as u64) * (PAGE_SZ as u64);
|
||||
|
||||
assert_eq!(
|
||||
flushed_offset % (Self::TAIL_SZ as u64), 0,
|
||||
"we only use write_buffered_borrowed to write to the buffered writer, so it's guaranteed that flushes happen buffer.cap()-sized chunks"
|
||||
);
|
||||
assert_eq!(
|
||||
flushed_offset % (PAGE_SZ as u64),
|
||||
0,
|
||||
"the logic below can't handle if the page is spread across the flushed part and the buffer"
|
||||
);
|
||||
|
||||
if read_offset < flushed_offset {
|
||||
assert!(
|
||||
read_offset + (PAGE_SZ as u64) <= flushed_offset,
|
||||
"this impl can't deal with pages spread across flushed & buffered part"
|
||||
);
|
||||
Ok(ReadResult::NeedsReadFromVirtualFile {
|
||||
virtual_file: self.as_inner_virtual_file(),
|
||||
})
|
||||
} else {
|
||||
let read_until_offset = read_offset + (PAGE_SZ as u64);
|
||||
if !(0..buffered_offset).contains(&read_until_offset) {
|
||||
// The blob_io code relies on the reader allowing reads past
|
||||
// the end of what was written, up to end of the current PAGE_SZ chunk.
|
||||
// This is a relict of the past where we would get a pre-zeroed page from the page cache.
|
||||
//
|
||||
// DeltaLayer probably has the same issue, not sure why it needs no special treatment.
|
||||
let nbytes_past_end = read_until_offset.checked_sub(buffered_offset).unwrap();
|
||||
if nbytes_past_end >= (PAGE_SZ as u64) {
|
||||
// TODO: treat this as error. Pre-existing issue before this patch.
|
||||
panic!(
|
||||
"return IO error: read past end of file: read=0x{read_offset:x} buffered=0x{buffered_offset:x} flushed=0x{flushed_offset}"
|
||||
)
|
||||
}
|
||||
}
|
||||
let read_offset_in_buffer = read_offset
|
||||
.checked_sub(flushed_offset)
|
||||
.expect("would have taken `if` branch instead of this one");
|
||||
let read_offset_in_buffer = usize::try_from(read_offset_in_buffer).unwrap();
|
||||
let zero_padded_slice = buffer.as_zero_padded_slice();
|
||||
let page = &zero_padded_slice[read_offset_in_buffer..(read_offset_in_buffer + PAGE_SZ)];
|
||||
Ok(ReadResult::ServedFromZeroPaddedMutableTail {
|
||||
buffer: page
|
||||
.try_into()
|
||||
.expect("the slice above got it as page-size slice"),
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,98 @@
|
||||
use std::mem::MaybeUninit;
|
||||
|
||||
pub struct Buf<const N: usize> {
|
||||
allocation: Box<[u8; N]>,
|
||||
written: usize,
|
||||
}
|
||||
|
||||
impl<const N: usize> Default for Buf<N> {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
allocation: Box::new(
|
||||
// SAFETY: zeroed memory is a valid [u8; N]
|
||||
unsafe { MaybeUninit::zeroed().assume_init() },
|
||||
),
|
||||
written: 0,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<const N: usize> Buf<N> {
|
||||
#[inline(always)]
|
||||
fn invariants(&self) {
|
||||
// don't check by default, unoptimized is too expensive even for debug mode
|
||||
if false {
|
||||
debug_assert!(self.written <= N, "{}", self.written);
|
||||
debug_assert!(self.allocation[self.written..N].iter().all(|v| *v == 0));
|
||||
}
|
||||
}
|
||||
|
||||
pub fn as_zero_padded_slice(&self) -> &[u8; N] {
|
||||
&self.allocation
|
||||
}
|
||||
}
|
||||
|
||||
impl<const N: usize> crate::virtual_file::owned_buffers_io::write::Buffer for Buf<N> {
|
||||
type IoBuf = Self;
|
||||
|
||||
fn cap(&self) -> usize {
|
||||
self.allocation.len()
|
||||
}
|
||||
|
||||
fn extend_from_slice(&mut self, other: &[u8]) {
|
||||
self.invariants();
|
||||
let remaining = self.cap() - other.len();
|
||||
if other.len() > remaining {
|
||||
panic!("calling extend_from_slice() with insufficient remaining capacity");
|
||||
}
|
||||
self.allocation[self.written..(self.written + other.len())].copy_from_slice(other);
|
||||
self.written += other.len();
|
||||
self.invariants();
|
||||
}
|
||||
|
||||
fn pending(&self) -> usize {
|
||||
self.written
|
||||
}
|
||||
|
||||
fn flush(self) -> tokio_epoll_uring::Slice<Self> {
|
||||
self.invariants();
|
||||
let written = self.written;
|
||||
tokio_epoll_uring::BoundedBuf::slice(self, 0..written)
|
||||
}
|
||||
|
||||
fn reuse_after_flush(iobuf: Self::IoBuf) -> Self {
|
||||
let Self {
|
||||
mut allocation,
|
||||
written,
|
||||
} = iobuf;
|
||||
allocation[0..written].fill(0);
|
||||
let new = Self {
|
||||
allocation,
|
||||
written: 0,
|
||||
};
|
||||
new.invariants();
|
||||
new
|
||||
}
|
||||
}
|
||||
|
||||
/// We have this implementation so that [`Buf<N>`] can be used with [`crate::virtual_file::owned_buffers_io::write::BufferedWriter`].
|
||||
///
|
||||
///
|
||||
/// SAFETY:
|
||||
///
|
||||
/// The [`Self::allocation`] is stable becauses boxes are stable.
|
||||
/// The memory is zero-initialized, so, bytes_init is always N.
|
||||
///
|
||||
unsafe impl<const N: usize> tokio_epoll_uring::IoBuf for Buf<N> {
|
||||
fn stable_ptr(&self) -> *const u8 {
|
||||
self.allocation.as_ptr()
|
||||
}
|
||||
|
||||
fn bytes_init(&self) -> usize {
|
||||
N
|
||||
}
|
||||
|
||||
fn bytes_total(&self) -> usize {
|
||||
N
|
||||
}
|
||||
}
|
||||
@@ -1410,9 +1410,15 @@ impl TenantManager {
|
||||
|
||||
match tenant.current_state() {
|
||||
TenantState::Broken { .. } | TenantState::Stopping { .. } => {
|
||||
// If a tenant is broken or stopping, DeleteTenantFlow can
|
||||
// handle it: broken tenants proceed to delete, stopping tenants
|
||||
// are checked for deletion already in progress.
|
||||
// If deletion is already in progress, return success (the semantics of this
|
||||
// function are to rerturn success afterr deletion is spawned in background).
|
||||
// Otherwise fall through and let [`DeleteTenantFlow`] handle this state.
|
||||
if DeleteTenantFlow::is_in_progress(&tenant) {
|
||||
// The `delete_progress` lock is held: deletion is already happening
|
||||
// in the bacckground
|
||||
slot_guard.revert();
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
_ => {
|
||||
tenant
|
||||
|
||||
@@ -7,6 +7,7 @@ use std::collections::HashSet;
|
||||
use std::future::Future;
|
||||
|
||||
use anyhow::{anyhow, Context};
|
||||
use bytes::BytesMut;
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use tokio::fs::{self, File, OpenOptions};
|
||||
@@ -194,10 +195,10 @@ async fn download_object<'a>(
|
||||
// There's chunks_vectored() on the stream.
|
||||
let (bytes_amount, destination_file) = async {
|
||||
let size_tracking = size_tracking_writer::Writer::new(destination_file);
|
||||
let mut buffered = owned_buffers_io::write::BufferedWriter::<
|
||||
{ super::BUFFER_SIZE },
|
||||
_,
|
||||
>::new(size_tracking);
|
||||
let mut buffered = owned_buffers_io::write::BufferedWriter::<BytesMut, _>::new(
|
||||
size_tracking,
|
||||
BytesMut::with_capacity(super::BUFFER_SIZE),
|
||||
);
|
||||
while let Some(res) =
|
||||
futures::StreamExt::next(&mut download.download_stream).await
|
||||
{
|
||||
|
||||
@@ -167,7 +167,7 @@ pub(crate) async fn time_travel_recover_tenant(
|
||||
let warn_after = 3;
|
||||
let max_attempts = 10;
|
||||
let mut prefixes = Vec::with_capacity(2);
|
||||
if tenant_shard_id.is_zero() {
|
||||
if tenant_shard_id.is_shard_zero() {
|
||||
// Also recover the unsharded prefix for a shard of zero:
|
||||
// - if the tenant is totally unsharded, the unsharded prefix contains all the data
|
||||
// - if the tenant is sharded, we still want to recover the initdb data, but we only
|
||||
|
||||
@@ -939,7 +939,7 @@ impl DeltaLayerInner {
|
||||
}
|
||||
|
||||
if !range_end_handled {
|
||||
tracing::info!("Handling range end fallback at {}", data_end_offset);
|
||||
tracing::debug!("Handling range end fallback at {}", data_end_offset);
|
||||
planner.handle_range_end(data_end_offset);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -454,7 +454,7 @@ impl InMemoryLayer {
|
||||
trace!("initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}");
|
||||
|
||||
let file = EphemeralFile::create(conf, tenant_shard_id, timeline_id).await?;
|
||||
let key = InMemoryLayerFileId(file.id());
|
||||
let key = InMemoryLayerFileId(file.page_cache_file_id());
|
||||
|
||||
Ok(InMemoryLayer {
|
||||
file_id: key,
|
||||
|
||||
@@ -1344,7 +1344,7 @@ impl Timeline {
|
||||
background_jobs_can_start: Option<&completion::Barrier>,
|
||||
ctx: &RequestContext,
|
||||
) {
|
||||
if self.tenant_shard_id.is_zero() {
|
||||
if self.tenant_shard_id.is_shard_zero() {
|
||||
// Logical size is only maintained accurately on shard zero.
|
||||
self.spawn_initial_logical_size_computation_task(ctx);
|
||||
}
|
||||
@@ -2237,7 +2237,7 @@ impl Timeline {
|
||||
priority: GetLogicalSizePriority,
|
||||
ctx: &RequestContext,
|
||||
) -> logical_size::CurrentLogicalSize {
|
||||
if !self.tenant_shard_id.is_zero() {
|
||||
if !self.tenant_shard_id.is_shard_zero() {
|
||||
// Logical size is only accurately maintained on shard zero: when called elsewhere, for example
|
||||
// when HTTP API is serving a GET for timeline zero, return zero
|
||||
return logical_size::CurrentLogicalSize::Approximate(logical_size::Approximate::zero());
|
||||
@@ -2533,7 +2533,7 @@ impl Timeline {
|
||||
crate::span::debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
// We should never be calculating logical sizes on shard !=0, because these shards do not have
|
||||
// accurate relation sizes, and they do not emit consumption metrics.
|
||||
debug_assert!(self.tenant_shard_id.is_zero());
|
||||
debug_assert!(self.tenant_shard_id.is_shard_zero());
|
||||
|
||||
let guard = self
|
||||
.gate
|
||||
|
||||
@@ -378,7 +378,7 @@ impl Timeline {
|
||||
gate: &GateGuard,
|
||||
ctx: &RequestContext,
|
||||
) -> ControlFlow<()> {
|
||||
if !self.tenant_shard_id.is_zero() {
|
||||
if !self.tenant_shard_id.is_shard_zero() {
|
||||
// Shards !=0 do not maintain accurate relation sizes, and do not need to calculate logical size
|
||||
// for consumption metrics (consumption metrics are only sent from shard 0). We may therefore
|
||||
// skip imitating logical size accesses for eviction purposes.
|
||||
|
||||
@@ -427,7 +427,7 @@ pub(super) async fn handle_walreceiver_connection(
|
||||
|
||||
// Send the replication feedback message.
|
||||
// Regular standby_status_update fields are put into this message.
|
||||
let current_timeline_size = if timeline.tenant_shard_id.is_zero() {
|
||||
let current_timeline_size = if timeline.tenant_shard_id.is_shard_zero() {
|
||||
timeline
|
||||
.get_current_logical_size(
|
||||
crate::tenant::timeline::GetLogicalSizePriority::User,
|
||||
|
||||
@@ -36,7 +36,8 @@ pub(crate) use io_engine::IoEngineKind;
|
||||
pub(crate) use metadata::Metadata;
|
||||
pub(crate) use open_options::*;
|
||||
|
||||
#[cfg_attr(not(target_os = "linux"), allow(dead_code))]
|
||||
use self::owned_buffers_io::write::OwnedAsyncWriter;
|
||||
|
||||
pub(crate) mod owned_buffers_io {
|
||||
//! Abstractions for IO with owned buffers.
|
||||
//!
|
||||
@@ -1083,6 +1084,17 @@ impl Drop for VirtualFile {
|
||||
}
|
||||
}
|
||||
|
||||
impl OwnedAsyncWriter for VirtualFile {
|
||||
#[inline(always)]
|
||||
async fn write_all<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
|
||||
&mut self,
|
||||
buf: B,
|
||||
) -> std::io::Result<(usize, B::Buf)> {
|
||||
let (buf, res) = VirtualFile::write_all(self, buf).await;
|
||||
res.map(move |v| (v, buf))
|
||||
}
|
||||
}
|
||||
|
||||
impl OpenFiles {
|
||||
fn new(num_slots: usize) -> OpenFiles {
|
||||
let mut slots = Box::new(Vec::with_capacity(num_slots));
|
||||
|
||||
@@ -1,33 +1,41 @@
|
||||
use crate::virtual_file::{owned_buffers_io::write::OwnedAsyncWriter, VirtualFile};
|
||||
use crate::virtual_file::owned_buffers_io::write::OwnedAsyncWriter;
|
||||
use tokio_epoll_uring::{BoundedBuf, IoBuf};
|
||||
|
||||
pub struct Writer {
|
||||
dst: VirtualFile,
|
||||
pub struct Writer<W> {
|
||||
dst: W,
|
||||
bytes_amount: u64,
|
||||
}
|
||||
|
||||
impl Writer {
|
||||
pub fn new(dst: VirtualFile) -> Self {
|
||||
impl<W> Writer<W> {
|
||||
pub fn new(dst: W) -> Self {
|
||||
Self {
|
||||
dst,
|
||||
bytes_amount: 0,
|
||||
}
|
||||
}
|
||||
pub fn bytes_written(&self) -> u64 {
|
||||
self.bytes_amount
|
||||
}
|
||||
pub fn as_inner(&self) -> &W {
|
||||
&self.dst
|
||||
}
|
||||
/// Returns the wrapped `VirtualFile` object as well as the number
|
||||
/// of bytes that were written to it through this object.
|
||||
pub fn into_inner(self) -> (u64, VirtualFile) {
|
||||
pub fn into_inner(self) -> (u64, W) {
|
||||
(self.bytes_amount, self.dst)
|
||||
}
|
||||
}
|
||||
|
||||
impl OwnedAsyncWriter for Writer {
|
||||
impl<W> OwnedAsyncWriter for Writer<W>
|
||||
where
|
||||
W: OwnedAsyncWriter,
|
||||
{
|
||||
#[inline(always)]
|
||||
async fn write_all<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
|
||||
&mut self,
|
||||
buf: B,
|
||||
) -> std::io::Result<(usize, B::Buf)> {
|
||||
let (buf, res) = self.dst.write_all(buf).await;
|
||||
let nwritten = res?;
|
||||
let (nwritten, buf) = self.dst.write_all(buf).await?;
|
||||
self.bytes_amount += u64::try_from(nwritten).unwrap();
|
||||
Ok((nwritten, buf))
|
||||
}
|
||||
|
||||
@@ -10,14 +10,14 @@ pub trait OwnedAsyncWriter {
|
||||
) -> std::io::Result<(usize, B::Buf)>;
|
||||
}
|
||||
|
||||
/// A wrapper aorund an [`OwnedAsyncWriter`] that batches smaller writers
|
||||
/// into `BUFFER_SIZE`-sized writes.
|
||||
/// A wrapper aorund an [`OwnedAsyncWriter`] that uses a [`Buffer`] to batch
|
||||
/// small writes into larger writes of size [`Buffer::cap`].
|
||||
///
|
||||
/// # Passthrough Of Large Writers
|
||||
///
|
||||
/// Buffered writes larger than the `BUFFER_SIZE` cause the internal
|
||||
/// buffer to be flushed, even if it is not full yet. Then, the large
|
||||
/// buffered write is passed through to the unerlying [`OwnedAsyncWriter`].
|
||||
/// Calls to [`BufferedWriter::write_buffered`] that are larger than [`Buffer::cap`]
|
||||
/// cause the internal buffer to be flushed prematurely so that the large
|
||||
/// buffered write is passed through to the underlying [`OwnedAsyncWriter`].
|
||||
///
|
||||
/// This pass-through is generally beneficial for throughput, but if
|
||||
/// the storage backend of the [`OwnedAsyncWriter`] is a shared resource,
|
||||
@@ -25,27 +25,37 @@ pub trait OwnedAsyncWriter {
|
||||
///
|
||||
/// In such cases, a different implementation that always buffers in memory
|
||||
/// may be preferable.
|
||||
pub struct BufferedWriter<const BUFFER_SIZE: usize, W> {
|
||||
pub struct BufferedWriter<B, W> {
|
||||
writer: W,
|
||||
// invariant: always remains Some(buf)
|
||||
// with buf.capacity() == BUFFER_SIZE except
|
||||
// - while IO is ongoing => goes back to Some() once the IO completed successfully
|
||||
// - after an IO error => stays `None` forever
|
||||
// In these exceptional cases, it's `None`.
|
||||
buf: Option<BytesMut>,
|
||||
/// invariant: always remains Some(buf) except
|
||||
/// - while IO is ongoing => goes back to Some() once the IO completed successfully
|
||||
/// - after an IO error => stays `None` forever
|
||||
/// In these exceptional cases, it's `None`.
|
||||
buf: Option<B>,
|
||||
}
|
||||
|
||||
impl<const BUFFER_SIZE: usize, W> BufferedWriter<BUFFER_SIZE, W>
|
||||
impl<B, Buf, W> BufferedWriter<B, W>
|
||||
where
|
||||
B: Buffer<IoBuf = Buf> + Send,
|
||||
Buf: IoBuf + Send,
|
||||
W: OwnedAsyncWriter,
|
||||
{
|
||||
pub fn new(writer: W) -> Self {
|
||||
pub fn new(writer: W, buf: B) -> Self {
|
||||
Self {
|
||||
writer,
|
||||
buf: Some(BytesMut::with_capacity(BUFFER_SIZE)),
|
||||
buf: Some(buf),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn as_inner(&self) -> &W {
|
||||
&self.writer
|
||||
}
|
||||
|
||||
/// Panics if used after any of the write paths returned an error
|
||||
pub fn inspect_buffer(&self) -> &B {
|
||||
self.buf()
|
||||
}
|
||||
|
||||
pub async fn flush_and_into_inner(mut self) -> std::io::Result<W> {
|
||||
self.flush().await?;
|
||||
let Self { buf, writer } = self;
|
||||
@@ -53,61 +63,143 @@ where
|
||||
Ok(writer)
|
||||
}
|
||||
|
||||
pub async fn write_buffered<B: IoBuf>(&mut self, chunk: Slice<B>) -> std::io::Result<()>
|
||||
#[inline(always)]
|
||||
fn buf(&self) -> &B {
|
||||
self.buf
|
||||
.as_ref()
|
||||
.expect("must not use after we returned an error")
|
||||
}
|
||||
|
||||
pub async fn write_buffered<S: IoBuf>(&mut self, chunk: Slice<S>) -> std::io::Result<(usize, S)>
|
||||
where
|
||||
B: IoBuf + Send,
|
||||
S: IoBuf + Send,
|
||||
{
|
||||
let chunk_len = chunk.len();
|
||||
// avoid memcpy for the middle of the chunk
|
||||
if chunk.len() >= BUFFER_SIZE {
|
||||
if chunk.len() >= self.buf().cap() {
|
||||
self.flush().await?;
|
||||
// do a big write, bypassing `buf`
|
||||
assert_eq!(
|
||||
self.buf
|
||||
.as_ref()
|
||||
.expect("must not use after an error")
|
||||
.len(),
|
||||
.pending(),
|
||||
0
|
||||
);
|
||||
let chunk_len = chunk.len();
|
||||
let (nwritten, chunk) = self.writer.write_all(chunk).await?;
|
||||
assert_eq!(nwritten, chunk_len);
|
||||
drop(chunk);
|
||||
return Ok(());
|
||||
return Ok((nwritten, chunk));
|
||||
}
|
||||
// in-memory copy the < BUFFER_SIZED tail of the chunk
|
||||
assert!(chunk.len() < BUFFER_SIZE);
|
||||
let mut chunk = &chunk[..];
|
||||
assert!(chunk.len() < self.buf().cap());
|
||||
let mut slice = &chunk[..];
|
||||
while !slice.is_empty() {
|
||||
let buf = self.buf.as_mut().expect("must not use after an error");
|
||||
let need = buf.cap() - buf.pending();
|
||||
let have = slice.len();
|
||||
let n = std::cmp::min(need, have);
|
||||
buf.extend_from_slice(&slice[..n]);
|
||||
slice = &slice[n..];
|
||||
if buf.pending() >= buf.cap() {
|
||||
assert_eq!(buf.pending(), buf.cap());
|
||||
self.flush().await?;
|
||||
}
|
||||
}
|
||||
assert!(slice.is_empty(), "by now we should have drained the chunk");
|
||||
Ok((chunk_len, chunk.into_inner()))
|
||||
}
|
||||
|
||||
/// Strictly less performant variant of [`Self::write_buffered`] that allows writing borrowed data.
|
||||
///
|
||||
/// It is less performant because we always have to copy the borrowed data into the internal buffer
|
||||
/// before we can do the IO. The [`Self::write_buffered`] can avoid this, which is more performant
|
||||
/// for large writes.
|
||||
pub async fn write_buffered_borrowed(&mut self, mut chunk: &[u8]) -> std::io::Result<usize> {
|
||||
let chunk_len = chunk.len();
|
||||
while !chunk.is_empty() {
|
||||
let buf = self.buf.as_mut().expect("must not use after an error");
|
||||
let need = BUFFER_SIZE - buf.len();
|
||||
let need = buf.cap() - buf.pending();
|
||||
let have = chunk.len();
|
||||
let n = std::cmp::min(need, have);
|
||||
buf.extend_from_slice(&chunk[..n]);
|
||||
chunk = &chunk[n..];
|
||||
if buf.len() >= BUFFER_SIZE {
|
||||
assert_eq!(buf.len(), BUFFER_SIZE);
|
||||
if buf.pending() >= buf.cap() {
|
||||
assert_eq!(buf.pending(), buf.cap());
|
||||
self.flush().await?;
|
||||
}
|
||||
}
|
||||
assert!(chunk.is_empty(), "by now we should have drained the chunk");
|
||||
Ok(())
|
||||
Ok(chunk_len)
|
||||
}
|
||||
|
||||
async fn flush(&mut self) -> std::io::Result<()> {
|
||||
let buf = self.buf.take().expect("must not use after an error");
|
||||
if buf.is_empty() {
|
||||
let buf_len = buf.pending();
|
||||
if buf_len == 0 {
|
||||
self.buf = Some(buf);
|
||||
return std::io::Result::Ok(());
|
||||
return Ok(());
|
||||
}
|
||||
let buf_len = buf.len();
|
||||
let (nwritten, mut buf) = self.writer.write_all(buf).await?;
|
||||
let (nwritten, io_buf) = self.writer.write_all(buf.flush()).await?;
|
||||
assert_eq!(nwritten, buf_len);
|
||||
buf.clear();
|
||||
self.buf = Some(buf);
|
||||
self.buf = Some(Buffer::reuse_after_flush(io_buf));
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// A [`Buffer`] is used by [`BufferedWriter`] to batch smaller writes into larger ones.
|
||||
pub trait Buffer {
|
||||
type IoBuf: IoBuf;
|
||||
|
||||
/// Capacity of the buffer. Must not change over the lifetime `self`.`
|
||||
fn cap(&self) -> usize;
|
||||
|
||||
/// Add data to the buffer.
|
||||
/// Panics if there is not enough room to accomodate `other`'s content, i.e.,
|
||||
/// panics if `other.len() > self.cap() - self.pending()`.
|
||||
fn extend_from_slice(&mut self, other: &[u8]);
|
||||
|
||||
/// Number of bytes in the buffer.
|
||||
fn pending(&self) -> usize;
|
||||
|
||||
/// Turns `self` into a [`tokio_epoll_uring::Slice`] of the pending data
|
||||
/// so we can use [`tokio_epoll_uring`] to write it to disk.
|
||||
fn flush(self) -> Slice<Self::IoBuf>;
|
||||
|
||||
/// After the write to disk is done and we have gotten back the slice,
|
||||
/// [`BufferedWriter`] uses this method to re-use the io buffer.
|
||||
fn reuse_after_flush(iobuf: Self::IoBuf) -> Self;
|
||||
}
|
||||
|
||||
impl Buffer for BytesMut {
|
||||
type IoBuf = BytesMut;
|
||||
|
||||
#[inline(always)]
|
||||
fn cap(&self) -> usize {
|
||||
self.capacity()
|
||||
}
|
||||
|
||||
fn extend_from_slice(&mut self, other: &[u8]) {
|
||||
BytesMut::extend_from_slice(self, other)
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn pending(&self) -> usize {
|
||||
self.len()
|
||||
}
|
||||
|
||||
fn flush(self) -> Slice<BytesMut> {
|
||||
if self.is_empty() {
|
||||
return self.slice_full();
|
||||
}
|
||||
let len = self.len();
|
||||
self.slice(0..len)
|
||||
}
|
||||
|
||||
fn reuse_after_flush(mut iobuf: BytesMut) -> Self {
|
||||
iobuf.clear();
|
||||
iobuf
|
||||
}
|
||||
}
|
||||
|
||||
impl OwnedAsyncWriter for Vec<u8> {
|
||||
async fn write_all<B: BoundedBuf<Buf = Buf>, Buf: IoBuf + Send>(
|
||||
&mut self,
|
||||
@@ -125,6 +217,8 @@ impl OwnedAsyncWriter for Vec<u8> {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use bytes::BytesMut;
|
||||
|
||||
use super::*;
|
||||
|
||||
#[derive(Default)]
|
||||
@@ -158,7 +252,7 @@ mod tests {
|
||||
#[tokio::test]
|
||||
async fn test_buffered_writes_only() -> std::io::Result<()> {
|
||||
let recorder = RecorderWriter::default();
|
||||
let mut writer = BufferedWriter::<2, _>::new(recorder);
|
||||
let mut writer = BufferedWriter::new(recorder, BytesMut::with_capacity(2));
|
||||
write!(writer, b"a");
|
||||
write!(writer, b"b");
|
||||
write!(writer, b"c");
|
||||
@@ -175,7 +269,7 @@ mod tests {
|
||||
#[tokio::test]
|
||||
async fn test_passthrough_writes_only() -> std::io::Result<()> {
|
||||
let recorder = RecorderWriter::default();
|
||||
let mut writer = BufferedWriter::<2, _>::new(recorder);
|
||||
let mut writer = BufferedWriter::new(recorder, BytesMut::with_capacity(2));
|
||||
write!(writer, b"abc");
|
||||
write!(writer, b"de");
|
||||
write!(writer, b"");
|
||||
@@ -191,7 +285,7 @@ mod tests {
|
||||
#[tokio::test]
|
||||
async fn test_passthrough_write_with_nonempty_buffer() -> std::io::Result<()> {
|
||||
let recorder = RecorderWriter::default();
|
||||
let mut writer = BufferedWriter::<2, _>::new(recorder);
|
||||
let mut writer = BufferedWriter::new(recorder, BytesMut::with_capacity(2));
|
||||
write!(writer, b"a");
|
||||
write!(writer, b"bc");
|
||||
write!(writer, b"d");
|
||||
@@ -203,4 +297,31 @@ mod tests {
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_write_all_borrowed_always_goes_through_buffer() -> std::io::Result<()> {
|
||||
let recorder = RecorderWriter::default();
|
||||
let mut writer = BufferedWriter::new(recorder, BytesMut::with_capacity(2));
|
||||
|
||||
writer.write_buffered_borrowed(b"abc").await?;
|
||||
writer.write_buffered_borrowed(b"d").await?;
|
||||
writer.write_buffered_borrowed(b"e").await?;
|
||||
writer.write_buffered_borrowed(b"fg").await?;
|
||||
writer.write_buffered_borrowed(b"hi").await?;
|
||||
writer.write_buffered_borrowed(b"j").await?;
|
||||
writer.write_buffered_borrowed(b"klmno").await?;
|
||||
|
||||
let recorder = writer.flush_and_into_inner().await?;
|
||||
assert_eq!(
|
||||
recorder.writes,
|
||||
{
|
||||
let expect: &[&[u8]] = &[b"ab", b"cd", b"ef", b"gh", b"ij", b"kl", b"mn", b"o"];
|
||||
expect
|
||||
}
|
||||
.iter()
|
||||
.map(|v| v[..].to_vec())
|
||||
.collect::<Vec<_>>()
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -403,7 +403,7 @@ impl WalIngest {
|
||||
);
|
||||
|
||||
if !key_is_local {
|
||||
if self.shard.is_zero() {
|
||||
if self.shard.is_shard_zero() {
|
||||
// Shard 0 tracks relation sizes. Although we will not store this block, we will observe
|
||||
// its blkno in case it implicitly extends a relation.
|
||||
self.observe_decoded_block(modification, blk, ctx).await?;
|
||||
|
||||
@@ -20,6 +20,7 @@
|
||||
|
||||
/// Process lifecycle and abstracction for the IPC protocol.
|
||||
mod process;
|
||||
pub use process::Kind as ProcessKind;
|
||||
|
||||
/// Code to apply [`NeonWalRecord`]s.
|
||||
pub(crate) mod apply_neon;
|
||||
@@ -34,7 +35,7 @@ use crate::walrecord::NeonWalRecord;
|
||||
use anyhow::Context;
|
||||
use bytes::{Bytes, BytesMut};
|
||||
use pageserver_api::key::key_to_rel_block;
|
||||
use pageserver_api::models::WalRedoManagerStatus;
|
||||
use pageserver_api::models::{WalRedoManagerProcessStatus, WalRedoManagerStatus};
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
@@ -54,7 +55,7 @@ pub struct PostgresRedoManager {
|
||||
tenant_shard_id: TenantShardId,
|
||||
conf: &'static PageServerConf,
|
||||
last_redo_at: std::sync::Mutex<Option<Instant>>,
|
||||
/// The current [`process::WalRedoProcess`] that is used by new redo requests.
|
||||
/// The current [`process::Process`] that is used by new redo requests.
|
||||
/// We use [`heavier_once_cell`] for coalescing the spawning, but the redo
|
||||
/// requests don't use the [`heavier_once_cell::Guard`] to keep ahold of the
|
||||
/// their process object; we use [`Arc::clone`] for that.
|
||||
@@ -66,7 +67,7 @@ pub struct PostgresRedoManager {
|
||||
/// still be using the old redo process. But, those other tasks will most likely
|
||||
/// encounter an error as well, and errors are an unexpected condition anyway.
|
||||
/// So, probably we could get rid of the `Arc` in the future.
|
||||
redo_process: heavier_once_cell::OnceCell<Arc<process::WalRedoProcess>>,
|
||||
redo_process: heavier_once_cell::OnceCell<Arc<process::Process>>,
|
||||
}
|
||||
|
||||
///
|
||||
@@ -139,8 +140,8 @@ impl PostgresRedoManager {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn status(&self) -> Option<WalRedoManagerStatus> {
|
||||
Some(WalRedoManagerStatus {
|
||||
pub fn status(&self) -> WalRedoManagerStatus {
|
||||
WalRedoManagerStatus {
|
||||
last_redo_at: {
|
||||
let at = *self.last_redo_at.lock().unwrap();
|
||||
at.and_then(|at| {
|
||||
@@ -149,8 +150,14 @@ impl PostgresRedoManager {
|
||||
chrono::Utc::now().checked_sub_signed(chrono::Duration::from_std(age).ok()?)
|
||||
})
|
||||
},
|
||||
pid: self.redo_process.get().map(|p| p.id()),
|
||||
})
|
||||
process: self
|
||||
.redo_process
|
||||
.get()
|
||||
.map(|p| WalRedoManagerProcessStatus {
|
||||
pid: p.id(),
|
||||
kind: std::borrow::Cow::Borrowed(p.kind().into()),
|
||||
}),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -208,37 +215,33 @@ impl PostgresRedoManager {
|
||||
const MAX_RETRY_ATTEMPTS: u32 = 1;
|
||||
let mut n_attempts = 0u32;
|
||||
loop {
|
||||
let proc: Arc<process::WalRedoProcess> =
|
||||
match self.redo_process.get_or_init_detached().await {
|
||||
Ok(guard) => Arc::clone(&guard),
|
||||
Err(permit) => {
|
||||
// don't hold poison_guard, the launch code can bail
|
||||
let start = Instant::now();
|
||||
let proc = Arc::new(
|
||||
process::WalRedoProcess::launch(
|
||||
self.conf,
|
||||
self.tenant_shard_id,
|
||||
pg_version,
|
||||
)
|
||||
let proc: Arc<process::Process> = match self.redo_process.get_or_init_detached().await {
|
||||
Ok(guard) => Arc::clone(&guard),
|
||||
Err(permit) => {
|
||||
// don't hold poison_guard, the launch code can bail
|
||||
let start = Instant::now();
|
||||
let proc = Arc::new(
|
||||
process::Process::launch(self.conf, self.tenant_shard_id, pg_version)
|
||||
.context("launch walredo process")?,
|
||||
);
|
||||
let duration = start.elapsed();
|
||||
WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.observe(duration.as_secs_f64());
|
||||
info!(
|
||||
duration_ms = duration.as_millis(),
|
||||
pid = proc.id(),
|
||||
"launched walredo process"
|
||||
);
|
||||
self.redo_process.set(Arc::clone(&proc), permit);
|
||||
proc
|
||||
}
|
||||
};
|
||||
);
|
||||
let duration = start.elapsed();
|
||||
WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.observe(duration.as_secs_f64());
|
||||
info!(
|
||||
duration_ms = duration.as_millis(),
|
||||
pid = proc.id(),
|
||||
"launched walredo process"
|
||||
);
|
||||
self.redo_process.set(Arc::clone(&proc), permit);
|
||||
proc
|
||||
}
|
||||
};
|
||||
|
||||
let started_at = std::time::Instant::now();
|
||||
|
||||
// Relational WAL records are applied using wal-redo-postgres
|
||||
let result = proc
|
||||
.apply_wal_records(rel, blknum, &base_img, records, wal_redo_timeout)
|
||||
.await
|
||||
.context("apply_wal_records");
|
||||
|
||||
let duration = started_at.elapsed();
|
||||
|
||||
@@ -1,186 +1,67 @@
|
||||
use self::no_leak_child::NoLeakChild;
|
||||
use crate::{
|
||||
config::PageServerConf,
|
||||
metrics::{WalRedoKillCause, WAL_REDO_PROCESS_COUNTERS, WAL_REDO_RECORD_COUNTER},
|
||||
walrecord::NeonWalRecord,
|
||||
};
|
||||
use anyhow::Context;
|
||||
use std::time::Duration;
|
||||
|
||||
use bytes::Bytes;
|
||||
use nix::poll::{PollFd, PollFlags};
|
||||
use pageserver_api::{reltag::RelTag, shard::TenantShardId};
|
||||
use postgres_ffi::BLCKSZ;
|
||||
use std::os::fd::AsRawFd;
|
||||
#[cfg(feature = "testing")]
|
||||
use std::sync::atomic::AtomicUsize;
|
||||
use std::{
|
||||
collections::VecDeque,
|
||||
io::{Read, Write},
|
||||
process::{ChildStdin, ChildStdout, Command, Stdio},
|
||||
sync::{Mutex, MutexGuard},
|
||||
time::Duration,
|
||||
};
|
||||
use tracing::{debug, error, instrument, Instrument};
|
||||
use utils::{lsn::Lsn, nonblock::set_nonblock};
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use crate::{config::PageServerConf, walrecord::NeonWalRecord};
|
||||
|
||||
mod no_leak_child;
|
||||
/// The IPC protocol that pageserver and walredo process speak over their shared pipe.
|
||||
mod protocol;
|
||||
|
||||
pub struct WalRedoProcess {
|
||||
#[allow(dead_code)]
|
||||
conf: &'static PageServerConf,
|
||||
tenant_shard_id: TenantShardId,
|
||||
// Some() on construction, only becomes None on Drop.
|
||||
child: Option<NoLeakChild>,
|
||||
stdout: Mutex<ProcessOutput>,
|
||||
stdin: Mutex<ProcessInput>,
|
||||
/// Counter to separate same sized walredo inputs failing at the same millisecond.
|
||||
#[cfg(feature = "testing")]
|
||||
dump_sequence: AtomicUsize,
|
||||
mod process_impl {
|
||||
pub(super) mod process_async;
|
||||
pub(super) mod process_std;
|
||||
}
|
||||
|
||||
struct ProcessInput {
|
||||
stdin: ChildStdin,
|
||||
n_requests: usize,
|
||||
#[derive(
|
||||
Clone,
|
||||
Copy,
|
||||
Debug,
|
||||
PartialEq,
|
||||
Eq,
|
||||
strum_macros::EnumString,
|
||||
strum_macros::Display,
|
||||
strum_macros::IntoStaticStr,
|
||||
serde_with::DeserializeFromStr,
|
||||
serde_with::SerializeDisplay,
|
||||
)]
|
||||
#[strum(serialize_all = "kebab-case")]
|
||||
#[repr(u8)]
|
||||
pub enum Kind {
|
||||
Sync,
|
||||
Async,
|
||||
}
|
||||
|
||||
struct ProcessOutput {
|
||||
stdout: ChildStdout,
|
||||
pending_responses: VecDeque<Option<Bytes>>,
|
||||
n_processed_responses: usize,
|
||||
pub(crate) enum Process {
|
||||
Sync(process_impl::process_std::WalRedoProcess),
|
||||
Async(process_impl::process_async::WalRedoProcess),
|
||||
}
|
||||
|
||||
impl WalRedoProcess {
|
||||
//
|
||||
// Start postgres binary in special WAL redo mode.
|
||||
//
|
||||
#[instrument(skip_all,fields(pg_version=pg_version))]
|
||||
pub(crate) fn launch(
|
||||
impl Process {
|
||||
#[inline(always)]
|
||||
pub fn launch(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_shard_id: TenantShardId,
|
||||
pg_version: u32,
|
||||
) -> anyhow::Result<Self> {
|
||||
crate::span::debug_assert_current_span_has_tenant_id();
|
||||
|
||||
let pg_bin_dir_path = conf.pg_bin_dir(pg_version).context("pg_bin_dir")?; // TODO these should be infallible.
|
||||
let pg_lib_dir_path = conf.pg_lib_dir(pg_version).context("pg_lib_dir")?;
|
||||
|
||||
use no_leak_child::NoLeakChildCommandExt;
|
||||
// Start postgres itself
|
||||
let child = Command::new(pg_bin_dir_path.join("postgres"))
|
||||
// the first arg must be --wal-redo so the child process enters into walredo mode
|
||||
.arg("--wal-redo")
|
||||
// the child doesn't process this arg, but, having it in the argv helps indentify the
|
||||
// walredo process for a particular tenant when debugging a pagserver
|
||||
.args(["--tenant-shard-id", &format!("{tenant_shard_id}")])
|
||||
.stdin(Stdio::piped())
|
||||
.stderr(Stdio::piped())
|
||||
.stdout(Stdio::piped())
|
||||
.env_clear()
|
||||
.env("LD_LIBRARY_PATH", &pg_lib_dir_path)
|
||||
.env("DYLD_LIBRARY_PATH", &pg_lib_dir_path)
|
||||
// NB: The redo process is not trusted after we sent it the first
|
||||
// walredo work. Before that, it is trusted. Specifically, we trust
|
||||
// it to
|
||||
// 1. close all file descriptors except stdin, stdout, stderr because
|
||||
// pageserver might not be 100% diligent in setting FD_CLOEXEC on all
|
||||
// the files it opens, and
|
||||
// 2. to use seccomp to sandbox itself before processing the first
|
||||
// walredo request.
|
||||
.spawn_no_leak_child(tenant_shard_id)
|
||||
.context("spawn process")?;
|
||||
WAL_REDO_PROCESS_COUNTERS.started.inc();
|
||||
let mut child = scopeguard::guard(child, |child| {
|
||||
error!("killing wal-redo-postgres process due to a problem during launch");
|
||||
child.kill_and_wait(WalRedoKillCause::Startup);
|
||||
});
|
||||
|
||||
let stdin = child.stdin.take().unwrap();
|
||||
let stdout = child.stdout.take().unwrap();
|
||||
let stderr = child.stderr.take().unwrap();
|
||||
let stderr = tokio::process::ChildStderr::from_std(stderr)
|
||||
.context("convert to tokio::ChildStderr")?;
|
||||
macro_rules! set_nonblock_or_log_err {
|
||||
($file:ident) => {{
|
||||
let res = set_nonblock($file.as_raw_fd());
|
||||
if let Err(e) = &res {
|
||||
error!(error = %e, file = stringify!($file), pid = child.id(), "set_nonblock failed");
|
||||
}
|
||||
res
|
||||
}};
|
||||
}
|
||||
set_nonblock_or_log_err!(stdin)?;
|
||||
set_nonblock_or_log_err!(stdout)?;
|
||||
|
||||
// all fallible operations post-spawn are complete, so get rid of the guard
|
||||
let child = scopeguard::ScopeGuard::into_inner(child);
|
||||
|
||||
tokio::spawn(
|
||||
async move {
|
||||
scopeguard::defer! {
|
||||
debug!("wal-redo-postgres stderr_logger_task finished");
|
||||
crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_finished.inc();
|
||||
}
|
||||
debug!("wal-redo-postgres stderr_logger_task started");
|
||||
crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_started.inc();
|
||||
|
||||
use tokio::io::AsyncBufReadExt;
|
||||
let mut stderr_lines = tokio::io::BufReader::new(stderr);
|
||||
let mut buf = Vec::new();
|
||||
let res = loop {
|
||||
buf.clear();
|
||||
// TODO we don't trust the process to cap its stderr length.
|
||||
// Currently it can do unbounded Vec allocation.
|
||||
match stderr_lines.read_until(b'\n', &mut buf).await {
|
||||
Ok(0) => break Ok(()), // eof
|
||||
Ok(num_bytes) => {
|
||||
let output = String::from_utf8_lossy(&buf[..num_bytes]);
|
||||
error!(%output, "received output");
|
||||
}
|
||||
Err(e) => {
|
||||
break Err(e);
|
||||
}
|
||||
}
|
||||
};
|
||||
match res {
|
||||
Ok(()) => (),
|
||||
Err(e) => {
|
||||
error!(error=?e, "failed to read from walredo stderr");
|
||||
}
|
||||
}
|
||||
}.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %pg_version))
|
||||
);
|
||||
|
||||
Ok(Self {
|
||||
conf,
|
||||
tenant_shard_id,
|
||||
child: Some(child),
|
||||
stdin: Mutex::new(ProcessInput {
|
||||
stdin,
|
||||
n_requests: 0,
|
||||
}),
|
||||
stdout: Mutex::new(ProcessOutput {
|
||||
stdout,
|
||||
pending_responses: VecDeque::new(),
|
||||
n_processed_responses: 0,
|
||||
}),
|
||||
#[cfg(feature = "testing")]
|
||||
dump_sequence: AtomicUsize::default(),
|
||||
Ok(match conf.walredo_process_kind {
|
||||
Kind::Sync => Self::Sync(process_impl::process_std::WalRedoProcess::launch(
|
||||
conf,
|
||||
tenant_shard_id,
|
||||
pg_version,
|
||||
)?),
|
||||
Kind::Async => Self::Async(process_impl::process_async::WalRedoProcess::launch(
|
||||
conf,
|
||||
tenant_shard_id,
|
||||
pg_version,
|
||||
)?),
|
||||
})
|
||||
}
|
||||
|
||||
pub(crate) fn id(&self) -> u32 {
|
||||
self.child
|
||||
.as_ref()
|
||||
.expect("must not call this during Drop")
|
||||
.id()
|
||||
}
|
||||
|
||||
// Apply given WAL records ('records') over an old page image. Returns
|
||||
// new page image.
|
||||
//
|
||||
#[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), pid=%self.id()))]
|
||||
pub(crate) fn apply_wal_records(
|
||||
#[inline(always)]
|
||||
pub(crate) async fn apply_wal_records(
|
||||
&self,
|
||||
rel: RelTag,
|
||||
blknum: u32,
|
||||
@@ -188,221 +69,29 @@ impl WalRedoProcess {
|
||||
records: &[(Lsn, NeonWalRecord)],
|
||||
wal_redo_timeout: Duration,
|
||||
) -> anyhow::Result<Bytes> {
|
||||
let tag = protocol::BufferTag { rel, blknum };
|
||||
let input = self.stdin.lock().unwrap();
|
||||
|
||||
// Serialize all the messages to send the WAL redo process first.
|
||||
//
|
||||
// This could be problematic if there are millions of records to replay,
|
||||
// but in practice the number of records is usually so small that it doesn't
|
||||
// matter, and it's better to keep this code simple.
|
||||
//
|
||||
// Most requests start with a before-image with BLCKSZ bytes, followed by
|
||||
// by some other WAL records. Start with a buffer that can hold that
|
||||
// comfortably.
|
||||
let mut writebuf: Vec<u8> = Vec::with_capacity((BLCKSZ as usize) * 3);
|
||||
protocol::build_begin_redo_for_block_msg(tag, &mut writebuf);
|
||||
if let Some(img) = base_img {
|
||||
protocol::build_push_page_msg(tag, img, &mut writebuf);
|
||||
}
|
||||
for (lsn, rec) in records.iter() {
|
||||
if let NeonWalRecord::Postgres {
|
||||
will_init: _,
|
||||
rec: postgres_rec,
|
||||
} = rec
|
||||
{
|
||||
protocol::build_apply_record_msg(*lsn, postgres_rec, &mut writebuf);
|
||||
} else {
|
||||
anyhow::bail!("tried to pass neon wal record to postgres WAL redo");
|
||||
match self {
|
||||
Process::Sync(p) => {
|
||||
p.apply_wal_records(rel, blknum, base_img, records, wal_redo_timeout)
|
||||
.await
|
||||
}
|
||||
}
|
||||
protocol::build_get_page_msg(tag, &mut writebuf);
|
||||
WAL_REDO_RECORD_COUNTER.inc_by(records.len() as u64);
|
||||
|
||||
let res = self.apply_wal_records0(&writebuf, input, wal_redo_timeout);
|
||||
|
||||
if res.is_err() {
|
||||
// not all of these can be caused by this particular input, however these are so rare
|
||||
// in tests so capture all.
|
||||
self.record_and_log(&writebuf);
|
||||
}
|
||||
|
||||
res
|
||||
}
|
||||
|
||||
fn apply_wal_records0(
|
||||
&self,
|
||||
writebuf: &[u8],
|
||||
input: MutexGuard<ProcessInput>,
|
||||
wal_redo_timeout: Duration,
|
||||
) -> anyhow::Result<Bytes> {
|
||||
let mut proc = { input }; // TODO: remove this legacy rename, but this keep the patch small.
|
||||
let mut nwrite = 0usize;
|
||||
|
||||
while nwrite < writebuf.len() {
|
||||
let mut stdin_pollfds = [PollFd::new(&proc.stdin, PollFlags::POLLOUT)];
|
||||
let n = loop {
|
||||
match nix::poll::poll(&mut stdin_pollfds[..], wal_redo_timeout.as_millis() as i32) {
|
||||
Err(nix::errno::Errno::EINTR) => continue,
|
||||
res => break res,
|
||||
}
|
||||
}?;
|
||||
|
||||
if n == 0 {
|
||||
anyhow::bail!("WAL redo timed out");
|
||||
Process::Async(p) => {
|
||||
p.apply_wal_records(rel, blknum, base_img, records, wal_redo_timeout)
|
||||
.await
|
||||
}
|
||||
|
||||
// If 'stdin' is writeable, do write.
|
||||
let in_revents = stdin_pollfds[0].revents().unwrap();
|
||||
if in_revents & (PollFlags::POLLERR | PollFlags::POLLOUT) != PollFlags::empty() {
|
||||
nwrite += proc.stdin.write(&writebuf[nwrite..])?;
|
||||
}
|
||||
if in_revents.contains(PollFlags::POLLHUP) {
|
||||
// We still have more data to write, but the process closed the pipe.
|
||||
anyhow::bail!("WAL redo process closed its stdin unexpectedly");
|
||||
}
|
||||
}
|
||||
let request_no = proc.n_requests;
|
||||
proc.n_requests += 1;
|
||||
drop(proc);
|
||||
|
||||
// To improve walredo performance we separate sending requests and receiving
|
||||
// responses. Them are protected by different mutexes (output and input).
|
||||
// If thread T1, T2, T3 send requests D1, D2, D3 to walredo process
|
||||
// then there is not warranty that T1 will first granted output mutex lock.
|
||||
// To address this issue we maintain number of sent requests, number of processed
|
||||
// responses and ring buffer with pending responses. After sending response
|
||||
// (under input mutex), threads remembers request number. Then it releases
|
||||
// input mutex, locks output mutex and fetch in ring buffer all responses until
|
||||
// its stored request number. The it takes correspondent element from
|
||||
// pending responses ring buffer and truncate all empty elements from the front,
|
||||
// advancing processed responses number.
|
||||
|
||||
let mut output = self.stdout.lock().unwrap();
|
||||
let n_processed_responses = output.n_processed_responses;
|
||||
while n_processed_responses + output.pending_responses.len() <= request_no {
|
||||
// We expect the WAL redo process to respond with an 8k page image. We read it
|
||||
// into this buffer.
|
||||
let mut resultbuf = vec![0; BLCKSZ.into()];
|
||||
let mut nresult: usize = 0; // # of bytes read into 'resultbuf' so far
|
||||
while nresult < BLCKSZ.into() {
|
||||
let mut stdout_pollfds = [PollFd::new(&output.stdout, PollFlags::POLLIN)];
|
||||
// We do two things simultaneously: reading response from stdout
|
||||
// and forward any logging information that the child writes to its stderr to the page server's log.
|
||||
let n = loop {
|
||||
match nix::poll::poll(
|
||||
&mut stdout_pollfds[..],
|
||||
wal_redo_timeout.as_millis() as i32,
|
||||
) {
|
||||
Err(nix::errno::Errno::EINTR) => continue,
|
||||
res => break res,
|
||||
}
|
||||
}?;
|
||||
|
||||
if n == 0 {
|
||||
anyhow::bail!("WAL redo timed out");
|
||||
}
|
||||
|
||||
// If we have some data in stdout, read it to the result buffer.
|
||||
let out_revents = stdout_pollfds[0].revents().unwrap();
|
||||
if out_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() {
|
||||
nresult += output.stdout.read(&mut resultbuf[nresult..])?;
|
||||
}
|
||||
if out_revents.contains(PollFlags::POLLHUP) {
|
||||
anyhow::bail!("WAL redo process closed its stdout unexpectedly");
|
||||
}
|
||||
}
|
||||
output
|
||||
.pending_responses
|
||||
.push_back(Some(Bytes::from(resultbuf)));
|
||||
}
|
||||
// Replace our request's response with None in `pending_responses`.
|
||||
// Then make space in the ring buffer by clearing out any seqence of contiguous
|
||||
// `None`'s from the front of `pending_responses`.
|
||||
// NB: We can't pop_front() because other requests' responses because another
|
||||
// requester might have grabbed the output mutex before us:
|
||||
// T1: grab input mutex
|
||||
// T1: send request_no 23
|
||||
// T1: release input mutex
|
||||
// T2: grab input mutex
|
||||
// T2: send request_no 24
|
||||
// T2: release input mutex
|
||||
// T2: grab output mutex
|
||||
// T2: n_processed_responses + output.pending_responses.len() <= request_no
|
||||
// 23 0 24
|
||||
// T2: enters poll loop that reads stdout
|
||||
// T2: put response for 23 into pending_responses
|
||||
// T2: put response for 24 into pending_resposnes
|
||||
// pending_responses now looks like this: Front Some(response_23) Some(response_24) Back
|
||||
// T2: takes its response_24
|
||||
// pending_responses now looks like this: Front Some(response_23) None Back
|
||||
// T2: does the while loop below
|
||||
// pending_responses now looks like this: Front Some(response_23) None Back
|
||||
// T2: releases output mutex
|
||||
// T1: grabs output mutex
|
||||
// T1: n_processed_responses + output.pending_responses.len() > request_no
|
||||
// 23 2 23
|
||||
// T1: skips poll loop that reads stdout
|
||||
// T1: takes its response_23
|
||||
// pending_responses now looks like this: Front None None Back
|
||||
// T2: does the while loop below
|
||||
// pending_responses now looks like this: Front Back
|
||||
// n_processed_responses now has value 25
|
||||
let res = output.pending_responses[request_no - n_processed_responses]
|
||||
.take()
|
||||
.expect("we own this request_no, nobody else is supposed to take it");
|
||||
while let Some(front) = output.pending_responses.front() {
|
||||
if front.is_none() {
|
||||
output.pending_responses.pop_front();
|
||||
output.n_processed_responses += 1;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
Ok(res)
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
fn record_and_log(&self, writebuf: &[u8]) {
|
||||
use std::sync::atomic::Ordering;
|
||||
|
||||
let millis = std::time::SystemTime::now()
|
||||
.duration_since(std::time::SystemTime::UNIX_EPOCH)
|
||||
.unwrap()
|
||||
.as_millis();
|
||||
|
||||
let seq = self.dump_sequence.fetch_add(1, Ordering::Relaxed);
|
||||
|
||||
// these files will be collected to an allure report
|
||||
let filename = format!("walredo-{millis}-{}-{seq}.walredo", writebuf.len());
|
||||
|
||||
let path = self.conf.tenant_path(&self.tenant_shard_id).join(&filename);
|
||||
|
||||
let res = std::fs::OpenOptions::new()
|
||||
.write(true)
|
||||
.create_new(true)
|
||||
.read(true)
|
||||
.open(path)
|
||||
.and_then(|mut f| f.write_all(writebuf));
|
||||
|
||||
// trip up allowed_errors
|
||||
if let Err(e) = res {
|
||||
tracing::error!(target=%filename, length=writebuf.len(), "failed to write out the walredo errored input: {e}");
|
||||
} else {
|
||||
tracing::error!(filename, "erroring walredo input saved");
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(not(feature = "testing"))]
|
||||
fn record_and_log(&self, _: &[u8]) {}
|
||||
}
|
||||
pub(crate) fn id(&self) -> u32 {
|
||||
match self {
|
||||
Process::Sync(p) => p.id(),
|
||||
Process::Async(p) => p.id(),
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for WalRedoProcess {
|
||||
fn drop(&mut self) {
|
||||
self.child
|
||||
.take()
|
||||
.expect("we only do this once")
|
||||
.kill_and_wait(WalRedoKillCause::WalRedoProcessDrop);
|
||||
// no way to wait for stderr_logger_task from Drop because that is async only
|
||||
pub(crate) fn kind(&self) -> Kind {
|
||||
match self {
|
||||
Process::Sync(_) => Kind::Sync,
|
||||
Process::Async(_) => Kind::Async,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
374
pageserver/src/walredo/process/process_impl/process_async.rs
Normal file
374
pageserver/src/walredo/process/process_impl/process_async.rs
Normal file
@@ -0,0 +1,374 @@
|
||||
use self::no_leak_child::NoLeakChild;
|
||||
use crate::{
|
||||
config::PageServerConf,
|
||||
metrics::{WalRedoKillCause, WAL_REDO_PROCESS_COUNTERS, WAL_REDO_RECORD_COUNTER},
|
||||
walrecord::NeonWalRecord,
|
||||
walredo::process::{no_leak_child, protocol},
|
||||
};
|
||||
use anyhow::Context;
|
||||
use bytes::Bytes;
|
||||
use pageserver_api::{reltag::RelTag, shard::TenantShardId};
|
||||
use postgres_ffi::BLCKSZ;
|
||||
#[cfg(feature = "testing")]
|
||||
use std::sync::atomic::AtomicUsize;
|
||||
use std::{
|
||||
collections::VecDeque,
|
||||
process::{Command, Stdio},
|
||||
time::Duration,
|
||||
};
|
||||
use tokio::io::{AsyncReadExt, AsyncWriteExt};
|
||||
use tracing::{debug, error, instrument, Instrument};
|
||||
use utils::{lsn::Lsn, poison::Poison};
|
||||
|
||||
pub struct WalRedoProcess {
|
||||
#[allow(dead_code)]
|
||||
conf: &'static PageServerConf,
|
||||
tenant_shard_id: TenantShardId,
|
||||
// Some() on construction, only becomes None on Drop.
|
||||
child: Option<NoLeakChild>,
|
||||
stdout: tokio::sync::Mutex<Poison<ProcessOutput>>,
|
||||
stdin: tokio::sync::Mutex<Poison<ProcessInput>>,
|
||||
/// Counter to separate same sized walredo inputs failing at the same millisecond.
|
||||
#[cfg(feature = "testing")]
|
||||
dump_sequence: AtomicUsize,
|
||||
}
|
||||
|
||||
struct ProcessInput {
|
||||
stdin: tokio::process::ChildStdin,
|
||||
n_requests: usize,
|
||||
}
|
||||
|
||||
struct ProcessOutput {
|
||||
stdout: tokio::process::ChildStdout,
|
||||
pending_responses: VecDeque<Option<Bytes>>,
|
||||
n_processed_responses: usize,
|
||||
}
|
||||
|
||||
impl WalRedoProcess {
|
||||
//
|
||||
// Start postgres binary in special WAL redo mode.
|
||||
//
|
||||
#[instrument(skip_all,fields(pg_version=pg_version))]
|
||||
pub(crate) fn launch(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_shard_id: TenantShardId,
|
||||
pg_version: u32,
|
||||
) -> anyhow::Result<Self> {
|
||||
crate::span::debug_assert_current_span_has_tenant_id();
|
||||
|
||||
let pg_bin_dir_path = conf.pg_bin_dir(pg_version).context("pg_bin_dir")?; // TODO these should be infallible.
|
||||
let pg_lib_dir_path = conf.pg_lib_dir(pg_version).context("pg_lib_dir")?;
|
||||
|
||||
use no_leak_child::NoLeakChildCommandExt;
|
||||
// Start postgres itself
|
||||
let child = Command::new(pg_bin_dir_path.join("postgres"))
|
||||
// the first arg must be --wal-redo so the child process enters into walredo mode
|
||||
.arg("--wal-redo")
|
||||
// the child doesn't process this arg, but, having it in the argv helps indentify the
|
||||
// walredo process for a particular tenant when debugging a pagserver
|
||||
.args(["--tenant-shard-id", &format!("{tenant_shard_id}")])
|
||||
.stdin(Stdio::piped())
|
||||
.stderr(Stdio::piped())
|
||||
.stdout(Stdio::piped())
|
||||
.env_clear()
|
||||
.env("LD_LIBRARY_PATH", &pg_lib_dir_path)
|
||||
.env("DYLD_LIBRARY_PATH", &pg_lib_dir_path)
|
||||
// NB: The redo process is not trusted after we sent it the first
|
||||
// walredo work. Before that, it is trusted. Specifically, we trust
|
||||
// it to
|
||||
// 1. close all file descriptors except stdin, stdout, stderr because
|
||||
// pageserver might not be 100% diligent in setting FD_CLOEXEC on all
|
||||
// the files it opens, and
|
||||
// 2. to use seccomp to sandbox itself before processing the first
|
||||
// walredo request.
|
||||
.spawn_no_leak_child(tenant_shard_id)
|
||||
.context("spawn process")?;
|
||||
WAL_REDO_PROCESS_COUNTERS.started.inc();
|
||||
let mut child = scopeguard::guard(child, |child| {
|
||||
error!("killing wal-redo-postgres process due to a problem during launch");
|
||||
child.kill_and_wait(WalRedoKillCause::Startup);
|
||||
});
|
||||
|
||||
let stdin = child.stdin.take().unwrap();
|
||||
let stdout = child.stdout.take().unwrap();
|
||||
let stderr = child.stderr.take().unwrap();
|
||||
let stderr = tokio::process::ChildStderr::from_std(stderr)
|
||||
.context("convert to tokio::ChildStderr")?;
|
||||
let stdin =
|
||||
tokio::process::ChildStdin::from_std(stdin).context("convert to tokio::ChildStdin")?;
|
||||
let stdout = tokio::process::ChildStdout::from_std(stdout)
|
||||
.context("convert to tokio::ChildStdout")?;
|
||||
|
||||
// all fallible operations post-spawn are complete, so get rid of the guard
|
||||
let child = scopeguard::ScopeGuard::into_inner(child);
|
||||
|
||||
tokio::spawn(
|
||||
async move {
|
||||
scopeguard::defer! {
|
||||
debug!("wal-redo-postgres stderr_logger_task finished");
|
||||
crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_finished.inc();
|
||||
}
|
||||
debug!("wal-redo-postgres stderr_logger_task started");
|
||||
crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_started.inc();
|
||||
|
||||
use tokio::io::AsyncBufReadExt;
|
||||
let mut stderr_lines = tokio::io::BufReader::new(stderr);
|
||||
let mut buf = Vec::new();
|
||||
let res = loop {
|
||||
buf.clear();
|
||||
// TODO we don't trust the process to cap its stderr length.
|
||||
// Currently it can do unbounded Vec allocation.
|
||||
match stderr_lines.read_until(b'\n', &mut buf).await {
|
||||
Ok(0) => break Ok(()), // eof
|
||||
Ok(num_bytes) => {
|
||||
let output = String::from_utf8_lossy(&buf[..num_bytes]);
|
||||
error!(%output, "received output");
|
||||
}
|
||||
Err(e) => {
|
||||
break Err(e);
|
||||
}
|
||||
}
|
||||
};
|
||||
match res {
|
||||
Ok(()) => (),
|
||||
Err(e) => {
|
||||
error!(error=?e, "failed to read from walredo stderr");
|
||||
}
|
||||
}
|
||||
}.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %pg_version))
|
||||
);
|
||||
|
||||
Ok(Self {
|
||||
conf,
|
||||
tenant_shard_id,
|
||||
child: Some(child),
|
||||
stdin: tokio::sync::Mutex::new(Poison::new(
|
||||
"stdin",
|
||||
ProcessInput {
|
||||
stdin,
|
||||
n_requests: 0,
|
||||
},
|
||||
)),
|
||||
stdout: tokio::sync::Mutex::new(Poison::new(
|
||||
"stdout",
|
||||
ProcessOutput {
|
||||
stdout,
|
||||
pending_responses: VecDeque::new(),
|
||||
n_processed_responses: 0,
|
||||
},
|
||||
)),
|
||||
#[cfg(feature = "testing")]
|
||||
dump_sequence: AtomicUsize::default(),
|
||||
})
|
||||
}
|
||||
|
||||
pub(crate) fn id(&self) -> u32 {
|
||||
self.child
|
||||
.as_ref()
|
||||
.expect("must not call this during Drop")
|
||||
.id()
|
||||
}
|
||||
|
||||
/// Apply given WAL records ('records') over an old page image. Returns
|
||||
/// new page image.
|
||||
///
|
||||
/// # Cancel-Safety
|
||||
///
|
||||
/// Cancellation safe.
|
||||
#[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), pid=%self.id()))]
|
||||
pub(crate) async fn apply_wal_records(
|
||||
&self,
|
||||
rel: RelTag,
|
||||
blknum: u32,
|
||||
base_img: &Option<Bytes>,
|
||||
records: &[(Lsn, NeonWalRecord)],
|
||||
wal_redo_timeout: Duration,
|
||||
) -> anyhow::Result<Bytes> {
|
||||
let tag = protocol::BufferTag { rel, blknum };
|
||||
|
||||
// Serialize all the messages to send the WAL redo process first.
|
||||
//
|
||||
// This could be problematic if there are millions of records to replay,
|
||||
// but in practice the number of records is usually so small that it doesn't
|
||||
// matter, and it's better to keep this code simple.
|
||||
//
|
||||
// Most requests start with a before-image with BLCKSZ bytes, followed by
|
||||
// by some other WAL records. Start with a buffer that can hold that
|
||||
// comfortably.
|
||||
let mut writebuf: Vec<u8> = Vec::with_capacity((BLCKSZ as usize) * 3);
|
||||
protocol::build_begin_redo_for_block_msg(tag, &mut writebuf);
|
||||
if let Some(img) = base_img {
|
||||
protocol::build_push_page_msg(tag, img, &mut writebuf);
|
||||
}
|
||||
for (lsn, rec) in records.iter() {
|
||||
if let NeonWalRecord::Postgres {
|
||||
will_init: _,
|
||||
rec: postgres_rec,
|
||||
} = rec
|
||||
{
|
||||
protocol::build_apply_record_msg(*lsn, postgres_rec, &mut writebuf);
|
||||
} else {
|
||||
anyhow::bail!("tried to pass neon wal record to postgres WAL redo");
|
||||
}
|
||||
}
|
||||
protocol::build_get_page_msg(tag, &mut writebuf);
|
||||
WAL_REDO_RECORD_COUNTER.inc_by(records.len() as u64);
|
||||
|
||||
let Ok(res) =
|
||||
tokio::time::timeout(wal_redo_timeout, self.apply_wal_records0(&writebuf)).await
|
||||
else {
|
||||
anyhow::bail!("WAL redo timed out");
|
||||
};
|
||||
|
||||
if res.is_err() {
|
||||
// not all of these can be caused by this particular input, however these are so rare
|
||||
// in tests so capture all.
|
||||
self.record_and_log(&writebuf);
|
||||
}
|
||||
|
||||
res
|
||||
}
|
||||
|
||||
/// # Cancel-Safety
|
||||
///
|
||||
/// When not polled to completion (e.g. because in `tokio::select!` another
|
||||
/// branch becomes ready before this future), concurrent and subsequent
|
||||
/// calls may fail due to [`utils::poison::Poison::check_and_arm`] calls.
|
||||
/// Dispose of this process instance and create a new one.
|
||||
async fn apply_wal_records0(&self, writebuf: &[u8]) -> anyhow::Result<Bytes> {
|
||||
let request_no = {
|
||||
let mut lock_guard = self.stdin.lock().await;
|
||||
let mut poison_guard = lock_guard.check_and_arm()?;
|
||||
let input = poison_guard.data_mut();
|
||||
input
|
||||
.stdin
|
||||
.write_all(writebuf)
|
||||
.await
|
||||
.context("write to walredo stdin")?;
|
||||
let request_no = input.n_requests;
|
||||
input.n_requests += 1;
|
||||
poison_guard.disarm();
|
||||
request_no
|
||||
};
|
||||
|
||||
// To improve walredo performance we separate sending requests and receiving
|
||||
// responses. Them are protected by different mutexes (output and input).
|
||||
// If thread T1, T2, T3 send requests D1, D2, D3 to walredo process
|
||||
// then there is not warranty that T1 will first granted output mutex lock.
|
||||
// To address this issue we maintain number of sent requests, number of processed
|
||||
// responses and ring buffer with pending responses. After sending response
|
||||
// (under input mutex), threads remembers request number. Then it releases
|
||||
// input mutex, locks output mutex and fetch in ring buffer all responses until
|
||||
// its stored request number. The it takes correspondent element from
|
||||
// pending responses ring buffer and truncate all empty elements from the front,
|
||||
// advancing processed responses number.
|
||||
|
||||
let mut lock_guard = self.stdout.lock().await;
|
||||
let mut poison_guard = lock_guard.check_and_arm()?;
|
||||
let output = poison_guard.data_mut();
|
||||
let n_processed_responses = output.n_processed_responses;
|
||||
while n_processed_responses + output.pending_responses.len() <= request_no {
|
||||
// We expect the WAL redo process to respond with an 8k page image. We read it
|
||||
// into this buffer.
|
||||
let mut resultbuf = vec![0; BLCKSZ.into()];
|
||||
output
|
||||
.stdout
|
||||
.read_exact(&mut resultbuf)
|
||||
.await
|
||||
.context("read walredo stdout")?;
|
||||
output
|
||||
.pending_responses
|
||||
.push_back(Some(Bytes::from(resultbuf)));
|
||||
}
|
||||
// Replace our request's response with None in `pending_responses`.
|
||||
// Then make space in the ring buffer by clearing out any seqence of contiguous
|
||||
// `None`'s from the front of `pending_responses`.
|
||||
// NB: We can't pop_front() because other requests' responses because another
|
||||
// requester might have grabbed the output mutex before us:
|
||||
// T1: grab input mutex
|
||||
// T1: send request_no 23
|
||||
// T1: release input mutex
|
||||
// T2: grab input mutex
|
||||
// T2: send request_no 24
|
||||
// T2: release input mutex
|
||||
// T2: grab output mutex
|
||||
// T2: n_processed_responses + output.pending_responses.len() <= request_no
|
||||
// 23 0 24
|
||||
// T2: enters poll loop that reads stdout
|
||||
// T2: put response for 23 into pending_responses
|
||||
// T2: put response for 24 into pending_resposnes
|
||||
// pending_responses now looks like this: Front Some(response_23) Some(response_24) Back
|
||||
// T2: takes its response_24
|
||||
// pending_responses now looks like this: Front Some(response_23) None Back
|
||||
// T2: does the while loop below
|
||||
// pending_responses now looks like this: Front Some(response_23) None Back
|
||||
// T2: releases output mutex
|
||||
// T1: grabs output mutex
|
||||
// T1: n_processed_responses + output.pending_responses.len() > request_no
|
||||
// 23 2 23
|
||||
// T1: skips poll loop that reads stdout
|
||||
// T1: takes its response_23
|
||||
// pending_responses now looks like this: Front None None Back
|
||||
// T2: does the while loop below
|
||||
// pending_responses now looks like this: Front Back
|
||||
// n_processed_responses now has value 25
|
||||
let res = output.pending_responses[request_no - n_processed_responses]
|
||||
.take()
|
||||
.expect("we own this request_no, nobody else is supposed to take it");
|
||||
while let Some(front) = output.pending_responses.front() {
|
||||
if front.is_none() {
|
||||
output.pending_responses.pop_front();
|
||||
output.n_processed_responses += 1;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
poison_guard.disarm();
|
||||
Ok(res)
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
fn record_and_log(&self, writebuf: &[u8]) {
|
||||
use std::sync::atomic::Ordering;
|
||||
|
||||
let millis = std::time::SystemTime::now()
|
||||
.duration_since(std::time::SystemTime::UNIX_EPOCH)
|
||||
.unwrap()
|
||||
.as_millis();
|
||||
|
||||
let seq = self.dump_sequence.fetch_add(1, Ordering::Relaxed);
|
||||
|
||||
// these files will be collected to an allure report
|
||||
let filename = format!("walredo-{millis}-{}-{seq}.walredo", writebuf.len());
|
||||
|
||||
let path = self.conf.tenant_path(&self.tenant_shard_id).join(&filename);
|
||||
|
||||
use std::io::Write;
|
||||
let res = std::fs::OpenOptions::new()
|
||||
.write(true)
|
||||
.create_new(true)
|
||||
.read(true)
|
||||
.open(path)
|
||||
.and_then(|mut f| f.write_all(writebuf));
|
||||
|
||||
// trip up allowed_errors
|
||||
if let Err(e) = res {
|
||||
tracing::error!(target=%filename, length=writebuf.len(), "failed to write out the walredo errored input: {e}");
|
||||
} else {
|
||||
tracing::error!(filename, "erroring walredo input saved");
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(not(feature = "testing"))]
|
||||
fn record_and_log(&self, _: &[u8]) {}
|
||||
}
|
||||
|
||||
impl Drop for WalRedoProcess {
|
||||
fn drop(&mut self) {
|
||||
self.child
|
||||
.take()
|
||||
.expect("we only do this once")
|
||||
.kill_and_wait(WalRedoKillCause::WalRedoProcessDrop);
|
||||
// no way to wait for stderr_logger_task from Drop because that is async only
|
||||
}
|
||||
}
|
||||
405
pageserver/src/walredo/process/process_impl/process_std.rs
Normal file
405
pageserver/src/walredo/process/process_impl/process_std.rs
Normal file
@@ -0,0 +1,405 @@
|
||||
use self::no_leak_child::NoLeakChild;
|
||||
use crate::{
|
||||
config::PageServerConf,
|
||||
metrics::{WalRedoKillCause, WAL_REDO_PROCESS_COUNTERS, WAL_REDO_RECORD_COUNTER},
|
||||
walrecord::NeonWalRecord,
|
||||
walredo::process::{no_leak_child, protocol},
|
||||
};
|
||||
use anyhow::Context;
|
||||
use bytes::Bytes;
|
||||
use nix::poll::{PollFd, PollFlags};
|
||||
use pageserver_api::{reltag::RelTag, shard::TenantShardId};
|
||||
use postgres_ffi::BLCKSZ;
|
||||
use std::os::fd::AsRawFd;
|
||||
#[cfg(feature = "testing")]
|
||||
use std::sync::atomic::AtomicUsize;
|
||||
use std::{
|
||||
collections::VecDeque,
|
||||
io::{Read, Write},
|
||||
process::{ChildStdin, ChildStdout, Command, Stdio},
|
||||
sync::{Mutex, MutexGuard},
|
||||
time::Duration,
|
||||
};
|
||||
use tracing::{debug, error, instrument, Instrument};
|
||||
use utils::{lsn::Lsn, nonblock::set_nonblock};
|
||||
|
||||
pub struct WalRedoProcess {
|
||||
#[allow(dead_code)]
|
||||
conf: &'static PageServerConf,
|
||||
tenant_shard_id: TenantShardId,
|
||||
// Some() on construction, only becomes None on Drop.
|
||||
child: Option<NoLeakChild>,
|
||||
stdout: Mutex<ProcessOutput>,
|
||||
stdin: Mutex<ProcessInput>,
|
||||
/// Counter to separate same sized walredo inputs failing at the same millisecond.
|
||||
#[cfg(feature = "testing")]
|
||||
dump_sequence: AtomicUsize,
|
||||
}
|
||||
|
||||
struct ProcessInput {
|
||||
stdin: ChildStdin,
|
||||
n_requests: usize,
|
||||
}
|
||||
|
||||
struct ProcessOutput {
|
||||
stdout: ChildStdout,
|
||||
pending_responses: VecDeque<Option<Bytes>>,
|
||||
n_processed_responses: usize,
|
||||
}
|
||||
|
||||
impl WalRedoProcess {
|
||||
//
|
||||
// Start postgres binary in special WAL redo mode.
|
||||
//
|
||||
#[instrument(skip_all,fields(pg_version=pg_version))]
|
||||
pub(crate) fn launch(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_shard_id: TenantShardId,
|
||||
pg_version: u32,
|
||||
) -> anyhow::Result<Self> {
|
||||
crate::span::debug_assert_current_span_has_tenant_id();
|
||||
|
||||
let pg_bin_dir_path = conf.pg_bin_dir(pg_version).context("pg_bin_dir")?; // TODO these should be infallible.
|
||||
let pg_lib_dir_path = conf.pg_lib_dir(pg_version).context("pg_lib_dir")?;
|
||||
|
||||
use no_leak_child::NoLeakChildCommandExt;
|
||||
// Start postgres itself
|
||||
let child = Command::new(pg_bin_dir_path.join("postgres"))
|
||||
// the first arg must be --wal-redo so the child process enters into walredo mode
|
||||
.arg("--wal-redo")
|
||||
// the child doesn't process this arg, but, having it in the argv helps indentify the
|
||||
// walredo process for a particular tenant when debugging a pagserver
|
||||
.args(["--tenant-shard-id", &format!("{tenant_shard_id}")])
|
||||
.stdin(Stdio::piped())
|
||||
.stderr(Stdio::piped())
|
||||
.stdout(Stdio::piped())
|
||||
.env_clear()
|
||||
.env("LD_LIBRARY_PATH", &pg_lib_dir_path)
|
||||
.env("DYLD_LIBRARY_PATH", &pg_lib_dir_path)
|
||||
// NB: The redo process is not trusted after we sent it the first
|
||||
// walredo work. Before that, it is trusted. Specifically, we trust
|
||||
// it to
|
||||
// 1. close all file descriptors except stdin, stdout, stderr because
|
||||
// pageserver might not be 100% diligent in setting FD_CLOEXEC on all
|
||||
// the files it opens, and
|
||||
// 2. to use seccomp to sandbox itself before processing the first
|
||||
// walredo request.
|
||||
.spawn_no_leak_child(tenant_shard_id)
|
||||
.context("spawn process")?;
|
||||
WAL_REDO_PROCESS_COUNTERS.started.inc();
|
||||
let mut child = scopeguard::guard(child, |child| {
|
||||
error!("killing wal-redo-postgres process due to a problem during launch");
|
||||
child.kill_and_wait(WalRedoKillCause::Startup);
|
||||
});
|
||||
|
||||
let stdin = child.stdin.take().unwrap();
|
||||
let stdout = child.stdout.take().unwrap();
|
||||
let stderr = child.stderr.take().unwrap();
|
||||
let stderr = tokio::process::ChildStderr::from_std(stderr)
|
||||
.context("convert to tokio::ChildStderr")?;
|
||||
macro_rules! set_nonblock_or_log_err {
|
||||
($file:ident) => {{
|
||||
let res = set_nonblock($file.as_raw_fd());
|
||||
if let Err(e) = &res {
|
||||
error!(error = %e, file = stringify!($file), pid = child.id(), "set_nonblock failed");
|
||||
}
|
||||
res
|
||||
}};
|
||||
}
|
||||
set_nonblock_or_log_err!(stdin)?;
|
||||
set_nonblock_or_log_err!(stdout)?;
|
||||
|
||||
// all fallible operations post-spawn are complete, so get rid of the guard
|
||||
let child = scopeguard::ScopeGuard::into_inner(child);
|
||||
|
||||
tokio::spawn(
|
||||
async move {
|
||||
scopeguard::defer! {
|
||||
debug!("wal-redo-postgres stderr_logger_task finished");
|
||||
crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_finished.inc();
|
||||
}
|
||||
debug!("wal-redo-postgres stderr_logger_task started");
|
||||
crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_started.inc();
|
||||
|
||||
use tokio::io::AsyncBufReadExt;
|
||||
let mut stderr_lines = tokio::io::BufReader::new(stderr);
|
||||
let mut buf = Vec::new();
|
||||
let res = loop {
|
||||
buf.clear();
|
||||
// TODO we don't trust the process to cap its stderr length.
|
||||
// Currently it can do unbounded Vec allocation.
|
||||
match stderr_lines.read_until(b'\n', &mut buf).await {
|
||||
Ok(0) => break Ok(()), // eof
|
||||
Ok(num_bytes) => {
|
||||
let output = String::from_utf8_lossy(&buf[..num_bytes]);
|
||||
error!(%output, "received output");
|
||||
}
|
||||
Err(e) => {
|
||||
break Err(e);
|
||||
}
|
||||
}
|
||||
};
|
||||
match res {
|
||||
Ok(()) => (),
|
||||
Err(e) => {
|
||||
error!(error=?e, "failed to read from walredo stderr");
|
||||
}
|
||||
}
|
||||
}.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %pg_version))
|
||||
);
|
||||
|
||||
Ok(Self {
|
||||
conf,
|
||||
tenant_shard_id,
|
||||
child: Some(child),
|
||||
stdin: Mutex::new(ProcessInput {
|
||||
stdin,
|
||||
n_requests: 0,
|
||||
}),
|
||||
stdout: Mutex::new(ProcessOutput {
|
||||
stdout,
|
||||
pending_responses: VecDeque::new(),
|
||||
n_processed_responses: 0,
|
||||
}),
|
||||
#[cfg(feature = "testing")]
|
||||
dump_sequence: AtomicUsize::default(),
|
||||
})
|
||||
}
|
||||
|
||||
pub(crate) fn id(&self) -> u32 {
|
||||
self.child
|
||||
.as_ref()
|
||||
.expect("must not call this during Drop")
|
||||
.id()
|
||||
}
|
||||
|
||||
// Apply given WAL records ('records') over an old page image. Returns
|
||||
// new page image.
|
||||
//
|
||||
#[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), pid=%self.id()))]
|
||||
pub(crate) async fn apply_wal_records(
|
||||
&self,
|
||||
rel: RelTag,
|
||||
blknum: u32,
|
||||
base_img: &Option<Bytes>,
|
||||
records: &[(Lsn, NeonWalRecord)],
|
||||
wal_redo_timeout: Duration,
|
||||
) -> anyhow::Result<Bytes> {
|
||||
let tag = protocol::BufferTag { rel, blknum };
|
||||
let input = self.stdin.lock().unwrap();
|
||||
|
||||
// Serialize all the messages to send the WAL redo process first.
|
||||
//
|
||||
// This could be problematic if there are millions of records to replay,
|
||||
// but in practice the number of records is usually so small that it doesn't
|
||||
// matter, and it's better to keep this code simple.
|
||||
//
|
||||
// Most requests start with a before-image with BLCKSZ bytes, followed by
|
||||
// by some other WAL records. Start with a buffer that can hold that
|
||||
// comfortably.
|
||||
let mut writebuf: Vec<u8> = Vec::with_capacity((BLCKSZ as usize) * 3);
|
||||
protocol::build_begin_redo_for_block_msg(tag, &mut writebuf);
|
||||
if let Some(img) = base_img {
|
||||
protocol::build_push_page_msg(tag, img, &mut writebuf);
|
||||
}
|
||||
for (lsn, rec) in records.iter() {
|
||||
if let NeonWalRecord::Postgres {
|
||||
will_init: _,
|
||||
rec: postgres_rec,
|
||||
} = rec
|
||||
{
|
||||
protocol::build_apply_record_msg(*lsn, postgres_rec, &mut writebuf);
|
||||
} else {
|
||||
anyhow::bail!("tried to pass neon wal record to postgres WAL redo");
|
||||
}
|
||||
}
|
||||
protocol::build_get_page_msg(tag, &mut writebuf);
|
||||
WAL_REDO_RECORD_COUNTER.inc_by(records.len() as u64);
|
||||
|
||||
let res = self.apply_wal_records0(&writebuf, input, wal_redo_timeout);
|
||||
|
||||
if res.is_err() {
|
||||
// not all of these can be caused by this particular input, however these are so rare
|
||||
// in tests so capture all.
|
||||
self.record_and_log(&writebuf);
|
||||
}
|
||||
|
||||
res
|
||||
}
|
||||
|
||||
fn apply_wal_records0(
|
||||
&self,
|
||||
writebuf: &[u8],
|
||||
input: MutexGuard<ProcessInput>,
|
||||
wal_redo_timeout: Duration,
|
||||
) -> anyhow::Result<Bytes> {
|
||||
let mut proc = { input }; // TODO: remove this legacy rename, but this keep the patch small.
|
||||
let mut nwrite = 0usize;
|
||||
|
||||
while nwrite < writebuf.len() {
|
||||
let mut stdin_pollfds = [PollFd::new(&proc.stdin, PollFlags::POLLOUT)];
|
||||
let n = loop {
|
||||
match nix::poll::poll(&mut stdin_pollfds[..], wal_redo_timeout.as_millis() as i32) {
|
||||
Err(nix::errno::Errno::EINTR) => continue,
|
||||
res => break res,
|
||||
}
|
||||
}?;
|
||||
|
||||
if n == 0 {
|
||||
anyhow::bail!("WAL redo timed out");
|
||||
}
|
||||
|
||||
// If 'stdin' is writeable, do write.
|
||||
let in_revents = stdin_pollfds[0].revents().unwrap();
|
||||
if in_revents & (PollFlags::POLLERR | PollFlags::POLLOUT) != PollFlags::empty() {
|
||||
nwrite += proc.stdin.write(&writebuf[nwrite..])?;
|
||||
}
|
||||
if in_revents.contains(PollFlags::POLLHUP) {
|
||||
// We still have more data to write, but the process closed the pipe.
|
||||
anyhow::bail!("WAL redo process closed its stdin unexpectedly");
|
||||
}
|
||||
}
|
||||
let request_no = proc.n_requests;
|
||||
proc.n_requests += 1;
|
||||
drop(proc);
|
||||
|
||||
// To improve walredo performance we separate sending requests and receiving
|
||||
// responses. Them are protected by different mutexes (output and input).
|
||||
// If thread T1, T2, T3 send requests D1, D2, D3 to walredo process
|
||||
// then there is not warranty that T1 will first granted output mutex lock.
|
||||
// To address this issue we maintain number of sent requests, number of processed
|
||||
// responses and ring buffer with pending responses. After sending response
|
||||
// (under input mutex), threads remembers request number. Then it releases
|
||||
// input mutex, locks output mutex and fetch in ring buffer all responses until
|
||||
// its stored request number. The it takes correspondent element from
|
||||
// pending responses ring buffer and truncate all empty elements from the front,
|
||||
// advancing processed responses number.
|
||||
|
||||
let mut output = self.stdout.lock().unwrap();
|
||||
let n_processed_responses = output.n_processed_responses;
|
||||
while n_processed_responses + output.pending_responses.len() <= request_no {
|
||||
// We expect the WAL redo process to respond with an 8k page image. We read it
|
||||
// into this buffer.
|
||||
let mut resultbuf = vec![0; BLCKSZ.into()];
|
||||
let mut nresult: usize = 0; // # of bytes read into 'resultbuf' so far
|
||||
while nresult < BLCKSZ.into() {
|
||||
let mut stdout_pollfds = [PollFd::new(&output.stdout, PollFlags::POLLIN)];
|
||||
// We do two things simultaneously: reading response from stdout
|
||||
// and forward any logging information that the child writes to its stderr to the page server's log.
|
||||
let n = loop {
|
||||
match nix::poll::poll(
|
||||
&mut stdout_pollfds[..],
|
||||
wal_redo_timeout.as_millis() as i32,
|
||||
) {
|
||||
Err(nix::errno::Errno::EINTR) => continue,
|
||||
res => break res,
|
||||
}
|
||||
}?;
|
||||
|
||||
if n == 0 {
|
||||
anyhow::bail!("WAL redo timed out");
|
||||
}
|
||||
|
||||
// If we have some data in stdout, read it to the result buffer.
|
||||
let out_revents = stdout_pollfds[0].revents().unwrap();
|
||||
if out_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() {
|
||||
nresult += output.stdout.read(&mut resultbuf[nresult..])?;
|
||||
}
|
||||
if out_revents.contains(PollFlags::POLLHUP) {
|
||||
anyhow::bail!("WAL redo process closed its stdout unexpectedly");
|
||||
}
|
||||
}
|
||||
output
|
||||
.pending_responses
|
||||
.push_back(Some(Bytes::from(resultbuf)));
|
||||
}
|
||||
// Replace our request's response with None in `pending_responses`.
|
||||
// Then make space in the ring buffer by clearing out any seqence of contiguous
|
||||
// `None`'s from the front of `pending_responses`.
|
||||
// NB: We can't pop_front() because other requests' responses because another
|
||||
// requester might have grabbed the output mutex before us:
|
||||
// T1: grab input mutex
|
||||
// T1: send request_no 23
|
||||
// T1: release input mutex
|
||||
// T2: grab input mutex
|
||||
// T2: send request_no 24
|
||||
// T2: release input mutex
|
||||
// T2: grab output mutex
|
||||
// T2: n_processed_responses + output.pending_responses.len() <= request_no
|
||||
// 23 0 24
|
||||
// T2: enters poll loop that reads stdout
|
||||
// T2: put response for 23 into pending_responses
|
||||
// T2: put response for 24 into pending_resposnes
|
||||
// pending_responses now looks like this: Front Some(response_23) Some(response_24) Back
|
||||
// T2: takes its response_24
|
||||
// pending_responses now looks like this: Front Some(response_23) None Back
|
||||
// T2: does the while loop below
|
||||
// pending_responses now looks like this: Front Some(response_23) None Back
|
||||
// T2: releases output mutex
|
||||
// T1: grabs output mutex
|
||||
// T1: n_processed_responses + output.pending_responses.len() > request_no
|
||||
// 23 2 23
|
||||
// T1: skips poll loop that reads stdout
|
||||
// T1: takes its response_23
|
||||
// pending_responses now looks like this: Front None None Back
|
||||
// T2: does the while loop below
|
||||
// pending_responses now looks like this: Front Back
|
||||
// n_processed_responses now has value 25
|
||||
let res = output.pending_responses[request_no - n_processed_responses]
|
||||
.take()
|
||||
.expect("we own this request_no, nobody else is supposed to take it");
|
||||
while let Some(front) = output.pending_responses.front() {
|
||||
if front.is_none() {
|
||||
output.pending_responses.pop_front();
|
||||
output.n_processed_responses += 1;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
Ok(res)
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
fn record_and_log(&self, writebuf: &[u8]) {
|
||||
use std::sync::atomic::Ordering;
|
||||
|
||||
let millis = std::time::SystemTime::now()
|
||||
.duration_since(std::time::SystemTime::UNIX_EPOCH)
|
||||
.unwrap()
|
||||
.as_millis();
|
||||
|
||||
let seq = self.dump_sequence.fetch_add(1, Ordering::Relaxed);
|
||||
|
||||
// these files will be collected to an allure report
|
||||
let filename = format!("walredo-{millis}-{}-{seq}.walredo", writebuf.len());
|
||||
|
||||
let path = self.conf.tenant_path(&self.tenant_shard_id).join(&filename);
|
||||
|
||||
let res = std::fs::OpenOptions::new()
|
||||
.write(true)
|
||||
.create_new(true)
|
||||
.read(true)
|
||||
.open(path)
|
||||
.and_then(|mut f| f.write_all(writebuf));
|
||||
|
||||
// trip up allowed_errors
|
||||
if let Err(e) = res {
|
||||
tracing::error!(target=%filename, length=writebuf.len(), "failed to write out the walredo errored input: {e}");
|
||||
} else {
|
||||
tracing::error!(filename, "erroring walredo input saved");
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(not(feature = "testing"))]
|
||||
fn record_and_log(&self, _: &[u8]) {}
|
||||
}
|
||||
|
||||
impl Drop for WalRedoProcess {
|
||||
fn drop(&mut self) {
|
||||
self.child
|
||||
.take()
|
||||
.expect("we only do this once")
|
||||
.kill_and_wait(WalRedoKillCause::WalRedoProcessDrop);
|
||||
// no way to wait for stderr_logger_task from Drop because that is async only
|
||||
}
|
||||
}
|
||||
@@ -2,8 +2,15 @@ mod classic;
|
||||
mod hacks;
|
||||
mod link;
|
||||
|
||||
use std::net::IpAddr;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
use ipnet::{Ipv4Net, Ipv6Net};
|
||||
pub use link::LinkAuthError;
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
use tokio_postgres::config::AuthKeys;
|
||||
use tracing::{info, warn};
|
||||
|
||||
use crate::auth::credentials::check_peer_addr_is_in_list;
|
||||
use crate::auth::validate_password_and_exchange;
|
||||
@@ -16,6 +23,7 @@ use crate::intern::EndpointIdInt;
|
||||
use crate::metrics::Metrics;
|
||||
use crate::proxy::connect_compute::ComputeConnectBackend;
|
||||
use crate::proxy::NeonOptions;
|
||||
use crate::rate_limiter::{BucketRateLimiter, RateBucketInfo};
|
||||
use crate::stream::Stream;
|
||||
use crate::{
|
||||
auth::{self, ComputeUserInfoMaybeEndpoint},
|
||||
@@ -28,9 +36,6 @@ use crate::{
|
||||
stream, url,
|
||||
};
|
||||
use crate::{scram, EndpointCacheKey, EndpointId, Normalize, RoleName};
|
||||
use std::sync::Arc;
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
use tracing::{info, warn};
|
||||
|
||||
/// Alternative to [`std::borrow::Cow`] but doesn't need `T: ToOwned` as we don't need that functionality
|
||||
pub enum MaybeOwned<'a, T> {
|
||||
@@ -176,11 +181,45 @@ impl TryFrom<ComputeUserInfoMaybeEndpoint> for ComputeUserInfo {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(PartialEq, PartialOrd, Hash, Eq, Ord, Debug, Copy, Clone)]
|
||||
pub struct MaskedIp(IpAddr);
|
||||
|
||||
impl MaskedIp {
|
||||
fn new(value: IpAddr, prefix: u8) -> Self {
|
||||
match value {
|
||||
IpAddr::V4(v4) => Self(IpAddr::V4(
|
||||
Ipv4Net::new(v4, prefix).map_or(v4, |x| x.trunc().addr()),
|
||||
)),
|
||||
IpAddr::V6(v6) => Self(IpAddr::V6(
|
||||
Ipv6Net::new(v6, prefix).map_or(v6, |x| x.trunc().addr()),
|
||||
)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// This can't be just per IP because that would limit some PaaS that share IP addresses
|
||||
pub type AuthRateLimiter = BucketRateLimiter<(EndpointIdInt, MaskedIp)>;
|
||||
|
||||
impl RateBucketInfo {
|
||||
/// All of these are per endpoint-maskedip pair.
|
||||
/// Context: 4096 rounds of pbkdf2 take about 1ms of cpu time to execute (1 milli-cpu-second or 1mcpus).
|
||||
///
|
||||
/// First bucket: 1000mcpus total per endpoint-ip pair
|
||||
/// * 4096000 requests per second with 1 hash rounds.
|
||||
/// * 1000 requests per second with 4096 hash rounds.
|
||||
/// * 6.8 requests per second with 600000 hash rounds.
|
||||
pub const DEFAULT_AUTH_SET: [Self; 3] = [
|
||||
Self::new(1000 * 4096, Duration::from_secs(1)),
|
||||
Self::new(600 * 4096, Duration::from_secs(60)),
|
||||
Self::new(300 * 4096, Duration::from_secs(600)),
|
||||
];
|
||||
}
|
||||
|
||||
impl AuthenticationConfig {
|
||||
pub fn check_rate_limit(
|
||||
&self,
|
||||
|
||||
ctx: &mut RequestMonitoring,
|
||||
config: &AuthenticationConfig,
|
||||
secret: AuthSecret,
|
||||
endpoint: &EndpointId,
|
||||
is_cleartext: bool,
|
||||
@@ -201,9 +240,13 @@ impl AuthenticationConfig {
|
||||
1
|
||||
};
|
||||
|
||||
let limit_not_exceeded = self
|
||||
.rate_limiter
|
||||
.check((endpoint_int, ctx.peer_addr), password_weight);
|
||||
let limit_not_exceeded = self.rate_limiter.check(
|
||||
(
|
||||
endpoint_int,
|
||||
MaskedIp::new(ctx.peer_addr, config.rate_limit_ip_subnet),
|
||||
),
|
||||
password_weight,
|
||||
);
|
||||
|
||||
if !limit_not_exceeded {
|
||||
warn!(
|
||||
@@ -271,6 +314,7 @@ async fn auth_quirks(
|
||||
let secret = match secret {
|
||||
Some(secret) => config.check_rate_limit(
|
||||
ctx,
|
||||
config,
|
||||
secret,
|
||||
&info.endpoint,
|
||||
unauthenticated_password.is_some() || allow_cleartext,
|
||||
@@ -473,7 +517,7 @@ impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, &()> {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::sync::Arc;
|
||||
use std::{net::IpAddr, sync::Arc, time::Duration};
|
||||
|
||||
use bytes::BytesMut;
|
||||
use fallible_iterator::FallibleIterator;
|
||||
@@ -486,7 +530,7 @@ mod tests {
|
||||
use tokio::io::{AsyncRead, AsyncReadExt, AsyncWriteExt};
|
||||
|
||||
use crate::{
|
||||
auth::{ComputeUserInfoMaybeEndpoint, IpPattern},
|
||||
auth::{backend::MaskedIp, ComputeUserInfoMaybeEndpoint, IpPattern},
|
||||
config::AuthenticationConfig,
|
||||
console::{
|
||||
self,
|
||||
@@ -495,12 +539,12 @@ mod tests {
|
||||
},
|
||||
context::RequestMonitoring,
|
||||
proxy::NeonOptions,
|
||||
rate_limiter::{AuthRateLimiter, RateBucketInfo},
|
||||
rate_limiter::RateBucketInfo,
|
||||
scram::ServerSecret,
|
||||
stream::{PqStream, Stream},
|
||||
};
|
||||
|
||||
use super::auth_quirks;
|
||||
use super::{auth_quirks, AuthRateLimiter};
|
||||
|
||||
struct Auth {
|
||||
ips: Vec<IpPattern>,
|
||||
@@ -541,6 +585,7 @@ mod tests {
|
||||
scram_protocol_timeout: std::time::Duration::from_secs(5),
|
||||
rate_limiter_enabled: true,
|
||||
rate_limiter: AuthRateLimiter::new(&RateBucketInfo::DEFAULT_AUTH_SET),
|
||||
rate_limit_ip_subnet: 64,
|
||||
});
|
||||
|
||||
async fn read_message(r: &mut (impl AsyncRead + Unpin), b: &mut BytesMut) -> PgMessage {
|
||||
@@ -552,6 +597,51 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn masked_ip() {
|
||||
let ip_a = IpAddr::V4([127, 0, 0, 1].into());
|
||||
let ip_b = IpAddr::V4([127, 0, 0, 2].into());
|
||||
let ip_c = IpAddr::V4([192, 168, 1, 101].into());
|
||||
let ip_d = IpAddr::V4([192, 168, 1, 102].into());
|
||||
let ip_e = IpAddr::V6("abcd:abcd:abcd:abcd:abcd:abcd:abcd:abcd".parse().unwrap());
|
||||
let ip_f = IpAddr::V6("abcd:abcd:abcd:abcd:1234:abcd:abcd:abcd".parse().unwrap());
|
||||
|
||||
assert_ne!(MaskedIp::new(ip_a, 64), MaskedIp::new(ip_b, 64));
|
||||
assert_ne!(MaskedIp::new(ip_a, 32), MaskedIp::new(ip_b, 32));
|
||||
assert_eq!(MaskedIp::new(ip_a, 30), MaskedIp::new(ip_b, 30));
|
||||
assert_eq!(MaskedIp::new(ip_c, 30), MaskedIp::new(ip_d, 30));
|
||||
|
||||
assert_ne!(MaskedIp::new(ip_e, 128), MaskedIp::new(ip_f, 128));
|
||||
assert_eq!(MaskedIp::new(ip_e, 64), MaskedIp::new(ip_f, 64));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_default_auth_rate_limit_set() {
|
||||
// these values used to exceed u32::MAX
|
||||
assert_eq!(
|
||||
RateBucketInfo::DEFAULT_AUTH_SET,
|
||||
[
|
||||
RateBucketInfo {
|
||||
interval: Duration::from_secs(1),
|
||||
max_rpi: 1000 * 4096,
|
||||
},
|
||||
RateBucketInfo {
|
||||
interval: Duration::from_secs(60),
|
||||
max_rpi: 600 * 4096 * 60,
|
||||
},
|
||||
RateBucketInfo {
|
||||
interval: Duration::from_secs(600),
|
||||
max_rpi: 300 * 4096 * 600,
|
||||
}
|
||||
]
|
||||
);
|
||||
|
||||
for x in RateBucketInfo::DEFAULT_AUTH_SET {
|
||||
let y = x.to_string().parse().unwrap();
|
||||
assert_eq!(x, y);
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn auth_quirks_scram() {
|
||||
let (mut client, server) = tokio::io::duplex(1024);
|
||||
|
||||
@@ -9,15 +9,13 @@ use futures::future::Either;
|
||||
use itertools::Itertools;
|
||||
use proxy::config::TlsServerEndPoint;
|
||||
use proxy::context::RequestMonitoring;
|
||||
use proxy::proxy::run_until_cancelled;
|
||||
use proxy::{BranchId, EndpointId, ProjectId};
|
||||
use proxy::proxy::{copy_bidirectional_client_compute, run_until_cancelled};
|
||||
use rustls::pki_types::PrivateKeyDer;
|
||||
use tokio::net::TcpListener;
|
||||
|
||||
use anyhow::{anyhow, bail, ensure, Context};
|
||||
use clap::Arg;
|
||||
use futures::TryFutureExt;
|
||||
use proxy::console::messages::MetricsAuxInfo;
|
||||
use proxy::stream::{PqStream, Stream};
|
||||
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
@@ -204,6 +202,7 @@ async fn task_main(
|
||||
const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)";
|
||||
|
||||
async fn ssl_handshake<S: AsyncRead + AsyncWrite + Unpin>(
|
||||
ctx: &mut RequestMonitoring,
|
||||
raw_stream: S,
|
||||
tls_config: Arc<rustls::ServerConfig>,
|
||||
tls_server_end_point: TlsServerEndPoint,
|
||||
@@ -233,7 +232,10 @@ async fn ssl_handshake<S: AsyncRead + AsyncWrite + Unpin>(
|
||||
}
|
||||
|
||||
Ok(Stream::Tls {
|
||||
tls: Box::new(raw.upgrade(tls_config).await?),
|
||||
tls: Box::new(
|
||||
raw.upgrade(tls_config, !ctx.has_private_peer_addr())
|
||||
.await?,
|
||||
),
|
||||
tls_server_end_point,
|
||||
})
|
||||
}
|
||||
@@ -256,7 +258,7 @@ async fn handle_client(
|
||||
tls_server_end_point: TlsServerEndPoint,
|
||||
stream: impl AsyncRead + AsyncWrite + Unpin,
|
||||
) -> anyhow::Result<()> {
|
||||
let tls_stream = ssl_handshake(stream, tls_config, tls_server_end_point).await?;
|
||||
let mut tls_stream = ssl_handshake(&mut ctx, stream, tls_config, tls_server_end_point).await?;
|
||||
|
||||
// Cut off first part of the SNI domain
|
||||
// We receive required destination details in the format of
|
||||
@@ -273,18 +275,15 @@ async fn handle_client(
|
||||
|
||||
info!("destination: {}", destination);
|
||||
|
||||
let client = tokio::net::TcpStream::connect(destination).await?;
|
||||
|
||||
let metrics_aux: MetricsAuxInfo = MetricsAuxInfo {
|
||||
endpoint_id: (&EndpointId::from("")).into(),
|
||||
project_id: (&ProjectId::from("")).into(),
|
||||
branch_id: (&BranchId::from("")).into(),
|
||||
cold_start_info: proxy::console::messages::ColdStartInfo::Unknown,
|
||||
};
|
||||
let mut client = tokio::net::TcpStream::connect(destination).await?;
|
||||
|
||||
// doesn't yet matter as pg-sni-router doesn't report analytics logs
|
||||
ctx.set_success();
|
||||
ctx.log();
|
||||
|
||||
proxy::proxy::passthrough::proxy_pass(tls_stream, client, metrics_aux).await
|
||||
// Starting from here we only proxy the client's traffic.
|
||||
info!("performing the proxy pass...");
|
||||
let _ = copy_bidirectional_client_compute(&mut tls_stream, &mut client).await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -7,6 +7,7 @@ use aws_config::provider_config::ProviderConfig;
|
||||
use aws_config::web_identity_token::WebIdentityTokenCredentialsProvider;
|
||||
use futures::future::Either;
|
||||
use proxy::auth;
|
||||
use proxy::auth::backend::AuthRateLimiter;
|
||||
use proxy::auth::backend::MaybeOwned;
|
||||
use proxy::cancellation::CancelMap;
|
||||
use proxy::cancellation::CancellationHandler;
|
||||
@@ -20,10 +21,8 @@ use proxy::context::parquet::ParquetUploadArgs;
|
||||
use proxy::http;
|
||||
use proxy::http::health_server::AppMetrics;
|
||||
use proxy::metrics::Metrics;
|
||||
use proxy::rate_limiter::AuthRateLimiter;
|
||||
use proxy::rate_limiter::EndpointRateLimiter;
|
||||
use proxy::rate_limiter::RateBucketInfo;
|
||||
use proxy::rate_limiter::RateLimiterConfig;
|
||||
use proxy::redis::cancellation_publisher::RedisPublisherClient;
|
||||
use proxy::redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider;
|
||||
use proxy::redis::elasticache;
|
||||
@@ -43,6 +42,7 @@ use tokio::task::JoinSet;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::info;
|
||||
use tracing::warn;
|
||||
use tracing::Instrument;
|
||||
use utils::{project_build_tag, project_git_version, sentry_init::init_sentry};
|
||||
|
||||
project_git_version!(GIT_VERSION);
|
||||
@@ -132,14 +132,8 @@ struct ProxyCliArgs {
|
||||
#[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
|
||||
require_client_ip: bool,
|
||||
/// Disable dynamic rate limiter and store the metrics to ensure its production behaviour.
|
||||
#[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
|
||||
#[clap(long, default_value_t = true, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
|
||||
disable_dynamic_rate_limiter: bool,
|
||||
/// Rate limit algorithm. Makes sense only if `disable_rate_limiter` is `false`.
|
||||
#[clap(value_enum, long, default_value_t = proxy::rate_limiter::RateLimitAlgorithm::Aimd)]
|
||||
rate_limit_algorithm: proxy::rate_limiter::RateLimitAlgorithm,
|
||||
/// Timeout for rate limiter. If it didn't manage to aquire a permit in this time, it will return an error.
|
||||
#[clap(long, default_value = "15s", value_parser = humantime::parse_duration)]
|
||||
rate_limiter_timeout: tokio::time::Duration,
|
||||
/// Endpoint rate limiter max number of requests per second.
|
||||
///
|
||||
/// Provided in the form '<Requests Per Second>@<Bucket Duration Size>'.
|
||||
@@ -152,14 +146,12 @@ struct ProxyCliArgs {
|
||||
/// Authentication rate limiter max number of hashes per second.
|
||||
#[clap(long, default_values_t = RateBucketInfo::DEFAULT_AUTH_SET)]
|
||||
auth_rate_limit: Vec<RateBucketInfo>,
|
||||
/// The IP subnet to use when considering whether two IP addresses are considered the same.
|
||||
#[clap(long, default_value_t = 64)]
|
||||
auth_rate_limit_ip_subnet: u8,
|
||||
/// Redis rate limiter max number of requests per second.
|
||||
#[clap(long, default_values_t = RateBucketInfo::DEFAULT_ENDPOINT_SET)]
|
||||
redis_rps_limit: Vec<RateBucketInfo>,
|
||||
/// Initial limit for dynamic rate limiter. Makes sense only if `rate_limit_algorithm` is *not* `None`.
|
||||
#[clap(long, default_value_t = 100)]
|
||||
initial_limit: usize,
|
||||
#[clap(flatten)]
|
||||
aimd_config: proxy::rate_limiter::AimdConfig,
|
||||
/// cache for `allowed_ips` (use `size=0` to disable)
|
||||
#[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)]
|
||||
allowed_ips_cache: String,
|
||||
@@ -427,7 +419,8 @@ async fn main() -> anyhow::Result<()> {
|
||||
if let Some(regional_redis_client) = regional_redis_client {
|
||||
let cache = api.caches.endpoints_cache.clone();
|
||||
let con = regional_redis_client;
|
||||
maintenance_tasks.spawn(async move { cache.do_read(con).await });
|
||||
let span = tracing::info_span!("endpoints_cache");
|
||||
maintenance_tasks.spawn(async move { cache.do_read(con).await }.instrument(span));
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -494,13 +487,9 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
|
||||
and metric-collection-interval must be specified"
|
||||
),
|
||||
};
|
||||
let rate_limiter_config = RateLimiterConfig {
|
||||
disable: args.disable_dynamic_rate_limiter,
|
||||
algorithm: args.rate_limit_algorithm,
|
||||
timeout: args.rate_limiter_timeout,
|
||||
initial_limit: args.initial_limit,
|
||||
aimd_config: Some(args.aimd_config),
|
||||
};
|
||||
if !args.disable_dynamic_rate_limiter {
|
||||
bail!("dynamic rate limiter should be disabled");
|
||||
}
|
||||
|
||||
let auth_backend = match &args.auth_backend {
|
||||
AuthBackend::Console => {
|
||||
@@ -542,7 +531,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
|
||||
tokio::spawn(locks.garbage_collect_worker());
|
||||
|
||||
let url = args.auth_endpoint.parse()?;
|
||||
let endpoint = http::Endpoint::new(url, http::new_client(rate_limiter_config));
|
||||
let endpoint = http::Endpoint::new(url, http::new_client());
|
||||
|
||||
let api = console::provider::neon::Api::new(endpoint, caches, locks);
|
||||
let api = console::provider::ConsoleBackend::Console(api);
|
||||
@@ -575,6 +564,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
|
||||
scram_protocol_timeout: args.scram_protocol_timeout,
|
||||
rate_limiter_enabled: args.auth_rate_limit_enabled,
|
||||
rate_limiter: AuthRateLimiter::new(args.auth_rate_limit.clone()),
|
||||
rate_limit_ip_subnet: args.auth_rate_limit_ip_subnet,
|
||||
};
|
||||
|
||||
let mut endpoint_rps_limit = args.endpoint_rps_limit.clone();
|
||||
|
||||
9
proxy/src/cache/endpoints.rs
vendored
9
proxy/src/cache/endpoints.rs
vendored
@@ -13,6 +13,7 @@ use redis::{
|
||||
};
|
||||
use serde::Deserialize;
|
||||
use tokio::sync::Mutex;
|
||||
use tracing::info;
|
||||
|
||||
use crate::{
|
||||
config::EndpointCacheConfig,
|
||||
@@ -71,7 +72,9 @@ impl EndpointsCache {
|
||||
}
|
||||
// If cache is disabled, just collect the metrics and return.
|
||||
if self.config.disable_cache {
|
||||
ctx.set_rejected(self.should_reject(endpoint));
|
||||
let rejected = self.should_reject(endpoint);
|
||||
ctx.set_rejected(rejected);
|
||||
info!(?rejected, "check endpoint is valid, disabled cache");
|
||||
return true;
|
||||
}
|
||||
// If the limiter allows, we don't need to check the cache.
|
||||
@@ -79,6 +82,7 @@ impl EndpointsCache {
|
||||
return true;
|
||||
}
|
||||
let rejected = self.should_reject(endpoint);
|
||||
info!(?rejected, "check endpoint is valid, enabled cache");
|
||||
ctx.set_rejected(rejected);
|
||||
!rejected
|
||||
}
|
||||
@@ -171,6 +175,9 @@ impl EndpointsCache {
|
||||
|
||||
if res.keys.is_empty() {
|
||||
if return_when_finish {
|
||||
if total != 0 {
|
||||
break;
|
||||
}
|
||||
anyhow::bail!(
|
||||
"Redis stream {} is empty, cannot be used to filter endpoints",
|
||||
self.config.stream_name
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use crate::{
|
||||
auth,
|
||||
rate_limiter::{AuthRateLimiter, RateBucketInfo},
|
||||
auth::{self, backend::AuthRateLimiter},
|
||||
rate_limiter::RateBucketInfo,
|
||||
serverless::GlobalConnPoolOptions,
|
||||
};
|
||||
use anyhow::{bail, ensure, Context, Ok};
|
||||
@@ -58,6 +58,7 @@ pub struct AuthenticationConfig {
|
||||
pub scram_protocol_timeout: tokio::time::Duration,
|
||||
pub rate_limiter_enabled: bool,
|
||||
pub rate_limiter: AuthRateLimiter,
|
||||
pub rate_limit_ip_subnet: u8,
|
||||
}
|
||||
|
||||
impl TlsConfig {
|
||||
|
||||
@@ -5,7 +5,7 @@ use once_cell::sync::OnceCell;
|
||||
use smol_str::SmolStr;
|
||||
use std::net::IpAddr;
|
||||
use tokio::sync::mpsc;
|
||||
use tracing::{field::display, info_span, Span};
|
||||
use tracing::{field::display, info, info_span, Span};
|
||||
use uuid::Uuid;
|
||||
|
||||
use crate::{
|
||||
@@ -76,6 +76,7 @@ impl RequestMonitoring {
|
||||
?session_id,
|
||||
%peer_addr,
|
||||
ep = tracing::field::Empty,
|
||||
role = tracing::field::Empty,
|
||||
);
|
||||
|
||||
Self {
|
||||
@@ -157,6 +158,7 @@ impl RequestMonitoring {
|
||||
}
|
||||
|
||||
pub fn set_user(&mut self, user: RoleName) {
|
||||
self.span.record("role", display(&user));
|
||||
self.user = Some(user);
|
||||
}
|
||||
|
||||
@@ -164,8 +166,18 @@ impl RequestMonitoring {
|
||||
self.auth_method = Some(auth_method);
|
||||
}
|
||||
|
||||
pub fn has_private_peer_addr(&self) -> bool {
|
||||
match self.peer_addr {
|
||||
IpAddr::V4(ip) => ip.is_private(),
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn set_error_kind(&mut self, kind: ErrorKind) {
|
||||
Metrics::get().proxy.errors_total.inc(kind);
|
||||
// Do not record errors from the private address to metrics.
|
||||
if !self.has_private_peer_addr() {
|
||||
Metrics::get().proxy.errors_total.inc(kind);
|
||||
}
|
||||
if let Some(ep) = &self.endpoint_id {
|
||||
let metric = &Metrics::get().proxy.endpoints_affected_by_errors;
|
||||
let label = metric.with_labels(kind);
|
||||
@@ -188,12 +200,25 @@ impl Drop for RequestMonitoring {
|
||||
} else {
|
||||
ConnectOutcome::Failed
|
||||
};
|
||||
let rejected = self.rejected;
|
||||
let ep = self
|
||||
.endpoint_id
|
||||
.as_ref()
|
||||
.map(|x| x.as_str())
|
||||
.unwrap_or_default();
|
||||
// This makes sense only if cache is disabled
|
||||
info!(
|
||||
?ep,
|
||||
?outcome,
|
||||
?rejected,
|
||||
"check endpoint is valid with outcome"
|
||||
);
|
||||
Metrics::get()
|
||||
.proxy
|
||||
.invalid_endpoints_total
|
||||
.inc(InvalidEndpointsGroup {
|
||||
protocol: self.protocol,
|
||||
rejected: self.rejected.into(),
|
||||
rejected: rejected.into(),
|
||||
outcome,
|
||||
});
|
||||
if let Some(tx) = self.sender.take() {
|
||||
|
||||
@@ -15,7 +15,6 @@ use tracing::trace;
|
||||
|
||||
use crate::{
|
||||
metrics::{ConsoleRequest, Metrics},
|
||||
rate_limiter,
|
||||
url::ApiUrl,
|
||||
};
|
||||
use reqwest_middleware::RequestBuilder;
|
||||
@@ -23,7 +22,7 @@ use reqwest_middleware::RequestBuilder;
|
||||
/// This is the preferred way to create new http clients,
|
||||
/// because it takes care of observability (OpenTelemetry).
|
||||
/// We deliberately don't want to replace this with a public static.
|
||||
pub fn new_client(rate_limiter_config: rate_limiter::RateLimiterConfig) -> ClientWithMiddleware {
|
||||
pub fn new_client() -> ClientWithMiddleware {
|
||||
let client = reqwest::ClientBuilder::new()
|
||||
.dns_resolver(Arc::new(GaiResolver::default()))
|
||||
.connection_verbose(true)
|
||||
@@ -32,7 +31,6 @@ pub fn new_client(rate_limiter_config: rate_limiter::RateLimiterConfig) -> Clien
|
||||
|
||||
reqwest_middleware::ClientBuilder::new(client)
|
||||
.with(reqwest_tracing::TracingMiddleware::default())
|
||||
.with(rate_limiter::Limiter::new(rate_limiter_config))
|
||||
.build()
|
||||
}
|
||||
|
||||
|
||||
@@ -4,8 +4,8 @@ use lasso::ThreadedRodeo;
|
||||
use measured::{
|
||||
label::StaticLabelSet,
|
||||
metric::{histogram::Thresholds, name::MetricName},
|
||||
Counter, CounterVec, FixedCardinalityLabel, Gauge, GaugeVec, Histogram, HistogramVec,
|
||||
LabelGroup, MetricGroup,
|
||||
Counter, CounterVec, FixedCardinalityLabel, Gauge, Histogram, HistogramVec, LabelGroup,
|
||||
MetricGroup,
|
||||
};
|
||||
use metrics::{CounterPairAssoc, CounterPairVec, HyperLogLog, HyperLogLogVec};
|
||||
|
||||
@@ -20,9 +20,6 @@ pub struct Metrics {
|
||||
|
||||
#[metric(namespace = "wake_compute_lock")]
|
||||
pub wake_compute_lock: ApiLockMetrics,
|
||||
|
||||
// the one metric not called proxy_....
|
||||
pub semaphore_control_plane_limit: GaugeVec<StaticLabelSet<RateLimit>>,
|
||||
}
|
||||
|
||||
impl Metrics {
|
||||
@@ -31,7 +28,6 @@ impl Metrics {
|
||||
SELF.get_or_init(|| Metrics {
|
||||
proxy: ProxyMetrics::default(),
|
||||
wake_compute_lock: ApiLockMetrics::new(),
|
||||
semaphore_control_plane_limit: GaugeVec::default(),
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -286,13 +282,6 @@ pub enum LatencyExclusions {
|
||||
ClientAndCplane,
|
||||
}
|
||||
|
||||
#[derive(FixedCardinalityLabel, Copy, Clone)]
|
||||
#[label(singleton = "limit")]
|
||||
pub enum RateLimit {
|
||||
Actual,
|
||||
Expected,
|
||||
}
|
||||
|
||||
#[derive(FixedCardinalityLabel, Copy, Clone)]
|
||||
#[label(singleton = "kind")]
|
||||
pub enum SniKind {
|
||||
|
||||
@@ -7,6 +7,7 @@ pub mod handshake;
|
||||
pub mod passthrough;
|
||||
pub mod retry;
|
||||
pub mod wake_compute;
|
||||
pub use copy_bidirectional::copy_bidirectional_client_compute;
|
||||
|
||||
use crate::{
|
||||
auth,
|
||||
@@ -256,8 +257,9 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
|
||||
|
||||
let tls = config.tls_config.as_ref();
|
||||
|
||||
let record_handshake_error = !ctx.has_private_peer_addr();
|
||||
let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Client);
|
||||
let do_handshake = handshake(stream, mode.handshake_tls(tls));
|
||||
let do_handshake = handshake(stream, mode.handshake_tls(tls), record_handshake_error);
|
||||
let (mut stream, params) =
|
||||
match tokio::time::timeout(config.handshake_timeout, do_handshake).await?? {
|
||||
HandshakeData::Startup(stream, params) => (stream, params),
|
||||
|
||||
@@ -41,7 +41,7 @@ where
|
||||
}
|
||||
|
||||
#[tracing::instrument(skip_all)]
|
||||
pub(super) async fn copy_bidirectional_client_compute<Client, Compute>(
|
||||
pub async fn copy_bidirectional_client_compute<Client, Compute>(
|
||||
client: &mut Client,
|
||||
compute: &mut Compute,
|
||||
) -> Result<(u64, u64), std::io::Error>
|
||||
|
||||
@@ -63,6 +63,7 @@ pub enum HandshakeData<S> {
|
||||
pub async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
|
||||
stream: S,
|
||||
mut tls: Option<&TlsConfig>,
|
||||
record_handshake_error: bool,
|
||||
) -> Result<HandshakeData<S>, HandshakeError> {
|
||||
// Client may try upgrading to each protocol only once
|
||||
let (mut tried_ssl, mut tried_gss) = (false, false);
|
||||
@@ -95,7 +96,9 @@ pub async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
|
||||
if !read_buf.is_empty() {
|
||||
return Err(HandshakeError::EarlyData);
|
||||
}
|
||||
let tls_stream = raw.upgrade(tls.to_server_config()).await?;
|
||||
let tls_stream = raw
|
||||
.upgrade(tls.to_server_config(), record_handshake_error)
|
||||
.await?;
|
||||
|
||||
let (_, tls_server_end_point) = tls
|
||||
.cert_resolver
|
||||
|
||||
@@ -175,7 +175,7 @@ async fn dummy_proxy(
|
||||
auth: impl TestAuth + Send,
|
||||
) -> anyhow::Result<()> {
|
||||
let client = WithClientIp::new(client);
|
||||
let mut stream = match handshake(client, tls.as_ref()).await? {
|
||||
let mut stream = match handshake(client, tls.as_ref(), false).await? {
|
||||
HandshakeData::Startup(stream, _) => stream,
|
||||
HandshakeData::Cancel(_) => bail!("cancellation not supported"),
|
||||
};
|
||||
|
||||
@@ -34,7 +34,10 @@ async fn proxy_mitm(
|
||||
tokio::spawn(async move {
|
||||
// begin handshake with end_server
|
||||
let end_server = connect_tls(server2, client_config2.make_tls_connect().unwrap()).await;
|
||||
let (end_client, startup) = match handshake(client1, Some(&server_config1)).await.unwrap() {
|
||||
let (end_client, startup) = match handshake(client1, Some(&server_config1), false)
|
||||
.await
|
||||
.unwrap()
|
||||
{
|
||||
HandshakeData::Startup(stream, params) => (stream, params),
|
||||
HandshakeData::Cancel(_) => panic!("cancellation not supported"),
|
||||
};
|
||||
|
||||
@@ -1,7 +1,2 @@
|
||||
mod aimd;
|
||||
mod limit_algorithm;
|
||||
mod limiter;
|
||||
pub use aimd::Aimd;
|
||||
pub use limit_algorithm::{AimdConfig, Fixed, RateLimitAlgorithm, RateLimiterConfig};
|
||||
pub use limiter::Limiter;
|
||||
pub use limiter::{AuthRateLimiter, EndpointRateLimiter, GlobalRateLimiter, RateBucketInfo};
|
||||
pub use limiter::{BucketRateLimiter, EndpointRateLimiter, GlobalRateLimiter, RateBucketInfo};
|
||||
|
||||
@@ -1,166 +0,0 @@
|
||||
use std::usize;
|
||||
|
||||
use async_trait::async_trait;
|
||||
|
||||
use super::limit_algorithm::{AimdConfig, LimitAlgorithm, Sample};
|
||||
|
||||
use super::limiter::Outcome;
|
||||
|
||||
/// Loss-based congestion avoidance.
|
||||
///
|
||||
/// Additive-increase, multiplicative decrease.
|
||||
///
|
||||
/// Adds available currency when:
|
||||
/// 1. no load-based errors are observed, and
|
||||
/// 2. the utilisation of the current limit is high.
|
||||
///
|
||||
/// Reduces available concurrency by a factor when load-based errors are detected.
|
||||
pub struct Aimd {
|
||||
min_limit: usize,
|
||||
max_limit: usize,
|
||||
decrease_factor: f32,
|
||||
increase_by: usize,
|
||||
min_utilisation_threshold: f32,
|
||||
}
|
||||
|
||||
impl Aimd {
|
||||
pub fn new(config: AimdConfig) -> Self {
|
||||
Self {
|
||||
min_limit: config.aimd_min_limit,
|
||||
max_limit: config.aimd_max_limit,
|
||||
decrease_factor: config.aimd_decrease_factor,
|
||||
increase_by: config.aimd_increase_by,
|
||||
min_utilisation_threshold: config.aimd_min_utilisation_threshold,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl LimitAlgorithm for Aimd {
|
||||
async fn update(&mut self, old_limit: usize, sample: Sample) -> usize {
|
||||
use Outcome::*;
|
||||
match sample.outcome {
|
||||
Success => {
|
||||
let utilisation = sample.in_flight as f32 / old_limit as f32;
|
||||
|
||||
if utilisation > self.min_utilisation_threshold {
|
||||
let limit = old_limit + self.increase_by;
|
||||
limit.clamp(self.min_limit, self.max_limit)
|
||||
} else {
|
||||
old_limit
|
||||
}
|
||||
}
|
||||
Overload => {
|
||||
let limit = old_limit as f32 * self.decrease_factor;
|
||||
|
||||
// Floor instead of round, so the limit reduces even with small numbers.
|
||||
// E.g. round(2 * 0.9) = 2, but floor(2 * 0.9) = 1
|
||||
let limit = limit.floor() as usize;
|
||||
|
||||
limit.clamp(self.min_limit, self.max_limit)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::sync::Arc;
|
||||
|
||||
use tokio::sync::Notify;
|
||||
|
||||
use super::*;
|
||||
|
||||
use crate::rate_limiter::{Limiter, RateLimiterConfig};
|
||||
|
||||
#[tokio::test]
|
||||
async fn should_decrease_limit_on_overload() {
|
||||
let config = RateLimiterConfig {
|
||||
initial_limit: 10,
|
||||
aimd_config: Some(AimdConfig {
|
||||
aimd_decrease_factor: 0.5,
|
||||
..Default::default()
|
||||
}),
|
||||
disable: false,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let release_notifier = Arc::new(Notify::new());
|
||||
|
||||
let limiter = Limiter::new(config).with_release_notifier(release_notifier.clone());
|
||||
|
||||
let token = limiter.try_acquire().unwrap();
|
||||
limiter.release(token, Some(Outcome::Overload)).await;
|
||||
release_notifier.notified().await;
|
||||
assert_eq!(limiter.state().limit(), 5, "overload: decrease");
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn should_increase_limit_on_success_when_using_gt_util_threshold() {
|
||||
let config = RateLimiterConfig {
|
||||
initial_limit: 4,
|
||||
aimd_config: Some(AimdConfig {
|
||||
aimd_decrease_factor: 0.5,
|
||||
aimd_min_utilisation_threshold: 0.5,
|
||||
aimd_increase_by: 1,
|
||||
..Default::default()
|
||||
}),
|
||||
disable: false,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let limiter = Limiter::new(config);
|
||||
|
||||
let token = limiter.try_acquire().unwrap();
|
||||
let _token = limiter.try_acquire().unwrap();
|
||||
let _token = limiter.try_acquire().unwrap();
|
||||
|
||||
limiter.release(token, Some(Outcome::Success)).await;
|
||||
assert_eq!(limiter.state().limit(), 5, "success: increase");
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn should_not_change_limit_on_success_when_using_lt_util_threshold() {
|
||||
let config = RateLimiterConfig {
|
||||
initial_limit: 4,
|
||||
aimd_config: Some(AimdConfig {
|
||||
aimd_decrease_factor: 0.5,
|
||||
aimd_min_utilisation_threshold: 0.5,
|
||||
..Default::default()
|
||||
}),
|
||||
disable: false,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let limiter = Limiter::new(config);
|
||||
|
||||
let token = limiter.try_acquire().unwrap();
|
||||
|
||||
limiter.release(token, Some(Outcome::Success)).await;
|
||||
assert_eq!(
|
||||
limiter.state().limit(),
|
||||
4,
|
||||
"success: ignore when < half limit"
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn should_not_change_limit_when_no_outcome() {
|
||||
let config = RateLimiterConfig {
|
||||
initial_limit: 10,
|
||||
aimd_config: Some(AimdConfig {
|
||||
aimd_decrease_factor: 0.5,
|
||||
aimd_min_utilisation_threshold: 0.5,
|
||||
..Default::default()
|
||||
}),
|
||||
disable: false,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let limiter = Limiter::new(config);
|
||||
|
||||
let token = limiter.try_acquire().unwrap();
|
||||
limiter.release(token, None).await;
|
||||
assert_eq!(limiter.state().limit(), 10, "ignore");
|
||||
}
|
||||
}
|
||||
@@ -1,98 +0,0 @@
|
||||
//! Algorithms for controlling concurrency limits.
|
||||
use async_trait::async_trait;
|
||||
use std::time::Duration;
|
||||
|
||||
use super::{limiter::Outcome, Aimd};
|
||||
|
||||
/// An algorithm for controlling a concurrency limit.
|
||||
#[async_trait]
|
||||
pub trait LimitAlgorithm: Send + Sync + 'static {
|
||||
/// Update the concurrency limit in response to a new job completion.
|
||||
async fn update(&mut self, old_limit: usize, sample: Sample) -> usize;
|
||||
}
|
||||
|
||||
/// The result of a job (or jobs), including the [Outcome] (loss) and latency (delay).
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub struct Sample {
|
||||
pub(crate) latency: Duration,
|
||||
/// Jobs in flight when the sample was taken.
|
||||
pub(crate) in_flight: usize,
|
||||
pub(crate) outcome: Outcome,
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy, Debug, Default, clap::ValueEnum)]
|
||||
pub enum RateLimitAlgorithm {
|
||||
Fixed,
|
||||
#[default]
|
||||
Aimd,
|
||||
}
|
||||
|
||||
pub struct Fixed;
|
||||
|
||||
#[async_trait]
|
||||
impl LimitAlgorithm for Fixed {
|
||||
async fn update(&mut self, old_limit: usize, _sample: Sample) -> usize {
|
||||
old_limit
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy, Debug)]
|
||||
pub struct RateLimiterConfig {
|
||||
pub disable: bool,
|
||||
pub algorithm: RateLimitAlgorithm,
|
||||
pub timeout: Duration,
|
||||
pub initial_limit: usize,
|
||||
pub aimd_config: Option<AimdConfig>,
|
||||
}
|
||||
|
||||
impl RateLimiterConfig {
|
||||
pub fn create_rate_limit_algorithm(self) -> Box<dyn LimitAlgorithm> {
|
||||
match self.algorithm {
|
||||
RateLimitAlgorithm::Fixed => Box::new(Fixed),
|
||||
RateLimitAlgorithm::Aimd => Box::new(Aimd::new(self.aimd_config.unwrap())), // For aimd algorithm config is mandatory.
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for RateLimiterConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
disable: true,
|
||||
algorithm: RateLimitAlgorithm::Aimd,
|
||||
timeout: Duration::from_secs(1),
|
||||
initial_limit: 100,
|
||||
aimd_config: Some(AimdConfig::default()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(clap::Parser, Clone, Copy, Debug)]
|
||||
pub struct AimdConfig {
|
||||
/// Minimum limit for AIMD algorithm. Makes sense only if `rate_limit_algorithm` is `Aimd`.
|
||||
#[clap(long, default_value_t = 1)]
|
||||
pub aimd_min_limit: usize,
|
||||
/// Maximum limit for AIMD algorithm. Makes sense only if `rate_limit_algorithm` is `Aimd`.
|
||||
#[clap(long, default_value_t = 1500)]
|
||||
pub aimd_max_limit: usize,
|
||||
/// Increase AIMD increase by value in case of success. Makes sense only if `rate_limit_algorithm` is `Aimd`.
|
||||
#[clap(long, default_value_t = 10)]
|
||||
pub aimd_increase_by: usize,
|
||||
/// Decrease AIMD decrease by value in case of timout/429. Makes sense only if `rate_limit_algorithm` is `Aimd`.
|
||||
#[clap(long, default_value_t = 0.9)]
|
||||
pub aimd_decrease_factor: f32,
|
||||
/// A threshold below which the limit won't be increased. Makes sense only if `rate_limit_algorithm` is `Aimd`.
|
||||
#[clap(long, default_value_t = 0.8)]
|
||||
pub aimd_min_utilisation_threshold: f32,
|
||||
}
|
||||
|
||||
impl Default for AimdConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
aimd_min_limit: 1,
|
||||
aimd_max_limit: 1500,
|
||||
aimd_increase_by: 10,
|
||||
aimd_decrease_factor: 0.9,
|
||||
aimd_min_utilisation_threshold: 0.8,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -2,10 +2,9 @@ use std::{
|
||||
borrow::Cow,
|
||||
collections::hash_map::RandomState,
|
||||
hash::{BuildHasher, Hash},
|
||||
net::IpAddr,
|
||||
sync::{
|
||||
atomic::{AtomicUsize, Ordering},
|
||||
Arc, Mutex,
|
||||
Mutex,
|
||||
},
|
||||
};
|
||||
|
||||
@@ -13,22 +12,10 @@ use anyhow::bail;
|
||||
use dashmap::DashMap;
|
||||
use itertools::Itertools;
|
||||
use rand::{rngs::StdRng, Rng, SeedableRng};
|
||||
use tokio::sync::{Mutex as AsyncMutex, Semaphore, SemaphorePermit};
|
||||
use tokio::time::{timeout, Duration, Instant};
|
||||
use tokio::time::{Duration, Instant};
|
||||
use tracing::info;
|
||||
|
||||
use crate::{
|
||||
intern::EndpointIdInt,
|
||||
{
|
||||
metrics::{Metrics, RateLimit},
|
||||
EndpointId,
|
||||
},
|
||||
};
|
||||
|
||||
use super::{
|
||||
limit_algorithm::{LimitAlgorithm, Sample},
|
||||
RateLimiterConfig,
|
||||
};
|
||||
use crate::EndpointId;
|
||||
|
||||
pub struct GlobalRateLimiter {
|
||||
data: Vec<RateBucket>,
|
||||
@@ -81,9 +68,6 @@ impl GlobalRateLimiter {
|
||||
// I went with a more expensive way that yields user-friendlier error messages.
|
||||
pub type EndpointRateLimiter = BucketRateLimiter<EndpointId, StdRng, RandomState>;
|
||||
|
||||
// This can't be just per IP because that would limit some PaaS that share IP addresses
|
||||
pub type AuthRateLimiter = BucketRateLimiter<(EndpointIdInt, IpAddr), StdRng, RandomState>;
|
||||
|
||||
pub struct BucketRateLimiter<Key, Rand = StdRng, Hasher = RandomState> {
|
||||
map: DashMap<Key, Vec<RateBucket>, Hasher>,
|
||||
info: Cow<'static, [RateBucketInfo]>,
|
||||
@@ -155,19 +139,6 @@ impl RateBucketInfo {
|
||||
Self::new(100, Duration::from_secs(600)),
|
||||
];
|
||||
|
||||
/// All of these are per endpoint-ip pair.
|
||||
/// Context: 4096 rounds of pbkdf2 take about 1ms of cpu time to execute (1 milli-cpu-second or 1mcpus).
|
||||
///
|
||||
/// First bucket: 300mcpus total per endpoint-ip pair
|
||||
/// * 1228800 requests per second with 1 hash rounds. (endpoint rate limiter will catch this first)
|
||||
/// * 300 requests per second with 4096 hash rounds.
|
||||
/// * 2 requests per second with 600000 hash rounds.
|
||||
pub const DEFAULT_AUTH_SET: [Self; 3] = [
|
||||
Self::new(300 * 4096, Duration::from_secs(1)),
|
||||
Self::new(200 * 4096, Duration::from_secs(60)),
|
||||
Self::new(100 * 4096, Duration::from_secs(600)),
|
||||
];
|
||||
|
||||
pub fn validate(info: &mut [Self]) -> anyhow::Result<()> {
|
||||
info.sort_unstable_by_key(|info| info.interval);
|
||||
let invalid = info
|
||||
@@ -265,423 +236,16 @@ impl<K: Hash + Eq, R: Rng, S: BuildHasher + Clone> BucketRateLimiter<K, R, S> {
|
||||
}
|
||||
}
|
||||
|
||||
/// Limits the number of concurrent jobs.
|
||||
///
|
||||
/// Concurrency is limited through the use of [Token]s. Acquire a token to run a job, and release the
|
||||
/// token once the job is finished.
|
||||
///
|
||||
/// The limit will be automatically adjusted based on observed latency (delay) and/or failures
|
||||
/// caused by overload (loss).
|
||||
pub struct Limiter {
|
||||
limit_algo: AsyncMutex<Box<dyn LimitAlgorithm>>,
|
||||
semaphore: std::sync::Arc<Semaphore>,
|
||||
config: RateLimiterConfig,
|
||||
|
||||
// ONLY WRITE WHEN LIMIT_ALGO IS LOCKED
|
||||
limits: AtomicUsize,
|
||||
|
||||
// ONLY USE ATOMIC ADD/SUB
|
||||
in_flight: Arc<AtomicUsize>,
|
||||
|
||||
#[cfg(test)]
|
||||
notifier: Option<std::sync::Arc<tokio::sync::Notify>>,
|
||||
}
|
||||
|
||||
/// A concurrency token, required to run a job.
|
||||
///
|
||||
/// Release the token back to the [Limiter] after the job is complete.
|
||||
#[derive(Debug)]
|
||||
pub struct Token<'t> {
|
||||
permit: Option<tokio::sync::SemaphorePermit<'t>>,
|
||||
start: Instant,
|
||||
in_flight: Arc<AtomicUsize>,
|
||||
}
|
||||
|
||||
/// A snapshot of the state of the [Limiter].
|
||||
///
|
||||
/// Not guaranteed to be consistent under high concurrency.
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub struct LimiterState {
|
||||
limit: usize,
|
||||
in_flight: usize,
|
||||
}
|
||||
|
||||
/// Whether a job succeeded or failed as a result of congestion/overload.
|
||||
///
|
||||
/// Errors not considered to be caused by overload should be ignored.
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||
pub enum Outcome {
|
||||
/// The job succeeded, or failed in a way unrelated to overload.
|
||||
Success,
|
||||
/// The job failed because of overload, e.g. it timed out or an explicit backpressure signal
|
||||
/// was observed.
|
||||
Overload,
|
||||
}
|
||||
|
||||
impl Outcome {
|
||||
fn from_reqwest_error(error: &reqwest_middleware::Error) -> Self {
|
||||
match error {
|
||||
reqwest_middleware::Error::Middleware(_) => Outcome::Success,
|
||||
reqwest_middleware::Error::Reqwest(e) => {
|
||||
if let Some(status) = e.status() {
|
||||
if status.is_server_error()
|
||||
|| reqwest::StatusCode::TOO_MANY_REQUESTS.as_u16() == status
|
||||
{
|
||||
Outcome::Overload
|
||||
} else {
|
||||
Outcome::Success
|
||||
}
|
||||
} else {
|
||||
Outcome::Success
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
fn from_reqwest_response(response: &reqwest::Response) -> Self {
|
||||
if response.status().is_server_error()
|
||||
|| response.status() == reqwest::StatusCode::TOO_MANY_REQUESTS
|
||||
{
|
||||
Outcome::Overload
|
||||
} else {
|
||||
Outcome::Success
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Limiter {
|
||||
/// Create a limiter with a given limit control algorithm.
|
||||
pub fn new(config: RateLimiterConfig) -> Self {
|
||||
assert!(config.initial_limit > 0);
|
||||
Self {
|
||||
limit_algo: AsyncMutex::new(config.create_rate_limit_algorithm()),
|
||||
semaphore: Arc::new(Semaphore::new(config.initial_limit)),
|
||||
config,
|
||||
limits: AtomicUsize::new(config.initial_limit),
|
||||
in_flight: Arc::new(AtomicUsize::new(0)),
|
||||
#[cfg(test)]
|
||||
notifier: None,
|
||||
}
|
||||
}
|
||||
// pub fn new(limit_algorithm: T, timeout: Duration, initial_limit: usize) -> Self {
|
||||
// assert!(initial_limit > 0);
|
||||
|
||||
// Self {
|
||||
// limit_algo: AsyncMutex::new(limit_algorithm),
|
||||
// semaphore: Arc::new(Semaphore::new(initial_limit)),
|
||||
// timeout,
|
||||
// limits: AtomicUsize::new(initial_limit),
|
||||
// in_flight: Arc::new(AtomicUsize::new(0)),
|
||||
// #[cfg(test)]
|
||||
// notifier: None,
|
||||
// }
|
||||
// }
|
||||
|
||||
/// In some cases [Token]s are acquired asynchronously when updating the limit.
|
||||
#[cfg(test)]
|
||||
pub fn with_release_notifier(mut self, n: std::sync::Arc<tokio::sync::Notify>) -> Self {
|
||||
self.notifier = Some(n);
|
||||
self
|
||||
}
|
||||
|
||||
/// Try to immediately acquire a concurrency [Token].
|
||||
///
|
||||
/// Returns `None` if there are none available.
|
||||
pub fn try_acquire(&self) -> Option<Token> {
|
||||
let result = if self.config.disable {
|
||||
// If the rate limiter is disabled, we can always acquire a token.
|
||||
Some(Token::new(None, self.in_flight.clone()))
|
||||
} else {
|
||||
self.semaphore
|
||||
.try_acquire()
|
||||
.map(|permit| Token::new(Some(permit), self.in_flight.clone()))
|
||||
.ok()
|
||||
};
|
||||
if result.is_some() {
|
||||
self.in_flight.fetch_add(1, Ordering::AcqRel);
|
||||
}
|
||||
result
|
||||
}
|
||||
|
||||
/// Try to acquire a concurrency [Token], waiting for `duration` if there are none available.
|
||||
///
|
||||
/// Returns `None` if there are none available after `duration`.
|
||||
pub async fn acquire_timeout(&self, duration: Duration) -> Option<Token<'_>> {
|
||||
info!("acquiring token: {:?}", self.semaphore.available_permits());
|
||||
let result = if self.config.disable {
|
||||
// If the rate limiter is disabled, we can always acquire a token.
|
||||
Some(Token::new(None, self.in_flight.clone()))
|
||||
} else {
|
||||
match timeout(duration, self.semaphore.acquire()).await {
|
||||
Ok(maybe_permit) => maybe_permit
|
||||
.map(|permit| Token::new(Some(permit), self.in_flight.clone()))
|
||||
.ok(),
|
||||
Err(_) => None,
|
||||
}
|
||||
};
|
||||
if result.is_some() {
|
||||
self.in_flight.fetch_add(1, Ordering::AcqRel);
|
||||
}
|
||||
result
|
||||
}
|
||||
|
||||
/// Return the concurrency [Token], along with the outcome of the job.
|
||||
///
|
||||
/// The [Outcome] of the job, and the time taken to perform it, may be used
|
||||
/// to update the concurrency limit.
|
||||
///
|
||||
/// Set the outcome to `None` to ignore the job.
|
||||
pub async fn release(&self, mut token: Token<'_>, outcome: Option<Outcome>) {
|
||||
tracing::info!("outcome is {:?}", outcome);
|
||||
let in_flight = self.in_flight.load(Ordering::Acquire);
|
||||
let old_limit = self.limits.load(Ordering::Acquire);
|
||||
let available = if self.config.disable {
|
||||
0 // This is not used in the algorithm and can be anything. If the config disable it makes sense to set it to 0.
|
||||
} else {
|
||||
self.semaphore.available_permits()
|
||||
};
|
||||
let total = in_flight + available;
|
||||
|
||||
let mut algo = self.limit_algo.lock().await;
|
||||
|
||||
let new_limit = if let Some(outcome) = outcome {
|
||||
let sample = Sample {
|
||||
latency: token.start.elapsed(),
|
||||
in_flight,
|
||||
outcome,
|
||||
};
|
||||
algo.update(old_limit, sample).await
|
||||
} else {
|
||||
old_limit
|
||||
};
|
||||
tracing::info!("new limit is {}", new_limit);
|
||||
let actual_limit = if new_limit < total {
|
||||
token.forget();
|
||||
total.saturating_sub(1)
|
||||
} else {
|
||||
if !self.config.disable {
|
||||
self.semaphore.add_permits(new_limit.saturating_sub(total));
|
||||
}
|
||||
new_limit
|
||||
};
|
||||
let metric = &Metrics::get().semaphore_control_plane_limit;
|
||||
metric.set(RateLimit::Expected, new_limit as i64);
|
||||
metric.set(RateLimit::Actual, actual_limit as i64);
|
||||
self.limits.store(new_limit, Ordering::Release);
|
||||
#[cfg(test)]
|
||||
if let Some(n) = &self.notifier {
|
||||
n.notify_one();
|
||||
}
|
||||
}
|
||||
|
||||
/// The current state of the limiter.
|
||||
pub fn state(&self) -> LimiterState {
|
||||
let limit = self.limits.load(Ordering::Relaxed);
|
||||
let in_flight = self.in_flight.load(Ordering::Relaxed);
|
||||
LimiterState { limit, in_flight }
|
||||
}
|
||||
}
|
||||
|
||||
impl<'t> Token<'t> {
|
||||
fn new(permit: Option<SemaphorePermit<'t>>, in_flight: Arc<AtomicUsize>) -> Self {
|
||||
Self {
|
||||
permit,
|
||||
start: Instant::now(),
|
||||
in_flight,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn forget(&mut self) {
|
||||
if let Some(permit) = self.permit.take() {
|
||||
permit.forget();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for Token<'_> {
|
||||
fn drop(&mut self) {
|
||||
self.in_flight.fetch_sub(1, Ordering::AcqRel);
|
||||
}
|
||||
}
|
||||
|
||||
impl LimiterState {
|
||||
/// The current concurrency limit.
|
||||
pub fn limit(&self) -> usize {
|
||||
self.limit
|
||||
}
|
||||
/// The number of jobs in flight.
|
||||
pub fn in_flight(&self) -> usize {
|
||||
self.in_flight
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl reqwest_middleware::Middleware for Limiter {
|
||||
async fn handle(
|
||||
&self,
|
||||
req: reqwest::Request,
|
||||
extensions: &mut task_local_extensions::Extensions,
|
||||
next: reqwest_middleware::Next<'_>,
|
||||
) -> reqwest_middleware::Result<reqwest::Response> {
|
||||
let timer = Metrics::get()
|
||||
.proxy
|
||||
.control_plane_token_acquire_seconds
|
||||
.start_timer();
|
||||
let token = self
|
||||
.acquire_timeout(self.config.timeout)
|
||||
.await
|
||||
.ok_or_else(|| {
|
||||
reqwest_middleware::Error::Middleware(
|
||||
// TODO: Should we map it into user facing errors?
|
||||
crate::console::errors::ApiError::Console {
|
||||
status: crate::http::StatusCode::TOO_MANY_REQUESTS,
|
||||
text: "Too many requests".into(),
|
||||
}
|
||||
.into(),
|
||||
)
|
||||
})?;
|
||||
let duration = timer.observe();
|
||||
info!(
|
||||
?duration,
|
||||
"waiting for token to connect to the control plane"
|
||||
);
|
||||
|
||||
match next.run(req, extensions).await {
|
||||
Ok(response) => {
|
||||
self.release(token, Some(Outcome::from_reqwest_response(&response)))
|
||||
.await;
|
||||
Ok(response)
|
||||
}
|
||||
Err(e) => {
|
||||
self.release(token, Some(Outcome::from_reqwest_error(&e)))
|
||||
.await;
|
||||
Err(e)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::{hash::BuildHasherDefault, pin::pin, task::Context, time::Duration};
|
||||
use std::{hash::BuildHasherDefault, time::Duration};
|
||||
|
||||
use futures::{task::noop_waker_ref, Future};
|
||||
use rand::SeedableRng;
|
||||
use rustc_hash::FxHasher;
|
||||
use tokio::time;
|
||||
|
||||
use super::{BucketRateLimiter, EndpointRateLimiter, Limiter, Outcome};
|
||||
use crate::{
|
||||
rate_limiter::{RateBucketInfo, RateLimitAlgorithm},
|
||||
EndpointId,
|
||||
};
|
||||
|
||||
#[tokio::test]
|
||||
async fn it_works() {
|
||||
let config = super::RateLimiterConfig {
|
||||
algorithm: RateLimitAlgorithm::Fixed,
|
||||
timeout: Duration::from_secs(1),
|
||||
initial_limit: 10,
|
||||
disable: false,
|
||||
..Default::default()
|
||||
};
|
||||
let limiter = Limiter::new(config);
|
||||
|
||||
let token = limiter.try_acquire().unwrap();
|
||||
|
||||
limiter.release(token, Some(Outcome::Success)).await;
|
||||
|
||||
assert_eq!(limiter.state().limit(), 10);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn is_fair() {
|
||||
let config = super::RateLimiterConfig {
|
||||
algorithm: RateLimitAlgorithm::Fixed,
|
||||
timeout: Duration::from_secs(1),
|
||||
initial_limit: 1,
|
||||
disable: false,
|
||||
..Default::default()
|
||||
};
|
||||
let limiter = Limiter::new(config);
|
||||
|
||||
// === TOKEN 1 ===
|
||||
let token1 = limiter.try_acquire().unwrap();
|
||||
|
||||
let mut token2_fut = pin!(limiter.acquire_timeout(Duration::from_secs(1)));
|
||||
assert!(
|
||||
token2_fut
|
||||
.as_mut()
|
||||
.poll(&mut Context::from_waker(noop_waker_ref()))
|
||||
.is_pending(),
|
||||
"token is acquired by token1"
|
||||
);
|
||||
|
||||
let mut token3_fut = pin!(limiter.acquire_timeout(Duration::from_secs(1)));
|
||||
assert!(
|
||||
token3_fut
|
||||
.as_mut()
|
||||
.poll(&mut Context::from_waker(noop_waker_ref()))
|
||||
.is_pending(),
|
||||
"token is acquired by token1"
|
||||
);
|
||||
|
||||
limiter.release(token1, Some(Outcome::Success)).await;
|
||||
// === END TOKEN 1 ===
|
||||
|
||||
// === TOKEN 2 ===
|
||||
assert!(
|
||||
limiter.try_acquire().is_none(),
|
||||
"token is acquired by token2"
|
||||
);
|
||||
|
||||
assert!(
|
||||
token3_fut
|
||||
.as_mut()
|
||||
.poll(&mut Context::from_waker(noop_waker_ref()))
|
||||
.is_pending(),
|
||||
"token is acquired by token2"
|
||||
);
|
||||
|
||||
let token2 = token2_fut.await.unwrap();
|
||||
|
||||
limiter.release(token2, Some(Outcome::Success)).await;
|
||||
// === END TOKEN 2 ===
|
||||
|
||||
// === TOKEN 3 ===
|
||||
assert!(
|
||||
limiter.try_acquire().is_none(),
|
||||
"token is acquired by token3"
|
||||
);
|
||||
|
||||
let token3 = token3_fut.await.unwrap();
|
||||
limiter.release(token3, Some(Outcome::Success)).await;
|
||||
// === END TOKEN 3 ===
|
||||
|
||||
// === TOKEN 4 ===
|
||||
let token4 = limiter.try_acquire().unwrap();
|
||||
limiter.release(token4, Some(Outcome::Success)).await;
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn disable() {
|
||||
let config = super::RateLimiterConfig {
|
||||
algorithm: RateLimitAlgorithm::Fixed,
|
||||
timeout: Duration::from_secs(1),
|
||||
initial_limit: 1,
|
||||
disable: true,
|
||||
..Default::default()
|
||||
};
|
||||
let limiter = Limiter::new(config);
|
||||
|
||||
// === TOKEN 1 ===
|
||||
let token1 = limiter.try_acquire().unwrap();
|
||||
let token2 = limiter.try_acquire().unwrap();
|
||||
let state = limiter.state();
|
||||
assert_eq!(state.limit(), 1);
|
||||
assert_eq!(state.in_flight(), 2); // For disabled limiter, it's expected.
|
||||
limiter.release(token1, None).await;
|
||||
limiter.release(token2, None).await;
|
||||
}
|
||||
use super::{BucketRateLimiter, EndpointRateLimiter};
|
||||
use crate::{rate_limiter::RateBucketInfo, EndpointId};
|
||||
|
||||
#[test]
|
||||
fn rate_bucket_rpi() {
|
||||
@@ -783,31 +347,4 @@ mod tests {
|
||||
}
|
||||
assert!(limiter.map.len() < 150_000);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_default_auth_set() {
|
||||
// these values used to exceed u32::MAX
|
||||
assert_eq!(
|
||||
RateBucketInfo::DEFAULT_AUTH_SET,
|
||||
[
|
||||
RateBucketInfo {
|
||||
interval: Duration::from_secs(1),
|
||||
max_rpi: 300 * 4096,
|
||||
},
|
||||
RateBucketInfo {
|
||||
interval: Duration::from_secs(60),
|
||||
max_rpi: 200 * 4096 * 60,
|
||||
},
|
||||
RateBucketInfo {
|
||||
interval: Duration::from_secs(600),
|
||||
max_rpi: 100 * 4096 * 600,
|
||||
}
|
||||
]
|
||||
);
|
||||
|
||||
for x in RateBucketInfo::DEFAULT_AUTH_SET {
|
||||
let y = x.to_string().parse().unwrap();
|
||||
assert_eq!(x, y);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -77,10 +77,14 @@ impl ConnectionWithCredentialsProvider {
|
||||
}
|
||||
}
|
||||
|
||||
async fn ping(con: &mut MultiplexedConnection) -> RedisResult<()> {
|
||||
redis::cmd("PING").query_async(con).await
|
||||
}
|
||||
|
||||
pub async fn connect(&mut self) -> anyhow::Result<()> {
|
||||
let _guard = self.mutex.lock().await;
|
||||
if let Some(con) = self.con.as_mut() {
|
||||
match redis::cmd("PING").query_async(con).await {
|
||||
match Self::ping(con).await {
|
||||
Ok(()) => {
|
||||
return Ok(());
|
||||
}
|
||||
@@ -96,7 +100,7 @@ impl ConnectionWithCredentialsProvider {
|
||||
if let Some(f) = self.refresh_token_task.take() {
|
||||
f.abort()
|
||||
}
|
||||
let con = self
|
||||
let mut con = self
|
||||
.get_client()
|
||||
.await?
|
||||
.get_multiplexed_tokio_connection()
|
||||
@@ -109,6 +113,14 @@ impl ConnectionWithCredentialsProvider {
|
||||
});
|
||||
self.refresh_token_task = Some(f);
|
||||
}
|
||||
match Self::ping(&mut con).await {
|
||||
Ok(()) => {
|
||||
info!("Connection succesfully established");
|
||||
}
|
||||
Err(e) => {
|
||||
error!("Connection is broken. Error during PING: {e:?}");
|
||||
}
|
||||
}
|
||||
self.con = Some(con);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -172,6 +172,10 @@ async fn connection_handler(
|
||||
};
|
||||
|
||||
let peer_addr = peer.unwrap_or(peer_addr).ip();
|
||||
let has_private_peer_addr = match peer_addr {
|
||||
IpAddr::V4(ip) => ip.is_private(),
|
||||
_ => false,
|
||||
};
|
||||
info!(?session_id, %peer_addr, "accepted new TCP connection");
|
||||
|
||||
// try upgrade to TLS, but with a timeout.
|
||||
@@ -182,13 +186,17 @@ async fn connection_handler(
|
||||
}
|
||||
// The handshake failed
|
||||
Ok(Err(e)) => {
|
||||
Metrics::get().proxy.tls_handshake_failures.inc();
|
||||
if !has_private_peer_addr {
|
||||
Metrics::get().proxy.tls_handshake_failures.inc();
|
||||
}
|
||||
warn!(?session_id, %peer_addr, "failed to accept TLS connection: {e:?}");
|
||||
return;
|
||||
}
|
||||
// The handshake timed out
|
||||
Err(e) => {
|
||||
Metrics::get().proxy.tls_handshake_failures.inc();
|
||||
if !has_private_peer_addr {
|
||||
Metrics::get().proxy.tls_handshake_failures.inc();
|
||||
}
|
||||
warn!(?session_id, %peer_addr, "failed to accept TLS connection: {e:?}");
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -6,7 +6,7 @@ use tracing::{field::display, info};
|
||||
use crate::{
|
||||
auth::{backend::ComputeCredentials, check_peer_addr_is_in_list, AuthError},
|
||||
compute,
|
||||
config::ProxyConfig,
|
||||
config::{AuthenticationConfig, ProxyConfig},
|
||||
console::{
|
||||
errors::{GetAuthInfoError, WakeComputeError},
|
||||
CachedNodeInfo,
|
||||
@@ -27,6 +27,7 @@ impl PoolingBackend {
|
||||
pub async fn authenticate(
|
||||
&self,
|
||||
ctx: &mut RequestMonitoring,
|
||||
config: &AuthenticationConfig,
|
||||
conn_info: &ConnInfo,
|
||||
) -> Result<ComputeCredentials, AuthError> {
|
||||
let user_info = conn_info.user_info.clone();
|
||||
@@ -43,6 +44,7 @@ impl PoolingBackend {
|
||||
let secret = match cached_secret.value.clone() {
|
||||
Some(secret) => self.config.authentication_config.check_rate_limit(
|
||||
ctx,
|
||||
config,
|
||||
secret,
|
||||
&user_info.endpoint,
|
||||
true,
|
||||
|
||||
@@ -541,7 +541,9 @@ async fn handle_inner(
|
||||
.map_err(SqlOverHttpError::from);
|
||||
|
||||
let authenticate_and_connect = async {
|
||||
let keys = backend.authenticate(ctx, &conn_info).await?;
|
||||
let keys = backend
|
||||
.authenticate(ctx, &config.authentication_config, &conn_info)
|
||||
.await?;
|
||||
let client = backend
|
||||
.connect_to_compute(ctx, conn_info, keys, !allow_pool)
|
||||
.await?;
|
||||
|
||||
@@ -223,12 +223,20 @@ pub enum StreamUpgradeError {
|
||||
|
||||
impl<S: AsyncRead + AsyncWrite + Unpin> Stream<S> {
|
||||
/// If possible, upgrade raw stream into a secure TLS-based stream.
|
||||
pub async fn upgrade(self, cfg: Arc<ServerConfig>) -> Result<TlsStream<S>, StreamUpgradeError> {
|
||||
pub async fn upgrade(
|
||||
self,
|
||||
cfg: Arc<ServerConfig>,
|
||||
record_handshake_error: bool,
|
||||
) -> Result<TlsStream<S>, StreamUpgradeError> {
|
||||
match self {
|
||||
Stream::Raw { raw } => Ok(tokio_rustls::TlsAcceptor::from(cfg)
|
||||
.accept(raw)
|
||||
.await
|
||||
.inspect_err(|_| Metrics::get().proxy.tls_handshake_failures.inc())?),
|
||||
.inspect_err(|_| {
|
||||
if record_handshake_error {
|
||||
Metrics::get().proxy.tls_handshake_failures.inc()
|
||||
}
|
||||
})?),
|
||||
Stream::Tls { .. } => Err(StreamUpgradeError::AlreadyTls),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -495,7 +495,7 @@ mod tests {
|
||||
use url::Url;
|
||||
|
||||
use super::*;
|
||||
use crate::{http, rate_limiter::RateLimiterConfig, BranchId, EndpointId};
|
||||
use crate::{http, BranchId, EndpointId};
|
||||
|
||||
#[tokio::test]
|
||||
async fn metrics() {
|
||||
@@ -525,7 +525,7 @@ mod tests {
|
||||
tokio::spawn(server);
|
||||
|
||||
let metrics = Metrics::default();
|
||||
let client = http::new_client(RateLimiterConfig::default());
|
||||
let client = http::new_client();
|
||||
let endpoint = Url::parse(&format!("http://{addr}")).unwrap();
|
||||
let now = Utc::now();
|
||||
|
||||
|
||||
@@ -84,6 +84,20 @@ impl std::ops::Add for AffinityScore {
|
||||
}
|
||||
}
|
||||
|
||||
/// Hint for whether this is a sincere attempt to schedule, or a speculative
|
||||
/// check for where we _would_ schedule (done during optimization)
|
||||
#[derive(Debug)]
|
||||
pub(crate) enum ScheduleMode {
|
||||
Normal,
|
||||
Speculative,
|
||||
}
|
||||
|
||||
impl Default for ScheduleMode {
|
||||
fn default() -> Self {
|
||||
Self::Normal
|
||||
}
|
||||
}
|
||||
|
||||
// For carrying state between multiple calls to [`TenantShard::schedule`], e.g. when calling
|
||||
// it for many shards in the same tenant.
|
||||
#[derive(Debug, Default)]
|
||||
@@ -93,6 +107,8 @@ pub(crate) struct ScheduleContext {
|
||||
|
||||
/// Specifically how many _attached_ locations are on each node
|
||||
pub(crate) attached_nodes: HashMap<NodeId, usize>,
|
||||
|
||||
pub(crate) mode: ScheduleMode,
|
||||
}
|
||||
|
||||
impl ScheduleContext {
|
||||
@@ -329,27 +345,34 @@ impl Scheduler {
|
||||
scores.sort_by_key(|i| (i.1, i.2, i.0));
|
||||
|
||||
if scores.is_empty() {
|
||||
// After applying constraints, no pageservers were left. We log some detail about
|
||||
// the state of nodes to help understand why this happened. This is not logged as an error because
|
||||
// it is legitimately possible for enough nodes to be Offline to prevent scheduling a shard.
|
||||
tracing::info!("Scheduling failure, while excluding {hard_exclude:?}, node states:");
|
||||
for (node_id, node) in &self.nodes {
|
||||
// After applying constraints, no pageservers were left.
|
||||
if !matches!(context.mode, ScheduleMode::Speculative) {
|
||||
// If this was not a speculative attempt, log details to understand why we couldn't
|
||||
// schedule: this may help an engineer understand if some nodes are marked offline
|
||||
// in a way that's preventing progress.
|
||||
tracing::info!(
|
||||
"Node {node_id}: may_schedule={} shards={}",
|
||||
node.may_schedule != MaySchedule::No,
|
||||
node.shard_count
|
||||
"Scheduling failure, while excluding {hard_exclude:?}, node states:"
|
||||
);
|
||||
for (node_id, node) in &self.nodes {
|
||||
tracing::info!(
|
||||
"Node {node_id}: may_schedule={} shards={}",
|
||||
node.may_schedule != MaySchedule::No,
|
||||
node.shard_count
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
return Err(ScheduleError::ImpossibleConstraint);
|
||||
}
|
||||
|
||||
// Lowest score wins
|
||||
let node_id = scores.first().unwrap().0;
|
||||
tracing::info!(
|
||||
|
||||
if !matches!(context.mode, ScheduleMode::Speculative) {
|
||||
tracing::info!(
|
||||
"scheduler selected node {node_id} (elegible nodes {:?}, hard exclude: {hard_exclude:?}, soft exclude: {context:?})",
|
||||
scores.iter().map(|i| i.0 .0).collect::<Vec<_>>()
|
||||
);
|
||||
}
|
||||
|
||||
// Note that we do not update shard count here to reflect the scheduling: that
|
||||
// is IntentState's job when the scheduled location is used.
|
||||
|
||||
@@ -11,7 +11,7 @@ use crate::{
|
||||
id_lock_map::IdLockMap,
|
||||
persistence::{AbortShardSplitStatus, TenantFilter},
|
||||
reconciler::ReconcileError,
|
||||
scheduler::ScheduleContext,
|
||||
scheduler::{ScheduleContext, ScheduleMode},
|
||||
};
|
||||
use anyhow::Context;
|
||||
use control_plane::storage_controller::{
|
||||
@@ -2744,7 +2744,7 @@ impl Service {
|
||||
let mut describe_shards = Vec::new();
|
||||
|
||||
for shard in shards {
|
||||
if shard.tenant_shard_id.is_zero() {
|
||||
if shard.tenant_shard_id.is_shard_zero() {
|
||||
shard_zero = Some(shard);
|
||||
}
|
||||
|
||||
@@ -4084,7 +4084,7 @@ impl Service {
|
||||
|
||||
let mut reconciles_spawned = 0;
|
||||
for (tenant_shard_id, shard) in tenants.iter_mut() {
|
||||
if tenant_shard_id.is_zero() {
|
||||
if tenant_shard_id.is_shard_zero() {
|
||||
schedule_context = ScheduleContext::default();
|
||||
}
|
||||
|
||||
@@ -4134,9 +4134,10 @@ impl Service {
|
||||
let mut work = Vec::new();
|
||||
|
||||
for (tenant_shard_id, shard) in tenants.iter() {
|
||||
if tenant_shard_id.is_zero() {
|
||||
if tenant_shard_id.is_shard_zero() {
|
||||
// Reset accumulators on the first shard in a tenant
|
||||
schedule_context = ScheduleContext::default();
|
||||
schedule_context.mode = ScheduleMode::Speculative;
|
||||
tenant_shards.clear();
|
||||
}
|
||||
|
||||
|
||||
@@ -2449,10 +2449,12 @@ class NeonPageserver(PgProtocol):
|
||||
if cur_line_no < skip_until_line_no:
|
||||
cur_line_no += 1
|
||||
continue
|
||||
if contains_re.search(line):
|
||||
elif contains_re.search(line):
|
||||
# found it!
|
||||
cur_line_no += 1
|
||||
return (line, LogCursor(cur_line_no))
|
||||
else:
|
||||
cur_line_no += 1
|
||||
return None
|
||||
|
||||
def tenant_attach(
|
||||
|
||||
35
test_runner/regress/test_pageserver_config.py
Normal file
35
test_runner/regress/test_pageserver_config.py
Normal file
@@ -0,0 +1,35 @@
|
||||
import pytest
|
||||
from fixtures.neon_fixtures import (
|
||||
NeonEnvBuilder,
|
||||
last_flush_lsn_upload,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("kind", ["sync", "async"])
|
||||
def test_walredo_process_kind_config(neon_env_builder: NeonEnvBuilder, kind: str):
|
||||
neon_env_builder.pageserver_config_override = f"walredo_process_kind = '{kind}'"
|
||||
# ensure it starts
|
||||
env = neon_env_builder.init_start()
|
||||
# ensure the metric is set
|
||||
ps_http = env.pageserver.http_client()
|
||||
metrics = ps_http.get_metrics()
|
||||
samples = metrics.query_all("pageserver_wal_redo_process_kind")
|
||||
assert [(s.labels, s.value) for s in samples] == [({"kind": kind}, 1)]
|
||||
# ensure default tenant's config kind matches
|
||||
# => write some data to force-spawn walredo
|
||||
ep = env.endpoints.create_start("main")
|
||||
with ep.connect() as conn:
|
||||
with conn.cursor() as cur:
|
||||
cur.execute("create table foo(bar text)")
|
||||
cur.execute("insert into foo select from generate_series(1, 100)")
|
||||
last_flush_lsn_upload(env, ep, env.initial_tenant, env.initial_timeline)
|
||||
ep.stop()
|
||||
ep.start()
|
||||
with ep.connect() as conn:
|
||||
with conn.cursor() as cur:
|
||||
cur.execute("select count(*) from foo")
|
||||
[(count,)] = cur.fetchall()
|
||||
assert count == 100
|
||||
|
||||
status = ps_http.tenant_status(env.initial_tenant)
|
||||
assert status["walredo"]["process"]["kind"] == kind
|
||||
@@ -469,7 +469,8 @@ def test_tenant_delete_concurrent(
|
||||
):
|
||||
"""
|
||||
Validate that concurrent delete requests to the same tenant behave correctly:
|
||||
exactly one should succeed.
|
||||
exactly one should execute: the rest should give 202 responses but not start
|
||||
another deletion.
|
||||
|
||||
This is a reproducer for https://github.com/neondatabase/neon/issues/5936
|
||||
"""
|
||||
@@ -484,14 +485,10 @@ def test_tenant_delete_concurrent(
|
||||
run_pg_bench_small(pg_bin, endpoint.connstr())
|
||||
last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id)
|
||||
|
||||
CONFLICT_MESSAGE = "Precondition failed: Invalid state Stopping. Expected Active or Broken"
|
||||
|
||||
env.pageserver.allowed_errors.extend(
|
||||
[
|
||||
# lucky race with stopping from flushing a layer we fail to schedule any uploads
|
||||
".*layer flush task.+: could not flush frozen layer: update_metadata_file",
|
||||
# Errors logged from our 4xx requests
|
||||
f".*{CONFLICT_MESSAGE}.*",
|
||||
]
|
||||
)
|
||||
|
||||
@@ -507,7 +504,7 @@ def test_tenant_delete_concurrent(
|
||||
return ps_http.tenant_delete(tenant_id)
|
||||
|
||||
def hit_remove_failpoint():
|
||||
env.pageserver.assert_log_contains(f"at failpoint {BEFORE_REMOVE_FAILPOINT}")
|
||||
return env.pageserver.assert_log_contains(f"at failpoint {BEFORE_REMOVE_FAILPOINT}")[1]
|
||||
|
||||
def hit_run_failpoint():
|
||||
env.pageserver.assert_log_contains(f"at failpoint {BEFORE_RUN_FAILPOINT}")
|
||||
@@ -518,11 +515,14 @@ def test_tenant_delete_concurrent(
|
||||
|
||||
# Wait until the first request completes its work and is blocked on removing
|
||||
# the TenantSlot from tenant manager.
|
||||
wait_until(100, 0.1, hit_remove_failpoint)
|
||||
log_cursor = wait_until(100, 0.1, hit_remove_failpoint)
|
||||
assert log_cursor is not None
|
||||
|
||||
# Start another request: this should fail when it sees a tenant in Stopping state
|
||||
with pytest.raises(PageserverApiException, match=CONFLICT_MESSAGE):
|
||||
ps_http.tenant_delete(tenant_id)
|
||||
# Start another request: this should succeed without actually entering the deletion code
|
||||
ps_http.tenant_delete(tenant_id)
|
||||
assert not env.pageserver.log_contains(
|
||||
f"at failpoint {BEFORE_RUN_FAILPOINT}", offset=log_cursor
|
||||
)
|
||||
|
||||
# Start another background request, which will pause after acquiring a TenantSlotGuard
|
||||
# but before completing.
|
||||
@@ -539,8 +539,10 @@ def test_tenant_delete_concurrent(
|
||||
|
||||
# Permit the duplicate background request to run to completion and fail.
|
||||
ps_http.configure_failpoints((BEFORE_RUN_FAILPOINT, "off"))
|
||||
with pytest.raises(PageserverApiException, match=CONFLICT_MESSAGE):
|
||||
background_4xx_req.result(timeout=10)
|
||||
background_4xx_req.result(timeout=10)
|
||||
assert not env.pageserver.log_contains(
|
||||
f"at failpoint {BEFORE_RUN_FAILPOINT}", offset=log_cursor
|
||||
)
|
||||
|
||||
# Physical deletion should have happened
|
||||
assert_prefix_empty(
|
||||
|
||||
Reference in New Issue
Block a user