Compare commits

..

1 Commits

Author SHA1 Message Date
dependabot[bot]
03d6819084 build(deps): bump cryptography from 41.0.4 to 41.0.6
Bumps [cryptography](https://github.com/pyca/cryptography) from 41.0.4 to 41.0.6.
- [Changelog](https://github.com/pyca/cryptography/blob/main/CHANGELOG.rst)
- [Commits](https://github.com/pyca/cryptography/compare/41.0.4...41.0.6)

---
updated-dependencies:
- dependency-name: cryptography
  dependency-type: indirect
...

Signed-off-by: dependabot[bot] <support@github.com>
2023-11-29 00:08:20 +00:00
135 changed files with 2669 additions and 5333 deletions

522
Cargo.lock generated

File diff suppressed because it is too large Load Diff

View File

@@ -45,11 +45,12 @@ azure_storage_blobs = "0.16"
flate2 = "1.0.26"
async-stream = "0.3"
async-trait = "0.1"
aws-config = { version = "1.0", default-features = false, features=["rustls"] }
aws-sdk-s3 = "1.0"
aws-smithy-async = { version = "1.0", default-features = false, features=["rt-tokio"] }
aws-smithy-types = "1.0"
aws-credential-types = "1.0"
aws-config = { version = "0.56", default-features = false, features=["rustls"] }
aws-sdk-s3 = "0.29"
aws-smithy-http = "0.56"
aws-smithy-async = { version = "0.56", default-features = false, features=["rt-tokio"] }
aws-credential-types = "0.56"
aws-types = "0.56"
axum = { version = "0.6.20", features = ["ws"] }
base64 = "0.13.0"
bincode = "1.3"
@@ -88,7 +89,6 @@ humantime-serde = "1.1.1"
hyper = "0.14"
hyper-tungstenite = "0.11"
inotify = "0.10.2"
ipnet = "2.9.0"
itertools = "0.10"
jsonwebtoken = "8"
libc = "0.2"
@@ -122,17 +122,14 @@ rustls-pemfile = "1"
rustls-split = "0.3"
scopeguard = "1.1"
sysinfo = "0.29.2"
sd-notify = "0.4.1"
sentry = { version = "0.31", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] }
serde = { version = "1.0", features = ["derive"] }
serde_json = "1"
serde_path_to_error = "0.1"
serde_with = "2.0"
serde_assert = "0.5.0"
sha2 = "0.10.2"
signal-hook = "0.3"
smallvec = "1.11"
smol_str = { version = "0.2.0", features = ["serde"] }
socket2 = "0.5"
strum = "0.24"
strum_macros = "0.24"

View File

@@ -393,9 +393,7 @@ RUN case "${PG_VERSION}" in \
export TIMESCALEDB_CHECKSUM=6fca72a6ed0f6d32d2b3523951ede73dc5f9b0077b38450a029a5f411fdb8c73 \
;; \
*) \
export TIMESCALEDB_VERSION=2.13.0 \
export TIMESCALEDB_CHECKSUM=584a351c7775f0e067eaa0e7277ea88cab9077cc4c455cbbf09a5d9723dce95d \
;; \
echo "TimescaleDB not supported on this PostgreSQL version. See https://github.com/timescale/timescaledb/issues/5752" && exit 0;; \
esac && \
apt-get update && \
apt-get install -y cmake && \
@@ -731,7 +729,8 @@ RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar.
echo "b516653575541cf221b99cf3f8be9b6821f6dbcfc125675c85f35090f824f00e wal2json_2_5.tar.gz" | sha256sum --check && \
mkdir wal2json-src && cd wal2json-src && tar xvzf ../wal2json_2_5.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install
make -j $(getconf _NPROCESSORS_ONLN) install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/wal2json.control
#########################################################################################
#

View File

@@ -149,9 +149,6 @@ tenant 9ef87a5bf0d92544f6fafeeb3239695c successfully created on the pageserver
Created an initial timeline 'de200bd42b49cc1814412c7e592dd6e9' at Lsn 0/16B5A50 for tenant: 9ef87a5bf0d92544f6fafeeb3239695c
Setting tenant 9ef87a5bf0d92544f6fafeeb3239695c as a default one
# create postgres compute node
> cargo neon endpoint create main
# start postgres compute node
> cargo neon endpoint start main
Starting new endpoint main (PostgreSQL v14) on timeline de200bd42b49cc1814412c7e592dd6e9 ...
@@ -188,11 +185,8 @@ Created timeline 'b3b863fa45fa9e57e615f9f2d944e601' at Lsn 0/16F9A00 for tenant:
(L) main [de200bd42b49cc1814412c7e592dd6e9]
(L) ┗━ @0/16F9A00: migration_check [b3b863fa45fa9e57e615f9f2d944e601]
# create postgres on that branch
> cargo neon endpoint create migration_check --branch-name migration_check
# start postgres on that branch
> cargo neon endpoint start migration_check
> cargo neon endpoint start migration_check --branch-name migration_check
Starting new endpoint migration_check (PostgreSQL v14) on timeline b3b863fa45fa9e57e615f9f2d944e601 ...
Starting postgres at 'postgresql://cloud_admin@127.0.0.1:55434/postgres'

View File

@@ -274,13 +274,7 @@ fn main() -> Result<()> {
let mut state = compute.state.lock().unwrap();
state.error = Some(format!("{:?}", err));
state.status = ComputeStatus::Failed;
// Notify others that Postgres failed to start. In case of configuring the
// empty compute, it's likely that API handler is still waiting for compute
// state change. With this we will notify it that compute is in Failed state,
// so control plane will know about it earlier and record proper error instead
// of timeout.
compute.state_changed.notify_all();
drop(state); // unlock
drop(state);
delay_exit = true;
None
}

View File

@@ -22,7 +22,7 @@ use utils::id::{TenantId, TimelineId};
use utils::lsn::Lsn;
use compute_api::responses::{ComputeMetrics, ComputeStatus};
use compute_api::spec::{ComputeFeature, ComputeMode, ComputeSpec};
use compute_api::spec::{ComputeMode, ComputeSpec};
use utils::measured_stream::MeasuredReader;
use remote_storage::{DownloadError, RemotePath};
@@ -277,17 +277,6 @@ fn create_neon_superuser(spec: &ComputeSpec, client: &mut Client) -> Result<()>
}
impl ComputeNode {
/// Check that compute node has corresponding feature enabled.
pub fn has_feature(&self, feature: ComputeFeature) -> bool {
let state = self.state.lock().unwrap();
if let Some(s) = state.pspec.as_ref() {
s.spec.features.contains(&feature)
} else {
false
}
}
pub fn set_status(&self, status: ComputeStatus) {
let mut state = self.state.lock().unwrap();
state.status = status;
@@ -739,12 +728,7 @@ impl ComputeNode {
// Write new config
let pgdata_path = Path::new(&self.pgdata);
let postgresql_conf_path = pgdata_path.join("postgresql.conf");
config::write_postgres_conf(&postgresql_conf_path, &spec, None)?;
// temporarily reset max_cluster_size in config
// to avoid the possibility of hitting the limit, while we are reconfiguring:
// creating new extensions, roles, etc...
config::compute_ctl_temp_override_create(pgdata_path, "neon.max_cluster_size=-1")?;
config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &spec, None)?;
self.pg_reload_conf()?;
let mut client = Client::connect(self.connstr.as_str(), NoTls)?;
@@ -765,10 +749,6 @@ impl ComputeNode {
// 'Close' connection
drop(client);
// reset max_cluster_size in config back to original value and reload config
config::compute_ctl_temp_override_remove(pgdata_path)?;
self.pg_reload_conf()?;
let unknown_op = "unknown".to_string();
let op_id = spec.operation_uuid.as_ref().unwrap_or(&unknown_op);
info!(
@@ -829,17 +809,7 @@ impl ComputeNode {
let config_time = Utc::now();
if pspec.spec.mode == ComputeMode::Primary && !pspec.spec.skip_pg_catalog_updates {
let pgdata_path = Path::new(&self.pgdata);
// temporarily reset max_cluster_size in config
// to avoid the possibility of hitting the limit, while we are applying config:
// creating new extensions, roles, etc...
config::compute_ctl_temp_override_create(pgdata_path, "neon.max_cluster_size=-1")?;
self.pg_reload_conf()?;
self.apply_config(&compute_state)?;
config::compute_ctl_temp_override_remove(pgdata_path)?;
self.pg_reload_conf()?;
}
let startup_end_time = Utc::now();

View File

@@ -93,25 +93,5 @@ pub fn write_postgres_conf(
writeln!(file, "neon.extension_server_port={}", port)?;
}
// This is essential to keep this line at the end of the file,
// because it is intended to override any settings above.
writeln!(file, "include_if_exists = 'compute_ctl_temp_override.conf'")?;
Ok(())
}
/// create file compute_ctl_temp_override.conf in pgdata_dir
/// add provided options to this file
pub fn compute_ctl_temp_override_create(pgdata_path: &Path, options: &str) -> Result<()> {
let path = pgdata_path.join("compute_ctl_temp_override.conf");
let mut file = File::create(path)?;
write!(file, "{}", options)?;
Ok(())
}
/// remove file compute_ctl_temp_override.conf in pgdata_dir
pub fn compute_ctl_temp_override_remove(pgdata_path: &Path) -> Result<()> {
let path = pgdata_path.join("compute_ctl_temp_override.conf");
std::fs::remove_file(path)?;
Ok(())
}

View File

@@ -227,7 +227,7 @@ async fn handle_configure_request(
let parsed_spec = match ParsedSpec::try_from(spec) {
Ok(ps) => ps,
Err(msg) => return Err((msg, StatusCode::BAD_REQUEST)),
Err(msg) => return Err((msg, StatusCode::PRECONDITION_FAILED)),
};
// XXX: wrap state update under lock in code blocks. Otherwise,

View File

@@ -156,17 +156,17 @@ paths:
description: Error text or 'OK' if download succeeded.
example: "OK"
400:
description: Request is invalid.
content:
application/json:
schema:
$ref: "#/components/schemas/GenericError"
description: Request is invalid.
content:
application/json:
schema:
$ref: "#/components/schemas/GenericError"
500:
description: Extension download request failed.
content:
application/json:
schema:
$ref: "#/components/schemas/GenericError"
description: Extension download request failed.
content:
application/json:
schema:
$ref: "#/components/schemas/GenericError"
components:
securitySchemes:

View File

@@ -118,6 +118,19 @@ pub fn get_spec_from_control_plane(
spec
}
/// It takes cluster specification and does the following:
/// - Serialize cluster config and put it into `postgresql.conf` completely rewriting the file.
/// - Update `pg_hba.conf` to allow external connections.
pub fn handle_configuration(spec: &ComputeSpec, pgdata_path: &Path) -> Result<()> {
// File `postgresql.conf` is no longer included into `basebackup`, so just
// always write all config into it creating new file.
config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), spec, None)?;
update_pg_hba(pgdata_path)?;
Ok(())
}
/// Check `pg_hba.conf` and update if needed to allow external connections.
pub fn update_pg_hba(pgdata_path: &Path) -> Result<()> {
// XXX: consider making it a part of spec.json

View File

@@ -9,7 +9,6 @@ use clap::Parser;
use hex::FromHex;
use hyper::StatusCode;
use hyper::{Body, Request, Response};
use pageserver_api::shard::TenantShardId;
use serde::{Deserialize, Serialize};
use std::path::{Path, PathBuf};
use std::{collections::HashMap, sync::Arc};
@@ -174,8 +173,7 @@ async fn handle_re_attach(mut req: Request<Body>) -> Result<Response<Body>, ApiE
if state.pageserver == Some(reattach_req.node_id) {
state.generation += 1;
response.tenants.push(ReAttachResponseTenant {
// TODO(sharding): make this shard-aware
id: TenantShardId::unsharded(*t),
id: *t,
gen: state.generation,
});
}
@@ -198,8 +196,7 @@ async fn handle_validate(mut req: Request<Body>) -> Result<Response<Body>, ApiEr
};
for req_tenant in validate_req.tenants {
// TODO(sharding): make this shard-aware
if let Some(tenant_state) = locked.tenants.get(&req_tenant.id.tenant_id) {
if let Some(tenant_state) = locked.tenants.get(&req_tenant.id) {
let valid = tenant_state.generation == req_tenant.gen;
response.tenants.push(ValidateResponseTenant {
id: req_tenant.id,

View File

@@ -415,7 +415,6 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an
None,
None,
Some(pg_version),
None,
)?;
let new_timeline_id = timeline_info.timeline_id;
let last_record_lsn = timeline_info.last_record_lsn;
@@ -496,7 +495,6 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -
None,
None,
Some(pg_version),
None,
)?;
let new_timeline_id = timeline_info.timeline_id;
@@ -584,7 +582,6 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -
start_lsn,
Some(ancestor_timeline_id),
None,
None,
)?;
let new_timeline_id = timeline_info.timeline_id;
@@ -611,9 +608,11 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
};
let mut cplane = ComputeControlPlane::load(env.clone())?;
// All subcommands take an optional --tenant-id option
let tenant_id = get_tenant_id(sub_args, env)?;
match sub_name {
"list" => {
let tenant_id = get_tenant_id(sub_args, env)?;
let timeline_infos = get_timeline_infos(env, &tenant_id).unwrap_or_else(|e| {
eprintln!("Failed to load timeline info: {}", e);
HashMap::new()
@@ -673,7 +672,6 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
println!("{table}");
}
"create" => {
let tenant_id = get_tenant_id(sub_args, env)?;
let branch_name = sub_args
.get_one::<String>("branch-name")
.map(|s| s.as_str())
@@ -718,18 +716,6 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
(Some(_), true) => anyhow::bail!("cannot specify both lsn and hot-standby"),
};
match (mode, hot_standby) {
(ComputeMode::Static(_), true) => {
bail!("Cannot start a node in hot standby mode when it is already configured as a static replica")
}
(ComputeMode::Primary, true) => {
bail!("Cannot start a node as a hot standby replica, it is already configured as primary node")
}
_ => {}
}
cplane.check_conflicting_endpoints(mode, tenant_id, timeline_id)?;
cplane.new_endpoint(
&endpoint_id,
tenant_id,
@@ -742,6 +728,8 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
)?;
}
"start" => {
let pg_port: Option<u16> = sub_args.get_one::<u16>("pg-port").copied();
let http_port: Option<u16> = sub_args.get_one::<u16>("http-port").copied();
let endpoint_id = sub_args
.get_one::<String>("endpoint_id")
.ok_or_else(|| anyhow!("No endpoint ID was provided to start"))?;
@@ -770,28 +758,80 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
env.safekeepers.iter().map(|sk| sk.id).collect()
};
let endpoint = cplane
.endpoints
.get(endpoint_id.as_str())
.ok_or_else(|| anyhow::anyhow!("endpoint {endpoint_id} not found"))?;
cplane.check_conflicting_endpoints(
endpoint.mode,
endpoint.tenant_id,
endpoint.timeline_id,
)?;
let endpoint = cplane.endpoints.get(endpoint_id.as_str());
let ps_conf = env.get_pageserver_conf(pageserver_id)?;
let auth_token = if matches!(ps_conf.pg_auth_type, AuthType::NeonJWT) {
let claims = Claims::new(Some(endpoint.tenant_id), Scope::Tenant);
let claims = Claims::new(Some(tenant_id), Scope::Tenant);
Some(env.generate_auth_token(&claims)?)
} else {
None
};
println!("Starting existing endpoint {endpoint_id}...");
endpoint.start(&auth_token, safekeepers, remote_ext_config)?;
let hot_standby = sub_args
.get_one::<bool>("hot-standby")
.copied()
.unwrap_or(false);
if let Some(endpoint) = endpoint {
match (&endpoint.mode, hot_standby) {
(ComputeMode::Static(_), true) => {
bail!("Cannot start a node in hot standby mode when it is already configured as a static replica")
}
(ComputeMode::Primary, true) => {
bail!("Cannot start a node as a hot standby replica, it is already configured as primary node")
}
_ => {}
}
println!("Starting existing endpoint {endpoint_id}...");
endpoint.start(&auth_token, safekeepers, remote_ext_config)?;
} else {
let branch_name = sub_args
.get_one::<String>("branch-name")
.map(|s| s.as_str())
.unwrap_or(DEFAULT_BRANCH_NAME);
let timeline_id = env
.get_branch_timeline_id(branch_name, tenant_id)
.ok_or_else(|| {
anyhow!("Found no timeline id for branch name '{branch_name}'")
})?;
let lsn = sub_args
.get_one::<String>("lsn")
.map(|lsn_str| Lsn::from_str(lsn_str))
.transpose()
.context("Failed to parse Lsn from the request")?;
let pg_version = sub_args
.get_one::<u32>("pg-version")
.copied()
.context("Failed to `pg-version` from the argument string")?;
let mode = match (lsn, hot_standby) {
(Some(lsn), false) => ComputeMode::Static(lsn),
(None, true) => ComputeMode::Replica,
(None, false) => ComputeMode::Primary,
(Some(_), true) => anyhow::bail!("cannot specify both lsn and hot-standby"),
};
// when used with custom port this results in non obvious behaviour
// port is remembered from first start command, i e
// start --port X
// stop
// start <-- will also use port X even without explicit port argument
println!("Starting new endpoint {endpoint_id} (PostgreSQL v{pg_version}) on timeline {timeline_id} ...");
let ep = cplane.new_endpoint(
endpoint_id,
tenant_id,
timeline_id,
pg_port,
http_port,
pg_version,
mode,
pageserver_id,
)?;
ep.start(&auth_token, safekeepers, remote_ext_config)?;
}
}
"reconfigure" => {
let endpoint_id = sub_args
@@ -1397,7 +1437,15 @@ fn cli() -> Command {
.subcommand(Command::new("start")
.about("Start postgres.\n If the endpoint doesn't exist yet, it is created.")
.arg(endpoint_id_arg.clone())
.arg(tenant_id_arg.clone())
.arg(branch_name_arg.clone())
.arg(timeline_id_arg.clone())
.arg(lsn_arg)
.arg(pg_port_arg)
.arg(http_port_arg)
.arg(endpoint_pageserver_id_arg.clone())
.arg(pg_version_arg)
.arg(hot_standby_arg)
.arg(safekeepers_arg)
.arg(remote_ext_config_args)
)
@@ -1410,6 +1458,7 @@ fn cli() -> Command {
.subcommand(
Command::new("stop")
.arg(endpoint_id_arg)
.arg(tenant_id_arg.clone())
.arg(
Arg::new("destroy")
.help("Also delete data directory (now optional, should be default in future)")

View File

@@ -125,7 +125,6 @@ impl ComputeControlPlane {
let http_port = http_port.unwrap_or_else(|| self.get_port() + 1);
let pageserver =
PageServerNode::from_env(&self.env, self.env.get_pageserver_conf(pageserver_id)?);
let ep = Arc::new(Endpoint {
endpoint_id: endpoint_id.to_owned(),
pg_address: SocketAddr::new("127.0.0.1".parse().unwrap(), pg_port),
@@ -170,30 +169,6 @@ impl ComputeControlPlane {
Ok(ep)
}
pub fn check_conflicting_endpoints(
&self,
mode: ComputeMode,
tenant_id: TenantId,
timeline_id: TimelineId,
) -> Result<()> {
if matches!(mode, ComputeMode::Primary) {
// this check is not complete, as you could have a concurrent attempt at
// creating another primary, both reading the state before checking it here,
// but it's better than nothing.
let mut duplicates = self.endpoints.iter().filter(|(_k, v)| {
v.tenant_id == tenant_id
&& v.timeline_id == timeline_id
&& v.mode == mode
&& v.status() != "stopped"
});
if let Some((key, _)) = duplicates.next() {
bail!("attempting to create a duplicate primary endpoint on tenant {tenant_id}, timeline {timeline_id}: endpoint {key:?} exists already. please don't do this, it is not supported.");
}
}
Ok(())
}
}
///////////////////////////////////////////////////////////////////////////////
@@ -519,7 +494,6 @@ impl Endpoint {
skip_pg_catalog_updates: self.skip_pg_catalog_updates,
format_version: 1.0,
operation_uuid: None,
features: vec![],
cluster: Cluster {
cluster_id: None, // project ID: not used
name: None, // project name: not used

View File

@@ -11,7 +11,6 @@ use std::io::{BufReader, Write};
use std::num::NonZeroU64;
use std::path::PathBuf;
use std::process::{Child, Command};
use std::time::Duration;
use std::{io, result};
use anyhow::{bail, Context};
@@ -523,24 +522,19 @@ impl PageServerNode {
&self,
tenant_id: TenantId,
config: LocationConfig,
flush_ms: Option<Duration>,
) -> anyhow::Result<()> {
let req_body = TenantLocationConfigRequest { tenant_id, config };
let path = format!(
"{}/tenant/{}/location_config",
self.http_base_url, tenant_id
);
let path = if let Some(flush_ms) = flush_ms {
format!("{}?flush_ms={}", path, flush_ms.as_millis())
} else {
path
};
self.http_request(Method::PUT, path)?
.json(&req_body)
.send()?
.error_from_body()?;
self.http_request(
Method::PUT,
format!(
"{}/tenant/{}/location_config",
self.http_base_url, tenant_id
),
)?
.json(&req_body)
.send()?
.error_from_body()?;
Ok(())
}
@@ -565,7 +559,6 @@ impl PageServerNode {
ancestor_start_lsn: Option<Lsn>,
ancestor_timeline_id: Option<TimelineId>,
pg_version: Option<u32>,
existing_initdb_timeline_id: Option<TimelineId>,
) -> anyhow::Result<TimelineInfo> {
// If timeline ID was not specified, generate one
let new_timeline_id = new_timeline_id.unwrap_or(TimelineId::generate());
@@ -579,7 +572,6 @@ impl PageServerNode {
ancestor_start_lsn,
ancestor_timeline_id,
pg_version,
existing_initdb_timeline_id,
})
.send()?
.error_from_body()?

View File

@@ -117,7 +117,7 @@ pub fn migrate_tenant(
println!("🔁 Already attached to {origin_ps_id}, freshening...");
let gen = attachment_service.attach_hook(tenant_id, dest_ps.conf.id)?;
let dest_conf = build_location_config(LocationConfigMode::AttachedSingle, gen, None);
dest_ps.location_config(tenant_id, dest_conf, None)?;
dest_ps.location_config(tenant_id, dest_conf)?;
println!("✅ Migration complete");
return Ok(());
}
@@ -126,7 +126,7 @@ pub fn migrate_tenant(
let stale_conf =
build_location_config(LocationConfigMode::AttachedStale, Some(*generation), None);
origin_ps.location_config(tenant_id, stale_conf, Some(Duration::from_secs(10)))?;
origin_ps.location_config(tenant_id, stale_conf)?;
baseline_lsns = Some(get_lsns(tenant_id, &origin_ps)?);
}
@@ -135,7 +135,7 @@ pub fn migrate_tenant(
let dest_conf = build_location_config(LocationConfigMode::AttachedMulti, gen, None);
println!("🔁 Attaching to pageserver {}", dest_ps.conf.id);
dest_ps.location_config(tenant_id, dest_conf, None)?;
dest_ps.location_config(tenant_id, dest_conf)?;
if let Some(baseline) = baseline_lsns {
println!("🕑 Waiting for LSN to catch up...");
@@ -181,7 +181,7 @@ pub fn migrate_tenant(
"💤 Switching to secondary mode on pageserver {}",
other_ps.conf.id
);
other_ps.location_config(tenant_id, secondary_conf, None)?;
other_ps.location_config(tenant_id, secondary_conf)?;
}
println!(
@@ -189,7 +189,7 @@ pub fn migrate_tenant(
dest_ps.conf.id
);
let dest_conf = build_location_config(LocationConfigMode::AttachedSingle, gen, None);
dest_ps.location_config(tenant_id, dest_conf, None)?;
dest_ps.location_config(tenant_id, dest_conf)?;
println!("✅ Migration complete");

View File

@@ -1,205 +0,0 @@
# Name
Created on: 2023-09-08
Author: Arpad Müller
## Summary
Enable the pageserver to recover from data corruption events by implementing
a feature to re-apply historic WAL records in parallel to the already occurring
WAL replay.
The feature is outside of the user-visible backup and history story, and only
serves as a second-level backup for the case that there is a bug in the
pageservers that corrupted the served pages.
The RFC proposes the addition of two new features:
* recover a broken branch from WAL (downtime is allowed)
* a test recovery system to recover random branches to make sure recovery works
## Motivation
The historic WAL is currently stored in S3 even after it has been replayed by
the pageserver and thus been integrated into the pageserver's storage system.
This is done to defend from data corruption failures inside the pageservers.
However, application of this WAL in the disaster recovery setting is currently
very manual and we want to automate this to make it easier.
### Use cases
There are various use cases for this feature, like:
* The main motivation is replaying in the instance of pageservers corrupting
data.
* We might want to, beyond the user-visible history features, through our
support channels and upon customer request, in select instances, recover
historic versions beyond the range of history that we officially support.
* Running the recovery process in the background for random tenant timelines
to figure out if there was a corruption of data (we would compare with what
the pageserver stores for the "official" timeline).
* Using the WAL to arrive at historic pages we can then back up to S3 so that
WAL itself can be discarded, or at least not used for future replays.
Again, this sounds a lot like what the pageserver is already doing, but the
point is to provide a fallback to the service provided by the pageserver.
## Design
### Design constraints
The main design constraint is that the feature needs to be *simple* enough that
the number of bugs are as low, and reliability as high as possible: the main
goal of this endeavour is to achieve higher correctness than the pageserver.
For the background process, we cannot afford a downtime of the timeline that is
being cloned, as we don't want to restrict ourselves to offline tenants only.
In the scenario where we want to recover from disasters or roll back to a
historic lsn through support staff, downtimes are more affordable, and
inevitable if the original had been subject to the corruption. Ideally, the
two code paths would share code, so the solution would be designed for not
requiring downtimes.
### API endpoint changes
This RFC proposes two API endpoint changes in the safekeeper and the
pageserver.
Remember, the pageserver timeline API creation endpoint is to this URL:
```
/v1/tenant/{tenant_id}/timeline/
```
Where `{tenant_id}` is the ID of the tenant the timeline is created for,
and specified as part of the URL. The timeline ID is passed via the POST
request body as the only required parameter `new_timeline_id`.
This proposal adds one optional parameter called
`existing_initdb_timeline_id` to the request's json body. If the parameter
is not specified, behaviour should be as existing, so the pageserver runs
initdb.
If the parameter is specified, it is expected to point to a timeline ID.
In fact that ID might match `new_timeline_id`, what's important is that
S3 storage contains a matching initdb under the URL matching the given
tenant and timeline.
Having both `ancestor_timeline_id` and `existing_initdb_timeline_id`
specified is illegal and will yield in an HTTP error. This feature is
only meant for the "main" branch that doesn't have any ancestors
of its own, as only here initdb is relevant.
For the safekeeper, we propose the addition of the following copy endpoint:
```
/v1/tenant/{tenant_id}/timeline/{source_timeline_id}/copy
```
it is meant for POST requests with json, and the two URL parameters
`tenant_id` and `source_timeline_id`. The json request body contains
the two required parameters `target_timeline_id` and `until_lsn`.
After invoking, the copy endpoint starts a copy process of the WAL from
the source ID to the target ID. The lsn is updated according to the
progress of the API call.
### Higher level features
We want the API changes to support the following higher level features:
* recovery-after-corruption DR of the main timeline of a tenant. This
feature allows for downtime.
* test DR of the main timeline into a special copy timeline. this feature
is meant to run against selected production tenants in the background,
without the user noticing, so it does not allow for downtime.
The recovery-after-corruption DR only needs the pageserver changes.
It works as follows:
* delete the timeline from the pageservers via timeline deletion API
* re-create it via timeline creation API (same ID as before) and set
`existing_initdb_timeline_id` to the same timeline ID
The test DR requires also the copy primitive and works as follows:
* copy the WAL of the timeline to a new place
* create a new timeline for the tenant
## Non Goals
At the danger of being repetitive, the main goal of this feature is to be a
backup method, so reliability is very important. This implies that other
aspects like performance or space reduction are less important.
### Corrupt WAL
The process suggested by this RFC assumes that the WAL is free of corruption.
In some instances, corruption can make it into WAL, like for example when
higher level components like postgres or the application first read corrupt
data, and then execute a write with data derived from that earlier read. That
written data might then contain the corruption.
Common use cases can hit this quite easily. For example, an application reads
some counter, increments it, and then writes the new counter value to the
database.
On a lower level, the compute might put FPIs (Full Page Images) into the WAL,
which have corrupt data for rows unrelated to the write operation at hand.
Separating corrupt writes from non-corrupt ones is a hard problem in general,
and if the application was involved in making the corrupt write, a recovery
would also involve the application. Therefore, corruption that has made it into
the WAL is outside of the scope of this feature. However, the WAL replay can be
issued to right before the point in time where the corruption occured. Then the
data loss is isolated to post-corruption writes only.
## Impacted components (e.g. pageserver, safekeeper, console, etc)
Most changes would happen to the pageservers.
For the higher level features, maybe other components like the console would
be involved.
We need to make sure that the shadow timelines are not subject to the usual
limits and billing we apply to existing timelines.
## Proposed implementation
The first problem to keep in mind is the reproducability of `initdb`.
So an initial step would be to upload `initdb` snapshots to S3.
After that, we'd have the endpoint spawn a background process which
performs the replay of the WAL to that new timeline. This process should
follow the existing workflows as closely as possible, just using the
WAL records of a different timeline.
The timeline created will be in a special state that solely looks for WAL
entries of the timeline it is trying to copy. Once the target LSN is reached,
it turns into a normal timeline that also accepts writes to its own
timeline ID.
### Scalability
For now we want to run this entire process on a single node, and as
it is by nature linear, it's hard to parallelize. However, for the
verification workloads, we can easily start the WAL replay in parallel
for different points in time. This is valuable especially for tenants
with large WAL records.
Compare this with the tricks to make addition circuits execute with
lower latency by making them perform the addition for both possible
values of the carry bit, and then, in a second step, taking the
result for the carry bit that was actually obtained.
The other scalability dimension to consider is the WAL length, which
is a growing question as tenants accumulate changes. There are
possible approaches to this, including creating snapshots of the
page files and uploading them to S3, but if we do this for every single
branch, we lose the cheap branching property.
### Implementation by component
The proposed changes for the various components of the neon architecture
are written up in this notion page:
https://www.notion.so/neondatabase/Pageserver-disaster-recovery-one-pager-4ecfb5df16ce4f6bbfc3817ed1a6cbb2
### Unresolved questions
none known (outside of the mentioned ones).

View File

@@ -26,13 +26,6 @@ pub struct ComputeSpec {
// but we don't use it for anything. Serde will ignore missing fields when
// deserializing it.
pub operation_uuid: Option<String>,
/// Compute features to enable. These feature flags are provided, when we
/// know all the details about client's compute, so they cannot be used
/// to change `Empty` compute behavior.
#[serde(default)]
pub features: Vec<ComputeFeature>,
/// Expected cluster state at the end of transition process.
pub cluster: Cluster,
pub delta_operations: Option<Vec<DeltaOp>>,
@@ -75,19 +68,6 @@ pub struct ComputeSpec {
pub remote_extensions: Option<RemoteExtSpec>,
}
/// Feature flag to signal `compute_ctl` to enable certain experimental functionality.
#[derive(Serialize, Clone, Copy, Debug, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "snake_case")]
pub enum ComputeFeature {
// XXX: Add more feature flags here.
// This is a special feature flag that is used to represent unknown feature flags.
// Basically all unknown to enum flags are represented as this one. See unit test
// `parse_unknown_features()` for more details.
#[serde(other)]
UnknownFeature,
}
#[derive(Clone, Debug, Default, Deserialize, Serialize)]
pub struct RemoteExtSpec {
pub public_extensions: Option<Vec<String>>,
@@ -249,10 +229,7 @@ mod tests {
#[test]
fn parse_spec_file() {
let file = File::open("tests/cluster_spec.json").unwrap();
let spec: ComputeSpec = serde_json::from_reader(file).unwrap();
// Features list defaults to empty vector.
assert!(spec.features.is_empty());
let _spec: ComputeSpec = serde_json::from_reader(file).unwrap();
}
#[test]
@@ -264,22 +241,4 @@ mod tests {
ob.insert("unknown_field_123123123".into(), "hello".into());
let _spec: ComputeSpec = serde_json::from_value(json).unwrap();
}
#[test]
fn parse_unknown_features() {
// Test that unknown feature flags do not cause any errors.
let file = File::open("tests/cluster_spec.json").unwrap();
let mut json: serde_json::Value = serde_json::from_reader(file).unwrap();
let ob = json.as_object_mut().unwrap();
// Add unknown feature flags.
let features = vec!["foo_bar_feature", "baz_feature"];
ob.insert("features".into(), features.into());
let spec: ComputeSpec = serde_json::from_value(json).unwrap();
assert!(spec.features.len() == 2);
assert!(spec.features.contains(&ComputeFeature::UnknownFeature));
assert_eq!(spec.features, vec![ComputeFeature::UnknownFeature; 2]);
}
}

View File

@@ -4,9 +4,7 @@
//! See docs/rfcs/025-generation-numbers.md
use serde::{Deserialize, Serialize};
use utils::id::NodeId;
use crate::shard::TenantShardId;
use utils::id::{NodeId, TenantId};
#[derive(Serialize, Deserialize)]
pub struct ReAttachRequest {
@@ -15,7 +13,7 @@ pub struct ReAttachRequest {
#[derive(Serialize, Deserialize)]
pub struct ReAttachResponseTenant {
pub id: TenantShardId,
pub id: TenantId,
pub gen: u32,
}
@@ -26,7 +24,7 @@ pub struct ReAttachResponse {
#[derive(Serialize, Deserialize)]
pub struct ValidateRequestTenant {
pub id: TenantShardId,
pub id: TenantId,
pub gen: u32,
}
@@ -42,6 +40,6 @@ pub struct ValidateResponse {
#[derive(Serialize, Deserialize)]
pub struct ValidateResponseTenant {
pub id: TenantShardId,
pub id: TenantId,
pub valid: bool,
}

View File

@@ -140,7 +140,3 @@ impl Key {
})
}
}
pub fn is_rel_block_key(key: &Key) -> bool {
key.field1 == 0x00 && key.field4 != 0
}

View File

@@ -179,8 +179,6 @@ pub struct TimelineCreateRequest {
#[serde(default)]
pub ancestor_timeline_id: Option<TimelineId>,
#[serde(default)]
pub existing_initdb_timeline_id: Option<TimelineId>,
#[serde(default)]
pub ancestor_start_lsn: Option<Lsn>,
pub pg_version: Option<u32>,
}
@@ -316,7 +314,25 @@ impl std::ops::Deref for TenantConfigRequest {
impl TenantConfigRequest {
pub fn new(tenant_id: TenantId) -> TenantConfigRequest {
let config = TenantConfig::default();
let config = TenantConfig {
checkpoint_distance: None,
checkpoint_timeout: None,
compaction_target_size: None,
compaction_period: None,
compaction_threshold: None,
gc_horizon: None,
gc_period: None,
image_creation_threshold: None,
pitr_interval: None,
walreceiver_connect_timeout: None,
lagging_wal_timeout: None,
max_lsn_wal_lag: None,
trace_read_requests: None,
eviction_policy: None,
min_resident_size_override: None,
evictions_low_residence_duration_metric_threshold: None,
gc_feedback: None,
};
TenantConfigRequest { tenant_id, config }
}
}
@@ -384,9 +400,7 @@ pub struct TimelineInfo {
/// The LSN that we are advertizing to safekeepers
pub remote_consistent_lsn_visible: Lsn,
pub current_logical_size: u64,
pub current_logical_size_is_accurate: bool,
pub current_logical_size: Option<u64>, // is None when timeline is Unloaded
/// Sum of the size of all layer files.
/// If a layer is present in both local FS and S3, it counts only once.
pub current_physical_size: Option<u64>, // is None when timeline is Unloaded

View File

@@ -1,15 +1,14 @@
use std::{ops::RangeInclusive, str::FromStr};
use crate::key::{is_rel_block_key, Key};
use hex::FromHex;
use serde::{Deserialize, Serialize};
use thiserror;
use utils::id::TenantId;
#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug)]
pub struct ShardNumber(pub u8);
#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug)]
pub struct ShardCount(pub u8);
impl ShardCount {
@@ -40,7 +39,7 @@ impl ShardNumber {
/// Note that the binary encoding is _not_ backward compatible, because
/// at the time sharding is introduced, there are no existing binary structures
/// containing TenantId that we need to handle.
#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy)]
pub struct TenantShardId {
pub tenant_id: TenantId,
pub shard_number: ShardNumber,
@@ -73,28 +72,19 @@ impl TenantShardId {
)
}
pub fn shard_slug(&self) -> impl std::fmt::Display + '_ {
ShardSlug(self)
}
}
/// Formatting helper
struct ShardSlug<'a>(&'a TenantShardId);
impl<'a> std::fmt::Display for ShardSlug<'a> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(
f,
"{:02x}{:02x}",
self.0.shard_number.0, self.0.shard_count.0
)
pub fn shard_slug(&self) -> String {
format!("{:02x}{:02x}", self.shard_number.0, self.shard_count.0)
}
}
impl std::fmt::Display for TenantShardId {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
if self.shard_count != ShardCount(0) {
write!(f, "{}-{}", self.tenant_id, self.shard_slug())
write!(
f,
"{}-{:02x}{:02x}",
self.tenant_id, self.shard_number.0, self.shard_count.0
)
} else {
// Legacy case (shard_count == 0) -- format as just the tenant id. Note that this
// is distinct from the normal single shard case (shard count == 1).
@@ -312,8 +302,6 @@ pub struct ShardStripeSize(pub u32);
pub struct ShardLayout(u8);
const LAYOUT_V1: ShardLayout = ShardLayout(1);
/// ShardIdentity uses a magic layout value to indicate if it is unusable
const LAYOUT_BROKEN: ShardLayout = ShardLayout(255);
/// Default stripe size in pages: 256MiB divided by 8kiB page size.
const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8);
@@ -322,10 +310,10 @@ const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8);
/// to resolve a key to a shard, and then check whether that shard is ==self.
#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
pub struct ShardIdentity {
pub layout: ShardLayout,
pub number: ShardNumber,
pub count: ShardCount,
stripe_size: ShardStripeSize,
layout: ShardLayout,
pub stripe_size: ShardStripeSize,
}
#[derive(thiserror::Error, Debug, PartialEq, Eq)]
@@ -351,22 +339,6 @@ impl ShardIdentity {
}
}
/// A broken instance of this type is only used for `TenantState::Broken` tenants,
/// which are constructed in code paths that don't have access to proper configuration.
///
/// A ShardIdentity in this state may not be used for anything, and should not be persisted.
/// Enforcement is via assertions, to avoid making our interface fallible for this
/// edge case: it is the Tenant's responsibility to avoid trying to do any I/O when in a broken
/// state, and by extension to avoid trying to do any page->shard resolution.
pub fn broken(number: ShardNumber, count: ShardCount) -> Self {
Self {
number,
count,
layout: LAYOUT_BROKEN,
stripe_size: DEFAULT_STRIPE_SIZE,
}
}
pub fn is_unsharded(&self) -> bool {
self.number == ShardNumber(0) && self.count == ShardCount(0)
}
@@ -393,33 +365,6 @@ impl ShardIdentity {
})
}
}
fn is_broken(&self) -> bool {
self.layout == LAYOUT_BROKEN
}
pub fn get_shard_number(&self, key: &Key) -> ShardNumber {
assert!(!self.is_broken());
key_to_shard_number(self.count, self.stripe_size, key)
}
/// Return true if the key should be ingested by this shard
pub fn is_key_local(&self, key: &Key) -> bool {
assert!(!self.is_broken());
if self.count < ShardCount(2) || (key_is_shard0(key) && self.number == ShardNumber(0)) {
true
} else {
key_to_shard_number(self.count, self.stripe_size, key) == self.number
}
}
pub fn shard_slug(&self) -> String {
if self.count > ShardCount(0) {
format!("-{:02x}{:02x}", self.number.0, self.count.0)
} else {
String::new()
}
}
}
impl Serialize for ShardIndex {
@@ -493,65 +438,6 @@ impl<'de> Deserialize<'de> for ShardIndex {
}
}
/// Whether this key is always held on shard 0 (e.g. shard 0 holds all SLRU keys
/// in order to be able to serve basebackup requests without peer communication).
fn key_is_shard0(key: &Key) -> bool {
// To decide what to shard out to shards >0, we apply a simple rule that only
// relation pages are distributed to shards other than shard zero. Everything else gets
// stored on shard 0. This guarantees that shard 0 can independently serve basebackup
// requests, and any request other than those for particular blocks in relations.
//
// In this condition:
// - is_rel_block_key includes only relations, i.e. excludes SLRU data and
// all metadata.
// - field6 is set to -1 for relation size pages.
!(is_rel_block_key(key) && key.field6 != 0xffffffff)
}
/// Provide the same result as the function in postgres `hashfn.h` with the same name
fn murmurhash32(mut h: u32) -> u32 {
h ^= h >> 16;
h = h.wrapping_mul(0x85ebca6b);
h ^= h >> 13;
h = h.wrapping_mul(0xc2b2ae35);
h ^= h >> 16;
h
}
/// Provide the same result as the function in postgres `hashfn.h` with the same name
fn hash_combine(mut a: u32, mut b: u32) -> u32 {
b = b.wrapping_add(0x9e3779b9);
b = b.wrapping_add(a << 6);
b = b.wrapping_add(a >> 2);
a ^= b;
a
}
/// Where a Key is to be distributed across shards, select the shard. This function
/// does not account for keys that should be broadcast across shards.
///
/// The hashing in this function must exactly match what we do in postgres smgr
/// code. The resulting distribution of pages is intended to preserve locality within
/// `stripe_size` ranges of contiguous block numbers in the same relation, while otherwise
/// distributing data pseudo-randomly.
///
/// The mapping of key to shard is not stable across changes to ShardCount: this is intentional
/// and will be handled at higher levels when shards are split.
fn key_to_shard_number(count: ShardCount, stripe_size: ShardStripeSize, key: &Key) -> ShardNumber {
// Fast path for un-sharded tenants or broadcast keys
if count < ShardCount(2) || key_is_shard0(key) {
return ShardNumber(0);
}
// relNode
let mut hash = murmurhash32(key.field4);
// blockNum/stripe size
hash = hash_combine(hash, murmurhash32(key.field6 / stripe_size.0));
ShardNumber((hash % count.0 as u32) as u8)
}
#[cfg(test)]
mod tests {
use std::str::FromStr;
@@ -723,29 +609,4 @@ mod tests {
Ok(())
}
// These are only smoke tests to spot check that our implementation doesn't
// deviate from a few examples values: not aiming to validate the overall
// hashing algorithm.
#[test]
fn murmur_hash() {
assert_eq!(murmurhash32(0), 0);
assert_eq!(hash_combine(0xb1ff3b40, 0), 0xfb7923c9);
}
#[test]
fn shard_mapping() {
let key = Key {
field1: 0x00,
field2: 0x67f,
field3: 0x5,
field4: 0x400c,
field5: 0x00,
field6: 0x7d06,
};
let shard = key_to_shard_number(ShardCount(10), DEFAULT_STRIPE_SIZE, &key);
assert_eq!(shard, ShardNumber(8));
}
}

View File

@@ -289,10 +289,10 @@ impl FeStartupPacket {
// We shouldn't advance `buf` as probably full message is not there yet,
// so can't directly use Bytes::get_u32 etc.
let len = (&buf[0..4]).read_u32::<BigEndian>().unwrap() as usize;
// The proposed replacement is `!(8..=MAX_STARTUP_PACKET_LENGTH).contains(&len)`
// The proposed replacement is `!(4..=MAX_STARTUP_PACKET_LENGTH).contains(&len)`
// which is less readable
#[allow(clippy::manual_range_contains)]
if len < 8 || len > MAX_STARTUP_PACKET_LENGTH {
if len < 4 || len > MAX_STARTUP_PACKET_LENGTH {
return Err(ProtocolError::Protocol(format!(
"invalid startup packet message length {}",
len
@@ -975,10 +975,4 @@ mod tests {
let params = make_params("foo\\ bar \\ \\\\ baz\\ lol");
assert_eq!(split_options(&params), ["foo bar", " \\", "baz ", "lol"]);
}
#[test]
fn parse_fe_startup_packet_regression() {
let data = [0, 0, 0, 7, 0, 0, 0, 0];
FeStartupPacket::parse(&mut BytesMut::from_iter(data)).unwrap_err();
}
}

View File

@@ -9,7 +9,8 @@ anyhow.workspace = true
async-trait.workspace = true
once_cell.workspace = true
aws-smithy-async.workspace = true
aws-smithy-types.workspace = true
aws-smithy-http.workspace = true
aws-types.workspace = true
aws-config.workspace = true
aws-sdk-s3.workspace = true
aws-credential-types.workspace = true

View File

@@ -14,20 +14,18 @@ use aws_config::{
provider_config::ProviderConfig,
retry::{RetryConfigBuilder, RetryMode},
web_identity_token::WebIdentityTokenCredentialsProvider,
BehaviorVersion,
};
use aws_credential_types::provider::SharedCredentialsProvider;
use aws_credential_types::cache::CredentialsCache;
use aws_sdk_s3::{
config::{AsyncSleep, Builder, IdentityCache, Region, SharedAsyncSleep},
config::{AsyncSleep, Config, Region, SharedAsyncSleep},
error::SdkError,
operation::get_object::GetObjectError,
primitives::ByteStream,
types::{Delete, ObjectIdentifier},
Client,
};
use aws_smithy_async::rt::sleep::TokioSleep;
use aws_smithy_types::body::SdkBody;
use aws_smithy_types::byte_stream::ByteStream;
use aws_smithy_http::body::SdkBody;
use hyper::Body;
use scopeguard::ScopeGuard;
use tokio::io::{self, AsyncRead};
@@ -80,6 +78,7 @@ impl S3Bucket {
// needed to access remote extensions bucket
.or_else("token", {
let provider_conf = ProviderConfig::without_region().with_region(region.clone());
WebIdentityTokenCredentialsProvider::builder()
.configure(&provider_conf)
.build()
@@ -99,20 +98,18 @@ impl S3Bucket {
.set_max_attempts(Some(1))
.set_mode(Some(RetryMode::Adaptive));
let mut config_builder = Builder::default()
.behavior_version(BehaviorVersion::v2023_11_09())
let mut config_builder = Config::builder()
.region(region)
.identity_cache(IdentityCache::lazy().build())
.credentials_provider(SharedCredentialsProvider::new(credentials_provider))
.retry_config(retry_config.build())
.sleep_impl(SharedAsyncSleep::from(sleep_impl));
.credentials_cache(CredentialsCache::lazy())
.credentials_provider(credentials_provider)
.sleep_impl(SharedAsyncSleep::from(sleep_impl))
.retry_config(retry_config.build());
if let Some(custom_endpoint) = aws_config.endpoint.clone() {
config_builder = config_builder
.endpoint_url(custom_endpoint)
.force_path_style(true);
}
let client = Client::from_conf(config_builder.build());
let prefix_in_bucket = aws_config.prefix_in_bucket.as_deref().map(|prefix| {
@@ -374,11 +371,11 @@ impl RemoteStorage for S3Bucket {
let response = response?;
let keys = response.contents();
let keys = response.contents().unwrap_or_default();
let empty = Vec::new();
let prefixes = response.common_prefixes.as_ref().unwrap_or(&empty);
tracing::debug!("list: {} prefixes, {} keys", prefixes.len(), keys.len());
tracing::info!("list: {} prefixes, {} keys", prefixes.len(), keys.len());
for object in keys {
let object_path = object.key().expect("response does not contain a key");
@@ -414,7 +411,7 @@ impl RemoteStorage for S3Bucket {
let started_at = start_measuring_requests(kind);
let body = Body::wrap_stream(ReaderStream::new(from));
let bytes_stream = ByteStream::new(SdkBody::from_body_0_4(body));
let bytes_stream = ByteStream::new(SdkBody::from(body));
let res = self
.client
@@ -477,7 +474,7 @@ impl RemoteStorage for S3Bucket {
for path in paths {
let obj_id = ObjectIdentifier::builder()
.set_key(Some(self.relative_path_to_s3_object(path)))
.build()?;
.build();
delete_objects.push(obj_id);
}
@@ -488,11 +485,7 @@ impl RemoteStorage for S3Bucket {
.client
.delete_objects()
.bucket(self.bucket_name.clone())
.delete(
Delete::builder()
.set_objects(Some(chunk.to_vec()))
.build()?,
)
.delete(Delete::builder().set_objects(Some(chunk.to_vec())).build())
.send()
.await;

View File

@@ -152,16 +152,3 @@ impl Debug for Generation {
}
}
}
#[cfg(test)]
mod test {
use super::*;
#[test]
fn generation_gt() {
// Important that a None generation compares less than a valid one, during upgrades from
// pre-generation systems.
assert!(Generation::none() < Generation::new(0));
assert!(Generation::none() < Generation::new(1));
}
}

View File

@@ -1,10 +1,10 @@
//!
//! RCU stands for Read-Copy-Update. It's a synchronization mechanism somewhat
//! similar to a lock, but it allows readers to "hold on" to an old value of RCU
//! without blocking writers, and allows writing a new value without blocking
//! readers. When you update the value, the new value is immediately visible
//! without blocking writers, and allows writing a new values without blocking
//! readers. When you update the new value, the new value is immediately visible
//! to new readers, but the update waits until all existing readers have
//! finished, so that on return, no one sees the old value anymore.
//! finishe, so that no one sees the old value anymore.
//!
//! This implementation isn't wait-free; it uses an RwLock that is held for a
//! short duration when the value is read or updated.
@@ -26,7 +26,6 @@
//! Increment the value by one, and wait for old readers to finish:
//!
//! ```
//! # async fn dox() {
//! # let rcu = utils::simple_rcu::Rcu::new(1);
//! let write_guard = rcu.lock_for_write();
//!
@@ -37,17 +36,15 @@
//!
//! // Concurrent reads and writes are now possible again. Wait for all the readers
//! // that still observe the old value to finish.
//! waitlist.wait().await;
//! # }
//! waitlist.wait();
//! ```
//!
#![warn(missing_docs)]
use std::ops::Deref;
use std::sync::mpsc::{sync_channel, Receiver, SyncSender};
use std::sync::{Arc, Weak};
use std::sync::{RwLock, RwLockWriteGuard};
use tokio::sync::watch;
use std::sync::{Mutex, RwLock, RwLockWriteGuard};
///
/// Rcu allows multiple readers to read and hold onto a value without blocking
@@ -71,21 +68,22 @@ struct RcuCell<V> {
value: V,
/// A dummy channel. We never send anything to this channel. The point is
/// that when the RcuCell is dropped, any subscribed Receivers will be notified
/// that when the RcuCell is dropped, any cloned Senders will be notified
/// that the channel is closed. Updaters can use this to wait out until the
/// RcuCell has been dropped, i.e. until the old value is no longer in use.
///
/// We never send anything to this, we just need to hold onto it so that the
/// Receivers will be notified when it's dropped.
watch: watch::Sender<()>,
/// We never do anything with the receiver, we just need to hold onto it so
/// that the Senders will be notified when it's dropped. But because it's
/// not Sync, we need a Mutex on it.
watch: (SyncSender<()>, Mutex<Receiver<()>>),
}
impl<V> RcuCell<V> {
fn new(value: V) -> Self {
let (watch_sender, _) = watch::channel(());
let (watch_sender, watch_receiver) = sync_channel(0);
RcuCell {
value,
watch: watch_sender,
watch: (watch_sender, Mutex::new(watch_receiver)),
}
}
}
@@ -143,10 +141,10 @@ impl<V> Deref for RcuReadGuard<V> {
///
/// Write guard returned by `write`
///
/// NB: Holding this guard blocks all concurrent `read` and `write` calls, so it should only be
/// held for a short duration!
/// NB: Holding this guard blocks all concurrent `read` and `write` calls, so
/// it should only be held for a short duration!
///
/// Calling [`Self::store_and_unlock`] consumes the guard, making new reads and new writes possible
/// Calling `store` consumes the guard, making new reads and new writes possible
/// again.
///
pub struct RcuWriteGuard<'a, V> {
@@ -181,7 +179,7 @@ impl<'a, V> RcuWriteGuard<'a, V> {
// the watches for any that do.
self.inner.old_cells.retain(|weak| {
if let Some(cell) = weak.upgrade() {
watches.push(cell.watch.subscribe());
watches.push(cell.watch.0.clone());
true
} else {
false
@@ -195,20 +193,20 @@ impl<'a, V> RcuWriteGuard<'a, V> {
///
/// List of readers who can still see old values.
///
pub struct RcuWaitList(Vec<watch::Receiver<()>>);
pub struct RcuWaitList(Vec<SyncSender<()>>);
impl RcuWaitList {
///
/// Wait for old readers to finish.
///
pub async fn wait(mut self) {
pub fn wait(mut self) {
// after all the old_cells are no longer in use, we're done
for w in self.0.iter_mut() {
// This will block until the Receiver is closed. That happens when
// the RcuCell is dropped.
#[allow(clippy::single_match)]
match w.changed().await {
Ok(_) => panic!("changed() unexpectedly succeeded on dummy channel"),
match w.send(()) {
Ok(_) => panic!("send() unexpectedly succeeded on dummy channel"),
Err(_) => {
// closed, which means that the cell has been dropped, and
// its value is no longer in use
@@ -222,10 +220,11 @@ impl RcuWaitList {
mod tests {
use super::*;
use std::sync::{Arc, Mutex};
use std::thread::{sleep, spawn};
use std::time::Duration;
#[tokio::test]
async fn two_writers() {
#[test]
fn two_writers() {
let rcu = Rcu::new(1);
let read1 = rcu.read();
@@ -249,35 +248,33 @@ mod tests {
assert_eq!(*read1, 1);
let log = Arc::new(Mutex::new(Vec::new()));
// Wait for the old readers to finish in separate tasks.
// Wait for the old readers to finish in separate threads.
let log_clone = Arc::clone(&log);
let task2 = tokio::spawn(async move {
wait2.wait().await;
let thread2 = spawn(move || {
wait2.wait();
log_clone.lock().unwrap().push("wait2 done");
});
let log_clone = Arc::clone(&log);
let task3 = tokio::spawn(async move {
wait3.wait().await;
let thread3 = spawn(move || {
wait3.wait();
log_clone.lock().unwrap().push("wait3 done");
});
// without this sleep the test can pass on accident if the writer is slow
tokio::time::sleep(Duration::from_millis(100)).await;
sleep(Duration::from_millis(500));
// Release first reader. This allows first write to finish, but calling
// wait() on the 'task3' would still block.
// wait() on the second one would still block.
log.lock().unwrap().push("dropping read1");
drop(read1);
task2.await.unwrap();
thread2.join().unwrap();
assert!(!task3.is_finished());
tokio::time::sleep(Duration::from_millis(100)).await;
sleep(Duration::from_millis(500));
// Release second reader, and finish second writer.
log.lock().unwrap().push("dropping read2");
drop(read2);
task3.await.unwrap();
thread3.join().unwrap();
assert_eq!(
log.lock().unwrap().as_slice(),

View File

@@ -51,7 +51,6 @@ regex.workspace = true
scopeguard.workspace = true
serde.workspace = true
serde_json = { workspace = true, features = ["raw_value"] }
serde_path_to_error.workspace = true
serde_with.workspace = true
signal-hook.workspace = true
smallvec = { workspace = true, features = ["write"] }

View File

@@ -10,6 +10,7 @@ use std::io::{BufRead, BufReader};
use std::path::PathBuf;
use std::str::FromStr;
use std::time::Instant;
use utils::id::{TenantId, TimelineId};
use utils::lsn::Lsn;
@@ -209,8 +210,13 @@ fn bench_sequential(c: &mut Criterion) {
for i in 0..100_000 {
let i32 = (i as u32) % 100;
let zero = Key::from_hex("000000000000000000000000000000000000").unwrap();
let layer =
PersistentLayerDesc::new_img(zero.add(10 * i32)..zero.add(10 * i32 + 1), Lsn(i), 0);
let layer = PersistentLayerDesc::new_img(
TenantId::generate(),
TimelineId::generate(),
zero.add(10 * i32)..zero.add(10 * i32 + 1),
Lsn(i),
0,
);
updates.insert_historic(layer);
}
updates.flush();

View File

@@ -1,15 +1,13 @@
use std::path::{Path, PathBuf};
use anyhow::Result;
use camino::{Utf8Path, Utf8PathBuf};
use camino::Utf8Path;
use clap::Subcommand;
use pageserver::context::{DownloadBehavior, RequestContext};
use pageserver::task_mgr::TaskKind;
use pageserver::tenant::block_io::BlockCursor;
use pageserver::tenant::disk_btree::DiskBtreeReader;
use pageserver::tenant::storage_layer::delta_layer::{BlobRef, Summary};
use pageserver::tenant::storage_layer::{delta_layer, image_layer};
use pageserver::tenant::storage_layer::{DeltaLayer, ImageLayer};
use pageserver::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
use pageserver::{page_cache, virtual_file};
use pageserver::{
@@ -22,7 +20,6 @@ use pageserver::{
};
use std::fs;
use utils::bin_ser::BeSer;
use utils::id::{TenantId, TimelineId};
use crate::layer_map_analyzer::parse_filename;
@@ -48,13 +45,6 @@ pub(crate) enum LayerCmd {
/// The id from list-layer command
id: usize,
},
RewriteSummary {
layer_file_path: Utf8PathBuf,
#[clap(long)]
new_tenant_id: Option<TenantId>,
#[clap(long)]
new_timeline_id: Option<TimelineId>,
},
}
async fn read_delta_file(path: impl AsRef<Path>, ctx: &RequestContext) -> Result<()> {
@@ -110,7 +100,6 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
println!("- timeline {}", timeline.file_name().to_string_lossy());
}
}
Ok(())
}
LayerCmd::ListLayer {
path,
@@ -139,7 +128,6 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
idx += 1;
}
}
Ok(())
}
LayerCmd::DumpLayer {
path,
@@ -180,63 +168,7 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
idx += 1;
}
}
Ok(())
}
LayerCmd::RewriteSummary {
layer_file_path,
new_tenant_id,
new_timeline_id,
} => {
pageserver::virtual_file::init(10);
pageserver::page_cache::init(100);
let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
macro_rules! rewrite_closure {
($($summary_ty:tt)*) => {{
|summary| $($summary_ty)* {
tenant_id: new_tenant_id.unwrap_or(summary.tenant_id),
timeline_id: new_timeline_id.unwrap_or(summary.timeline_id),
..summary
}
}};
}
let res = ImageLayer::rewrite_summary(
layer_file_path,
rewrite_closure!(image_layer::Summary),
&ctx,
)
.await;
match res {
Ok(()) => {
println!("Successfully rewrote summary of image layer {layer_file_path}");
return Ok(());
}
Err(image_layer::RewriteSummaryError::MagicMismatch) => (), // fallthrough
Err(image_layer::RewriteSummaryError::Other(e)) => {
return Err(e);
}
}
let res = DeltaLayer::rewrite_summary(
layer_file_path,
rewrite_closure!(delta_layer::Summary),
&ctx,
)
.await;
match res {
Ok(()) => {
println!("Successfully rewrote summary of delta layer {layer_file_path}");
return Ok(());
}
Err(delta_layer::RewriteSummaryError::MagicMismatch) => (), // fallthrough
Err(delta_layer::RewriteSummaryError::Other(e)) => {
return Err(e);
}
}
anyhow::bail!("not an image or delta layer: {layer_file_path}");
}
}
Ok(())
}

View File

@@ -402,11 +402,15 @@ fn start_pageserver(
let (init_remote_done_tx, init_remote_done_rx) = utils::completion::channel();
let (init_done_tx, init_done_rx) = utils::completion::channel();
let (init_logical_size_done_tx, init_logical_size_done_rx) = utils::completion::channel();
let (background_jobs_can_start, background_jobs_barrier) = utils::completion::channel();
let order = pageserver::InitializationOrder {
initial_tenant_load_remote: Some(init_done_tx),
initial_tenant_load: Some(init_remote_done_tx),
initial_logical_size_can_start: init_done_rx.clone(),
initial_logical_size_attempt: Some(init_logical_size_done_tx),
background_jobs_can_start: background_jobs_barrier.clone(),
};
@@ -460,7 +464,7 @@ fn start_pageserver(
});
let WaitForPhaseResult {
timeout_remaining: _timeout,
timeout_remaining: timeout,
skipped: init_load_skipped,
} = wait_for_phase("initial_tenant_load", init_load_done, timeout).await;
@@ -468,6 +472,26 @@ fn start_pageserver(
scopeguard::ScopeGuard::into_inner(guard);
let guard = scopeguard::guard_on_success((), |_| {
tracing::info!("Cancelled before initial logical sizes completed")
});
let logical_sizes_done = std::pin::pin!(async {
init_logical_size_done_rx.wait().await;
startup_checkpoint(
started_startup_at,
"initial_logical_sizes",
"Initial logical sizes completed",
);
});
let WaitForPhaseResult {
timeout_remaining: _,
skipped: logical_sizes_skipped,
} = wait_for_phase("initial_logical_sizes", logical_sizes_done, timeout).await;
scopeguard::ScopeGuard::into_inner(guard);
// allow background jobs to start: we either completed prior stages, or they reached timeout
// and were skipped. It is important that we do not let them block background jobs indefinitely,
// because things like consumption metrics for billing are blocked by this barrier.
@@ -490,6 +514,9 @@ fn start_pageserver(
if let Some(f) = init_load_skipped {
f.await;
}
if let Some(f) = logical_sizes_skipped {
f.await;
}
scopeguard::ScopeGuard::into_inner(guard);
startup_checkpoint(started_startup_at, "complete", "Startup complete");

View File

@@ -5,7 +5,6 @@
//! See also `settings.md` for better description on every parameter.
use anyhow::{anyhow, bail, ensure, Context, Result};
use pageserver_api::shard::TenantShardId;
use remote_storage::{RemotePath, RemoteStorageConfig};
use serde::de::IntoDeserializer;
use std::env;
@@ -26,7 +25,7 @@ use toml_edit::{Document, Item};
use camino::{Utf8Path, Utf8PathBuf};
use postgres_backend::AuthType;
use utils::{
id::{NodeId, TimelineId},
id::{NodeId, TenantId, TimelineId},
logging::LogFormat,
};
@@ -629,13 +628,12 @@ impl PageServerConf {
self.deletion_prefix().join(format!("header-{VERSION:02x}"))
}
pub fn tenant_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf {
self.tenants_path().join(tenant_shard_id.to_string())
pub fn tenant_path(&self, tenant_id: &TenantId) -> Utf8PathBuf {
self.tenants_path().join(tenant_id.to_string())
}
pub fn tenant_ignore_mark_file_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf {
self.tenant_path(tenant_shard_id)
.join(IGNORED_TENANT_FILE_NAME)
pub fn tenant_ignore_mark_file_path(&self, tenant_id: &TenantId) -> Utf8PathBuf {
self.tenant_path(tenant_id).join(IGNORED_TENANT_FILE_NAME)
}
/// Points to a place in pageserver's local directory,
@@ -643,53 +641,47 @@ impl PageServerConf {
///
/// Legacy: superseded by tenant_location_config_path. Eventually
/// remove this function.
pub fn tenant_config_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf {
self.tenant_path(tenant_shard_id).join(TENANT_CONFIG_NAME)
pub fn tenant_config_path(&self, tenant_id: &TenantId) -> Utf8PathBuf {
self.tenant_path(tenant_id).join(TENANT_CONFIG_NAME)
}
pub fn tenant_location_config_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf {
self.tenant_path(tenant_shard_id)
pub fn tenant_location_config_path(&self, tenant_id: &TenantId) -> Utf8PathBuf {
self.tenant_path(tenant_id)
.join(TENANT_LOCATION_CONFIG_NAME)
}
pub fn timelines_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf {
self.tenant_path(tenant_shard_id)
.join(TIMELINES_SEGMENT_NAME)
pub fn timelines_path(&self, tenant_id: &TenantId) -> Utf8PathBuf {
self.tenant_path(tenant_id).join(TIMELINES_SEGMENT_NAME)
}
pub fn timeline_path(
&self,
tenant_shard_id: &TenantShardId,
timeline_id: &TimelineId,
) -> Utf8PathBuf {
self.timelines_path(tenant_shard_id)
.join(timeline_id.to_string())
pub fn timeline_path(&self, tenant_id: &TenantId, timeline_id: &TimelineId) -> Utf8PathBuf {
self.timelines_path(tenant_id).join(timeline_id.to_string())
}
pub fn timeline_uninit_mark_file_path(
&self,
tenant_shard_id: TenantShardId,
tenant_id: TenantId,
timeline_id: TimelineId,
) -> Utf8PathBuf {
path_with_suffix_extension(
self.timeline_path(&tenant_shard_id, &timeline_id),
self.timeline_path(&tenant_id, &timeline_id),
TIMELINE_UNINIT_MARK_SUFFIX,
)
}
pub fn timeline_delete_mark_file_path(
&self,
tenant_shard_id: TenantShardId,
tenant_id: TenantId,
timeline_id: TimelineId,
) -> Utf8PathBuf {
path_with_suffix_extension(
self.timeline_path(&tenant_shard_id, &timeline_id),
self.timeline_path(&tenant_id, &timeline_id),
TIMELINE_DELETE_MARK_SUFFIX,
)
}
pub fn tenant_deleted_mark_file_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf {
self.tenant_path(tenant_shard_id)
pub fn tenant_deleted_mark_file_path(&self, tenant_id: &TenantId) -> Utf8PathBuf {
self.tenant_path(tenant_id)
.join(TENANT_DELETED_MARKER_FILE_NAME)
}
@@ -699,24 +691,20 @@ impl PageServerConf {
pub fn trace_path(
&self,
tenant_shard_id: &TenantShardId,
tenant_id: &TenantId,
timeline_id: &TimelineId,
connection_id: &ConnectionId,
) -> Utf8PathBuf {
self.traces_path()
.join(tenant_shard_id.to_string())
.join(tenant_id.to_string())
.join(timeline_id.to_string())
.join(connection_id.to_string())
}
/// Points to a place in pageserver's local directory,
/// where certain timeline's metadata file should be located.
pub fn metadata_path(
&self,
tenant_shard_id: &TenantShardId,
timeline_id: &TimelineId,
) -> Utf8PathBuf {
self.timeline_path(tenant_shard_id, timeline_id)
pub fn metadata_path(&self, tenant_id: &TenantId, timeline_id: &TimelineId) -> Utf8PathBuf {
self.timeline_path(tenant_id, timeline_id)
.join(METADATA_FILE_NAME)
}
@@ -779,7 +767,7 @@ impl PageServerConf {
builder.remote_storage_config(RemoteStorageConfig::from_toml(item)?)
}
"tenant_config" => {
t_conf = TenantConfOpt::try_from(item.to_owned()).context(format!("failed to parse: '{key}'"))?;
t_conf = Self::parse_toml_tenant_conf(item)?;
}
"id" => builder.id(NodeId(parse_toml_u64(key, item)?)),
"broker_endpoint" => builder.broker_endpoint(parse_toml_string(key, item)?.parse().context("failed to parse broker endpoint")?),
@@ -853,10 +841,114 @@ impl PageServerConf {
Ok(conf)
}
// subroutine of parse_and_validate to parse `[tenant_conf]` section
pub fn parse_toml_tenant_conf(item: &toml_edit::Item) -> Result<TenantConfOpt> {
let mut t_conf: TenantConfOpt = Default::default();
if let Some(checkpoint_distance) = item.get("checkpoint_distance") {
t_conf.checkpoint_distance =
Some(parse_toml_u64("checkpoint_distance", checkpoint_distance)?);
}
if let Some(checkpoint_timeout) = item.get("checkpoint_timeout") {
t_conf.checkpoint_timeout = Some(parse_toml_duration(
"checkpoint_timeout",
checkpoint_timeout,
)?);
}
if let Some(compaction_target_size) = item.get("compaction_target_size") {
t_conf.compaction_target_size = Some(parse_toml_u64(
"compaction_target_size",
compaction_target_size,
)?);
}
if let Some(compaction_period) = item.get("compaction_period") {
t_conf.compaction_period =
Some(parse_toml_duration("compaction_period", compaction_period)?);
}
if let Some(compaction_threshold) = item.get("compaction_threshold") {
t_conf.compaction_threshold =
Some(parse_toml_u64("compaction_threshold", compaction_threshold)?.try_into()?);
}
if let Some(image_creation_threshold) = item.get("image_creation_threshold") {
t_conf.image_creation_threshold = Some(
parse_toml_u64("image_creation_threshold", image_creation_threshold)?.try_into()?,
);
}
if let Some(gc_horizon) = item.get("gc_horizon") {
t_conf.gc_horizon = Some(parse_toml_u64("gc_horizon", gc_horizon)?);
}
if let Some(gc_period) = item.get("gc_period") {
t_conf.gc_period = Some(parse_toml_duration("gc_period", gc_period)?);
}
if let Some(pitr_interval) = item.get("pitr_interval") {
t_conf.pitr_interval = Some(parse_toml_duration("pitr_interval", pitr_interval)?);
}
if let Some(walreceiver_connect_timeout) = item.get("walreceiver_connect_timeout") {
t_conf.walreceiver_connect_timeout = Some(parse_toml_duration(
"walreceiver_connect_timeout",
walreceiver_connect_timeout,
)?);
}
if let Some(lagging_wal_timeout) = item.get("lagging_wal_timeout") {
t_conf.lagging_wal_timeout = Some(parse_toml_duration(
"lagging_wal_timeout",
lagging_wal_timeout,
)?);
}
if let Some(max_lsn_wal_lag) = item.get("max_lsn_wal_lag") {
t_conf.max_lsn_wal_lag =
Some(deserialize_from_item("max_lsn_wal_lag", max_lsn_wal_lag)?);
}
if let Some(trace_read_requests) = item.get("trace_read_requests") {
t_conf.trace_read_requests =
Some(trace_read_requests.as_bool().with_context(|| {
"configure option trace_read_requests is not a bool".to_string()
})?);
}
if let Some(eviction_policy) = item.get("eviction_policy") {
t_conf.eviction_policy = Some(
deserialize_from_item("eviction_policy", eviction_policy)
.context("parse eviction_policy")?,
);
}
if let Some(item) = item.get("min_resident_size_override") {
t_conf.min_resident_size_override = Some(
deserialize_from_item("min_resident_size_override", item)
.context("parse min_resident_size_override")?,
);
}
if let Some(item) = item.get("evictions_low_residence_duration_metric_threshold") {
t_conf.evictions_low_residence_duration_metric_threshold = Some(parse_toml_duration(
"evictions_low_residence_duration_metric_threshold",
item,
)?);
}
if let Some(gc_feedback) = item.get("gc_feedback") {
t_conf.gc_feedback = Some(
gc_feedback
.as_bool()
.with_context(|| "configure option gc_feedback is not a bool".to_string())?,
);
}
Ok(t_conf)
}
#[cfg(test)]
pub fn test_repo_dir(test_name: &str) -> Utf8PathBuf {
let test_output_dir = std::env::var("TEST_OUTPUT").unwrap_or("../tmp_check".into());
Utf8PathBuf::from(format!("{test_output_dir}/test_{test_name}"))
Utf8PathBuf::from(format!("../tmp_check/test_{test_name}"))
}
pub fn dummy_conf(repo_dir: Utf8PathBuf) -> Self {
@@ -1325,37 +1417,6 @@ trace_read_requests = {trace_read_requests}"#,
Ok(())
}
#[test]
fn parse_incorrect_tenant_config() -> anyhow::Result<()> {
let config_string = r#"
[tenant_config]
checkpoint_distance = -1 # supposed to be an u64
"#
.to_string();
let toml: Document = config_string.parse()?;
let item = toml.get("tenant_config").unwrap();
let error = TenantConfOpt::try_from(item.to_owned()).unwrap_err();
let expected_error_str = "checkpoint_distance: invalid value: integer `-1`, expected u64";
assert_eq!(error.to_string(), expected_error_str);
Ok(())
}
#[test]
fn parse_override_tenant_config() -> anyhow::Result<()> {
let config_string = r#"tenant_config={ min_resident_size_override = 400 }"#.to_string();
let toml: Document = config_string.parse()?;
let item = toml.get("tenant_config").unwrap();
let conf = TenantConfOpt::try_from(item.to_owned()).unwrap();
assert_eq!(conf.min_resident_size_override, Some(400));
Ok(())
}
#[test]
fn eviction_pageserver_config_parse() -> anyhow::Result<()> {
let tempdir = tempdir()?;

View File

@@ -1,8 +1,8 @@
use crate::{context::RequestContext, tenant::timeline::logical_size::CurrentLogicalSize};
use crate::context::RequestContext;
use anyhow::Context;
use chrono::{DateTime, Utc};
use consumption_metrics::EventType;
use futures::stream::StreamExt;
use pageserver_api::shard::ShardNumber;
use std::{sync::Arc, time::SystemTime};
use utils::{
id::{TenantId, TimelineId},
@@ -229,11 +229,6 @@ where
while let Some((tenant_id, tenant)) = tenants.next().await {
let mut tenant_resident_size = 0;
// Sharded tenants report all consumption metrics from shard zero
if tenant.tenant_shard_id().shard_number != ShardNumber(0) {
continue;
}
for timeline in tenant.list_timelines() {
let timeline_id = timeline.timeline_id;
@@ -356,17 +351,14 @@ impl TimelineSnapshot {
let last_record_lsn = t.get_last_record_lsn();
let current_exact_logical_size = {
let span = tracing::info_span!("collect_metrics_iteration", tenant_id = %t.tenant_shard_id.tenant_id, timeline_id = %t.timeline_id);
let size = span.in_scope(|| {
t.get_current_logical_size(
crate::tenant::timeline::GetLogicalSizePriority::Background,
ctx,
)
});
match size {
let span = tracing::info_span!("collect_metrics_iteration", tenant_id = %t.tenant_id, timeline_id = %t.timeline_id);
let res = span
.in_scope(|| t.get_current_logical_size(ctx))
.context("get_current_logical_size");
match res? {
// Only send timeline logical size when it is fully calculated.
CurrentLogicalSize::Exact(ref size) => Some(size.into()),
CurrentLogicalSize::Approximate(_) => None,
(size, is_exact) if is_exact => Some(size),
(_, _) => None,
}
};

View File

@@ -1,15 +1,16 @@
use std::collections::HashMap;
use pageserver_api::{
control_api::{
ReAttachRequest, ReAttachResponse, ValidateRequest, ValidateRequestTenant, ValidateResponse,
},
shard::TenantShardId,
use pageserver_api::control_api::{
ReAttachRequest, ReAttachResponse, ValidateRequest, ValidateRequestTenant, ValidateResponse,
};
use serde::{de::DeserializeOwned, Serialize};
use tokio_util::sync::CancellationToken;
use url::Url;
use utils::{backoff, generation::Generation, id::NodeId};
use utils::{
backoff,
generation::Generation,
id::{NodeId, TenantId},
};
use crate::config::PageServerConf;
@@ -30,11 +31,11 @@ pub enum RetryForeverError {
#[async_trait::async_trait]
pub trait ControlPlaneGenerationsApi {
async fn re_attach(&self) -> Result<HashMap<TenantShardId, Generation>, RetryForeverError>;
async fn re_attach(&self) -> Result<HashMap<TenantId, Generation>, RetryForeverError>;
async fn validate(
&self,
tenants: Vec<(TenantShardId, Generation)>,
) -> Result<HashMap<TenantShardId, bool>, RetryForeverError>;
tenants: Vec<(TenantId, Generation)>,
) -> Result<HashMap<TenantId, bool>, RetryForeverError>;
}
impl ControlPlaneClient {
@@ -126,7 +127,7 @@ impl ControlPlaneClient {
#[async_trait::async_trait]
impl ControlPlaneGenerationsApi for ControlPlaneClient {
/// Block until we get a successful response, or error out if we are shut down
async fn re_attach(&self) -> Result<HashMap<TenantShardId, Generation>, RetryForeverError> {
async fn re_attach(&self) -> Result<HashMap<TenantId, Generation>, RetryForeverError> {
let re_attach_path = self
.base_url
.join("re-attach")
@@ -153,8 +154,8 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
/// Block until we get a successful response, or error out if we are shut down
async fn validate(
&self,
tenants: Vec<(TenantShardId, Generation)>,
) -> Result<HashMap<TenantShardId, bool>, RetryForeverError> {
tenants: Vec<(TenantId, Generation)>,
) -> Result<HashMap<TenantId, bool>, RetryForeverError> {
let re_attach_path = self
.base_url
.join("validate")

View File

@@ -15,7 +15,7 @@ use crate::virtual_file::MaybeFatalIo;
use crate::virtual_file::VirtualFile;
use anyhow::Context;
use camino::Utf8PathBuf;
use pageserver_api::shard::TenantShardId;
use hex::FromHex;
use remote_storage::{GenericRemoteStorage, RemotePath};
use serde::Deserialize;
use serde::Serialize;
@@ -26,7 +26,7 @@ use tracing::Instrument;
use tracing::{self, debug, error};
use utils::crashsafe::path_with_suffix_extension;
use utils::generation::Generation;
use utils::id::TimelineId;
use utils::id::{TenantId, TimelineId};
use utils::lsn::AtomicLsn;
use utils::lsn::Lsn;
@@ -160,10 +160,11 @@ pub struct DeletionQueueClient {
lsn_table: Arc<std::sync::RwLock<VisibleLsnUpdates>>,
}
#[derive(Debug, Serialize, Deserialize, PartialEq, Eq)]
#[derive(Debug, Serialize, Deserialize)]
struct TenantDeletionList {
/// For each Timeline, a list of key fragments to append to the timeline remote path
/// when reconstructing a full key
#[serde(serialize_with = "to_hex_map", deserialize_with = "from_hex_map")]
timelines: HashMap<TimelineId, Vec<String>>,
/// The generation in which this deletion was emitted: note that this may not be the
@@ -178,11 +179,43 @@ impl TenantDeletionList {
}
}
/// For HashMaps using a `hex` compatible key, where we would like to encode the key as a string
fn to_hex_map<S, V, I>(input: &HashMap<I, V>, serializer: S) -> Result<S::Ok, S::Error>
where
S: serde::Serializer,
V: Serialize,
I: AsRef<[u8]>,
{
let transformed = input.iter().map(|(k, v)| (hex::encode(k), v));
transformed
.collect::<HashMap<String, &V>>()
.serialize(serializer)
}
/// For HashMaps using a FromHex key, where we would like to decode the key
fn from_hex_map<'de, D, V, I>(deserializer: D) -> Result<HashMap<I, V>, D::Error>
where
D: serde::de::Deserializer<'de>,
V: Deserialize<'de>,
I: FromHex + std::hash::Hash + Eq,
{
let hex_map = HashMap::<String, V>::deserialize(deserializer)?;
hex_map
.into_iter()
.map(|(k, v)| {
I::from_hex(k)
.map(|k| (k, v))
.map_err(|_| serde::de::Error::custom("Invalid hex ID"))
})
.collect()
}
/// Files ending with this suffix will be ignored and erased
/// during recovery as startup.
const TEMP_SUFFIX: &str = "tmp";
#[derive(Debug, Serialize, Deserialize, PartialEq, Eq)]
#[derive(Debug, Serialize, Deserialize)]
struct DeletionList {
/// Serialization version, for future use
version: u8,
@@ -194,7 +227,8 @@ struct DeletionList {
/// nested HashMaps by TenantTimelineID. Each Tenant only appears once
/// with one unique generation ID: if someone tries to push a second generation
/// ID for the same tenant, we will start a new DeletionList.
tenants: HashMap<TenantShardId, TenantDeletionList>,
#[serde(serialize_with = "to_hex_map", deserialize_with = "from_hex_map")]
tenants: HashMap<TenantId, TenantDeletionList>,
/// Avoid having to walk `tenants` to calculate the number of keys in
/// the nested deletion lists
@@ -266,7 +300,7 @@ impl DeletionList {
/// deletion list.
fn push(
&mut self,
tenant: &TenantShardId,
tenant: &TenantId,
timeline: &TimelineId,
generation: Generation,
objects: &mut Vec<RemotePath>,
@@ -358,7 +392,7 @@ struct TenantLsnState {
#[derive(Default)]
struct VisibleLsnUpdates {
tenants: HashMap<TenantShardId, TenantLsnState>,
tenants: HashMap<TenantId, TenantLsnState>,
}
impl VisibleLsnUpdates {
@@ -415,7 +449,7 @@ impl DeletionQueueClient {
pub(crate) fn recover(
&self,
attached_tenants: HashMap<TenantShardId, Generation>,
attached_tenants: HashMap<TenantId, Generation>,
) -> Result<(), DeletionQueueError> {
self.do_push(
&self.tx,
@@ -432,7 +466,7 @@ impl DeletionQueueClient {
/// backend will later wake up and notice that the tenant's generation requires validation.
pub(crate) async fn update_remote_consistent_lsn(
&self,
tenant_shard_id: TenantShardId,
tenant_id: TenantId,
timeline_id: TimelineId,
current_generation: Generation,
lsn: Lsn,
@@ -443,13 +477,10 @@ impl DeletionQueueClient {
.write()
.expect("Lock should never be poisoned");
let tenant_entry = locked
.tenants
.entry(tenant_shard_id)
.or_insert(TenantLsnState {
timelines: HashMap::new(),
generation: current_generation,
});
let tenant_entry = locked.tenants.entry(tenant_id).or_insert(TenantLsnState {
timelines: HashMap::new(),
generation: current_generation,
});
if tenant_entry.generation != current_generation {
// Generation might have changed if we were detached and then re-attached: in this case,
@@ -476,7 +507,7 @@ impl DeletionQueueClient {
/// generations in `layers` are the generations in which those layers were written.
pub(crate) async fn push_layers(
&self,
tenant_shard_id: TenantShardId,
tenant_id: TenantId,
timeline_id: TimelineId,
current_generation: Generation,
layers: Vec<(LayerFileName, LayerFileMetadata)>,
@@ -487,7 +518,7 @@ impl DeletionQueueClient {
let mut layer_paths = Vec::new();
for (layer, meta) in layers {
layer_paths.push(remote_layer_path(
&tenant_shard_id.tenant_id,
&tenant_id,
&timeline_id,
meta.shard,
&layer,
@@ -498,7 +529,7 @@ impl DeletionQueueClient {
return self.flush_immediate().await;
}
self.push_layers_sync(tenant_shard_id, timeline_id, current_generation, layers)
self.push_layers_sync(tenant_id, timeline_id, current_generation, layers)
}
/// When a Tenant has a generation, push_layers is always synchronous because
@@ -508,7 +539,7 @@ impl DeletionQueueClient {
/// support (`<https://github.com/neondatabase/neon/issues/5395>`)
pub(crate) fn push_layers_sync(
&self,
tenant_shard_id: TenantShardId,
tenant_id: TenantId,
timeline_id: TimelineId,
current_generation: Generation,
layers: Vec<(LayerFileName, LayerFileMetadata)>,
@@ -519,7 +550,7 @@ impl DeletionQueueClient {
self.do_push(
&self.tx,
ListWriterQueueMessage::Delete(DeletionOp {
tenant_shard_id,
tenant_id,
timeline_id,
layers,
generation: current_generation,
@@ -787,12 +818,12 @@ mod test {
}
fn set_latest_generation(&self, gen: Generation) {
let tenant_shard_id = self.harness.tenant_shard_id;
let tenant_id = self.harness.tenant_id;
self.mock_control_plane
.latest_generation
.lock()
.unwrap()
.insert(tenant_shard_id, gen);
.insert(tenant_id, gen);
}
/// Returns remote layer file name, suitable for use in assert_remote_files
@@ -801,8 +832,8 @@ mod test {
file_name: LayerFileName,
gen: Generation,
) -> anyhow::Result<String> {
let tenant_shard_id = self.harness.tenant_shard_id;
let relative_remote_path = remote_timeline_path(&tenant_shard_id, &TIMELINE_ID);
let tenant_id = self.harness.tenant_id;
let relative_remote_path = remote_timeline_path(&tenant_id, &TIMELINE_ID);
let remote_timeline_path = self.remote_fs_dir.join(relative_remote_path.get_path());
std::fs::create_dir_all(&remote_timeline_path)?;
let remote_layer_file_name = format!("{}{}", file_name, gen.get_suffix());
@@ -820,7 +851,7 @@ mod test {
#[derive(Debug, Clone)]
struct MockControlPlane {
pub latest_generation: std::sync::Arc<std::sync::Mutex<HashMap<TenantShardId, Generation>>>,
pub latest_generation: std::sync::Arc<std::sync::Mutex<HashMap<TenantId, Generation>>>,
}
impl MockControlPlane {
@@ -834,20 +865,20 @@ mod test {
#[async_trait::async_trait]
impl ControlPlaneGenerationsApi for MockControlPlane {
#[allow(clippy::diverging_sub_expression)] // False positive via async_trait
async fn re_attach(&self) -> Result<HashMap<TenantShardId, Generation>, RetryForeverError> {
async fn re_attach(&self) -> Result<HashMap<TenantId, Generation>, RetryForeverError> {
unimplemented!()
}
async fn validate(
&self,
tenants: Vec<(TenantShardId, Generation)>,
) -> Result<HashMap<TenantShardId, bool>, RetryForeverError> {
tenants: Vec<(TenantId, Generation)>,
) -> Result<HashMap<TenantId, bool>, RetryForeverError> {
let mut result = HashMap::new();
let latest_generation = self.latest_generation.lock().unwrap();
for (tenant_shard_id, generation) in tenants {
if let Some(latest) = latest_generation.get(&tenant_shard_id) {
result.insert(tenant_shard_id, *latest == generation);
for (tenant_id, generation) in tenants {
if let Some(latest) = latest_generation.get(&tenant_id) {
result.insert(tenant_id, *latest == generation);
}
}
@@ -951,10 +982,10 @@ mod test {
client.recover(HashMap::new())?;
let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
let tenant_shard_id = ctx.harness.tenant_shard_id;
let tenant_id = ctx.harness.tenant_id;
let content: Vec<u8> = "victim1 contents".into();
let relative_remote_path = remote_timeline_path(&tenant_shard_id, &TIMELINE_ID);
let relative_remote_path = remote_timeline_path(&tenant_id, &TIMELINE_ID);
let remote_timeline_path = ctx.remote_fs_dir.join(relative_remote_path.get_path());
let deletion_prefix = ctx.harness.conf.deletion_prefix();
@@ -984,7 +1015,7 @@ mod test {
info!("Pushing");
client
.push_layers(
tenant_shard_id,
tenant_id,
TIMELINE_ID,
now_generation,
[(layer_file_name_1.clone(), layer_metadata)].to_vec(),
@@ -1031,8 +1062,8 @@ mod test {
ctx.set_latest_generation(latest_generation);
let tenant_shard_id = ctx.harness.tenant_shard_id;
let relative_remote_path = remote_timeline_path(&tenant_shard_id, &TIMELINE_ID);
let tenant_id = ctx.harness.tenant_id;
let relative_remote_path = remote_timeline_path(&tenant_id, &TIMELINE_ID);
let remote_timeline_path = ctx.remote_fs_dir.join(relative_remote_path.get_path());
// Initial state: a remote layer exists
@@ -1042,7 +1073,7 @@ mod test {
tracing::debug!("Pushing...");
client
.push_layers(
tenant_shard_id,
tenant_id,
TIMELINE_ID,
stale_generation,
[(EXAMPLE_LAYER_NAME.clone(), layer_metadata.clone())].to_vec(),
@@ -1057,7 +1088,7 @@ mod test {
tracing::debug!("Pushing...");
client
.push_layers(
tenant_shard_id,
tenant_id,
TIMELINE_ID,
latest_generation,
[(EXAMPLE_LAYER_NAME.clone(), layer_metadata.clone())].to_vec(),
@@ -1079,9 +1110,9 @@ mod test {
let client = ctx.deletion_queue.new_client();
client.recover(HashMap::new())?;
let tenant_shard_id = ctx.harness.tenant_shard_id;
let tenant_id = ctx.harness.tenant_id;
let relative_remote_path = remote_timeline_path(&tenant_shard_id, &TIMELINE_ID);
let relative_remote_path = remote_timeline_path(&tenant_id, &TIMELINE_ID);
let remote_timeline_path = ctx.remote_fs_dir.join(relative_remote_path.get_path());
let deletion_prefix = ctx.harness.conf.deletion_prefix();
@@ -1097,7 +1128,7 @@ mod test {
ctx.write_remote_layer(EXAMPLE_LAYER_NAME, layer_generation)?;
client
.push_layers(
tenant_shard_id,
tenant_id,
TIMELINE_ID,
now_generation.previous(),
[(EXAMPLE_LAYER_NAME.clone(), layer_metadata.clone())].to_vec(),
@@ -1111,7 +1142,7 @@ mod test {
ctx.write_remote_layer(EXAMPLE_LAYER_NAME_ALT, layer_generation)?;
client
.push_layers(
tenant_shard_id,
tenant_id,
TIMELINE_ID,
now_generation,
[(EXAMPLE_LAYER_NAME_ALT.clone(), layer_metadata.clone())].to_vec(),
@@ -1142,7 +1173,7 @@ mod test {
drop(client);
ctx.restart().await;
let client = ctx.deletion_queue.new_client();
client.recover(HashMap::from([(tenant_shard_id, now_generation)]))?;
client.recover(HashMap::from([(tenant_id, now_generation)]))?;
info!("Flush-executing");
client.flush_execute().await?;
@@ -1206,7 +1237,7 @@ pub(crate) mod mock {
let mut objects = op.objects;
for (layer, meta) in op.layers {
objects.push(remote_layer_path(
&op.tenant_shard_id.tenant_id,
&op.tenant_id,
&op.timeline_id,
meta.shard,
&layer,
@@ -1290,34 +1321,4 @@ pub(crate) mod mock {
}
}
}
/// Test round-trip serialization/deserialization, and test stability of the format
/// vs. a static expected string for the serialized version.
#[test]
fn deletion_list_serialization() -> anyhow::Result<()> {
let tenant_id = "ad6c1a56f5680419d3a16ff55d97ec3c"
.to_string()
.parse::<TenantShardId>()?;
let timeline_id = "be322c834ed9e709e63b5c9698691910"
.to_string()
.parse::<TimelineId>()?;
let generation = Generation::new(123);
let object =
RemotePath::from_string(&format!("tenants/{tenant_id}/timelines/{timeline_id}/foo"))?;
let mut objects = [object].to_vec();
let mut example = DeletionList::new(1);
example.push(&tenant_id, &timeline_id, generation, &mut objects);
let encoded = serde_json::to_string(&example)?;
let expected = "{\"version\":1,\"sequence\":1,\"tenants\":{\"ad6c1a56f5680419d3a16ff55d97ec3c\":{\"timelines\":{\"be322c834ed9e709e63b5c9698691910\":[\"foo\"]},\"generation\":123}},\"size\":1}".to_string();
assert_eq!(encoded, expected);
let decoded = serde_json::from_str::<DeletionList>(&encoded)?;
assert_eq!(example, decoded);
Ok(())
}
}

View File

@@ -19,7 +19,6 @@ use std::collections::HashMap;
use std::fs::create_dir_all;
use std::time::Duration;
use pageserver_api::shard::TenantShardId;
use regex::Regex;
use remote_storage::RemotePath;
use tokio_util::sync::CancellationToken;
@@ -27,6 +26,7 @@ use tracing::debug;
use tracing::info;
use tracing::warn;
use utils::generation::Generation;
use utils::id::TenantId;
use utils::id::TimelineId;
use crate::config::PageServerConf;
@@ -54,7 +54,7 @@ const FRONTEND_FLUSHING_TIMEOUT: Duration = Duration::from_millis(100);
#[derive(Debug)]
pub(super) struct DeletionOp {
pub(super) tenant_shard_id: TenantShardId,
pub(super) tenant_id: TenantId,
pub(super) timeline_id: TimelineId,
// `layers` and `objects` are both just lists of objects. `layers` is used if you do not
// have a config object handy to project it to a remote key, and need the consuming worker
@@ -62,14 +62,14 @@ pub(super) struct DeletionOp {
pub(super) layers: Vec<(LayerFileName, LayerFileMetadata)>,
pub(super) objects: Vec<RemotePath>,
/// The _current_ generation of the Tenant shard attachment in which we are enqueuing
/// The _current_ generation of the Tenant attachment in which we are enqueuing
/// this deletion.
pub(super) generation: Generation,
}
#[derive(Debug)]
pub(super) struct RecoverOp {
pub(super) attached_tenants: HashMap<TenantShardId, Generation>,
pub(super) attached_tenants: HashMap<TenantId, Generation>,
}
#[derive(Debug)]
@@ -206,7 +206,7 @@ impl ListWriter {
async fn recover(
&mut self,
attached_tenants: HashMap<TenantShardId, Generation>,
attached_tenants: HashMap<TenantId, Generation>,
) -> Result<(), anyhow::Error> {
debug!(
"recovering with {} attached tenants",
@@ -309,8 +309,8 @@ impl ListWriter {
// generation was issued to another node in the interval while we restarted,
// then we may treat deletion lists from the previous generation as if they
// belong to our currently attached generation, and proceed to validate & execute.
for (tenant_shard_id, tenant_list) in &mut deletion_list.tenants {
if let Some(attached_gen) = attached_tenants.get(tenant_shard_id) {
for (tenant_id, tenant_list) in &mut deletion_list.tenants {
if let Some(attached_gen) = attached_tenants.get(tenant_id) {
if attached_gen.previous() == tenant_list.generation {
tenant_list.generation = *attached_gen;
}
@@ -390,7 +390,7 @@ impl ListWriter {
let mut layer_paths = Vec::new();
for (layer, meta) in op.layers {
layer_paths.push(remote_layer_path(
&op.tenant_shard_id.tenant_id,
&op.tenant_id,
&op.timeline_id,
meta.shard,
&layer,
@@ -400,14 +400,14 @@ impl ListWriter {
layer_paths.extend(op.objects);
if !self.pending.push(
&op.tenant_shard_id,
&op.tenant_id,
&op.timeline_id,
op.generation,
&mut layer_paths,
) {
self.flush().await;
let retry_succeeded = self.pending.push(
&op.tenant_shard_id,
&op.tenant_id,
&op.timeline_id,
op.generation,
&mut layer_paths,

View File

@@ -310,8 +310,8 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
.unwrap()
.as_micros(),
partition,
candidate.timeline.tenant_shard_id,
candidate.timeline.timeline_id,
desc.tenant_id,
desc.timeline_id,
candidate.layer,
);
}
@@ -380,7 +380,7 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
let limit = Arc::new(tokio::sync::Semaphore::new(1000.max(max_batch_size)));
for (timeline, batch) in batched {
let tenant_shard_id = timeline.tenant_shard_id;
let tenant_id = timeline.tenant_id;
let timeline_id = timeline.timeline_id;
let batch_size =
u32::try_from(batch.len()).expect("batch size limited to u32::MAX during partitioning");
@@ -431,7 +431,7 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
(evicted_bytes, evictions_failed)
}
}
.instrument(tracing::info_span!("evict_batch", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id, batch_size));
.instrument(tracing::info_span!("evict_batch", %tenant_id, %timeline_id, batch_size));
js.spawn(evict);
@@ -572,7 +572,7 @@ async fn collect_eviction_candidates(
continue;
}
let info = tl.get_local_layers_for_disk_usage_eviction().await;
debug!(tenant_id=%tl.tenant_shard_id.tenant_id, shard_id=%tl.tenant_shard_id.shard_slug(), timeline_id=%tl.timeline_id, "timeline resident layers count: {}", info.resident_layers.len());
debug!(tenant_id=%tl.tenant_id, timeline_id=%tl.timeline_id, "timeline resident layers count: {}", info.resident_layers.len());
tenant_candidates.extend(
info.resident_layers
.into_iter()

View File

@@ -624,99 +624,6 @@ paths:
$ref: "#/components/schemas/ServiceUnavailableError"
/v1/tenant/{tenant_id}/location_config:
parameters:
- name: tenant_id
in: path
required: true
schema:
type: string
format: hex
- name: flush_ms
in: query
required: false
schema:
type: integer
put:
description: |
Configures a _tenant location_, that is how a particular pageserver handles
a particular tenant. This includes _attached_ tenants, i.e. those ingesting WAL
and page service requests, and _secondary_ tenants, i.e. those which are just keeping
a warm cache in anticipation of transitioning to attached state in the future.
This is a declarative, idempotent API: there are not separate endpoints
for different tenant location configurations. Rather, this single endpoint accepts
a description of the desired location configuration, and makes whatever changes
are required to reach that state.
In imperative terms, this API is used to attach and detach tenants, and
to transition tenants to and from secondary mode.
This is a synchronous API: there is no 202 response. State transitions should always
be fast (milliseconds), with the exception of requests setting `flush_ms`, in which case
the caller controls the runtime of the request.
In some state transitions, it makes sense to flush dirty data to remote storage: this includes transitions
to AttachedStale and Detached. Flushing is never necessary for correctness, but is an
important optimization when doing migrations. The `flush_ms` parameter controls whether
flushing should be attempted, and how much time is allowed for flushing. If the time limit expires,
the requested transition will continue without waiting for any outstanding data to flush. Callers
should use a duration which is substantially less than their HTTP client's request
timeout. It is safe to supply flush_ms irrespective of the request body: in state transitions
where flushing doesn't make sense, the server will ignore it.
It is safe to retry requests, but if one receives a 409 or 503 response, it is not
useful to retry aggressively: there is probably an existing request still ongoing.
requestBody:
required: false
content:
application/json:
schema:
$ref: "#/components/schemas/TenantLocationConfigRequest"
responses:
"200":
description: Tenant is now in requested state
"503":
description: Tenant's state cannot be changed right now. Wait a few seconds and retry.
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"401":
description: Unauthorized Error
content:
application/json:
schema:
$ref: "#/components/schemas/UnauthorizedError"
"403":
description: Forbidden Error
content:
application/json:
schema:
$ref: "#/components/schemas/ForbiddenError"
"409":
description: |
The tenant is already known to Pageserver in some way,
and hence this `/attach` call has been rejected.
Some examples of how this can happen:
- tenant was created on this pageserver
- tenant attachment was started by an earlier call to `/attach`.
Callers should poll the tenant status's `attachment_status` field,
like for status 202. See the longer description for `POST /attach`
for details.
content:
application/json:
schema:
$ref: "#/components/schemas/ConflictError"
"500":
description: Generic operation error
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
/v1/tenant/{tenant_id}/detach:
parameters:
- name: tenant_id
@@ -1028,9 +935,6 @@ paths:
format: hex
pg_version:
type: integer
existing_initdb_timeline_id:
type: string
format: hex
responses:
"201":
description: TimelineInfo
@@ -1370,31 +1274,6 @@ components:
tenant_id:
type: string
format: hex
TenantLocationConfigRequest:
type: object
required:
- tenant_id
properties:
tenant_id:
type: string
format: hex
mode:
type: string
enum: ["AttachedSingle", "AttachedMulti", "AttachedStale", "Secondary", "Detached"]
description: Mode of functionality that this pageserver will run in for this tenant.
generation:
type: integer
description: Attachment generation number, mandatory when `mode` is an attached state
secondary_conf:
$ref: '#/components/schemas/SecondaryConfig'
tenant_conf:
$ref: '#/components/schemas/TenantConfig'
SecondaryConfig:
type: object
properties:
warm:
type: boolean
description: Whether to poll remote storage for layers to download. If false, secondary locations don't download anything.
TenantConfig:
type: object
properties:

View File

@@ -4,7 +4,6 @@
use std::collections::HashMap;
use std::str::FromStr;
use std::sync::Arc;
use std::time::Duration;
use anyhow::{anyhow, Context, Result};
use enumset::EnumSet;
@@ -338,8 +337,13 @@ async fn build_timeline_info_common(
Lsn(0) => None,
lsn @ Lsn(_) => Some(lsn),
};
let current_logical_size =
timeline.get_current_logical_size(tenant::timeline::GetLogicalSizePriority::User, ctx);
let current_logical_size = match timeline.get_current_logical_size(ctx) {
Ok((size, _)) => Some(size),
Err(err) => {
error!("Timeline info creation failed to get current logical size: {err:?}");
None
}
};
let current_physical_size = Some(timeline.layer_size_sum().await);
let state = timeline.current_state();
let remote_consistent_lsn_projected = timeline
@@ -352,8 +356,7 @@ async fn build_timeline_info_common(
let walreceiver_status = timeline.walreceiver_status();
let info = TimelineInfo {
// TODO(sharding): add a shard_id field, or make tenant_id into a tenant_shard_id
tenant_id: timeline.tenant_shard_id.tenant_id,
tenant_id: timeline.tenant_id,
timeline_id: timeline.timeline_id,
ancestor_timeline_id,
ancestor_lsn,
@@ -363,11 +366,7 @@ async fn build_timeline_info_common(
last_record_lsn,
prev_record_lsn: Some(timeline.get_prev_record_lsn()),
latest_gc_cutoff_lsn: *timeline.get_latest_gc_cutoff_lsn(),
current_logical_size: current_logical_size.size_dont_care_about_accuracy(),
current_logical_size_is_accurate: match current_logical_size.accuracy() {
tenant::timeline::logical_size::Accuracy::Approximate => false,
tenant::timeline::logical_size::Accuracy::Exact => true,
},
current_logical_size,
current_physical_size,
current_logical_size_non_incremental: None,
timeline_dir_layer_file_size_sum: None,
@@ -440,7 +439,6 @@ async fn timeline_create_handler(
request_data.ancestor_timeline_id.map(TimelineId::from),
request_data.ancestor_start_lsn,
request_data.pg_version.unwrap_or(crate::DEFAULT_PG_VERSION),
request_data.existing_initdb_timeline_id,
state.broker_client.clone(),
&ctx,
)
@@ -709,26 +707,6 @@ async fn tenant_detach_handler(
json_response(StatusCode::OK, ())
}
async fn tenant_reset_handler(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
let drop_cache: Option<bool> = parse_query_param(&request, "drop_cache")?;
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
let state = get_state(&request);
state
.tenant_manager
.reset_tenant(tenant_shard_id, drop_cache.unwrap_or(false), ctx)
.await
.map_err(ApiError::InternalServerError)?;
json_response(StatusCode::OK, ())
}
async fn tenant_load_handler(
mut request: Request<Body>,
_cancel: CancellationToken,
@@ -844,7 +822,7 @@ async fn tenant_delete_handler(
mgr::delete_tenant(state.conf, state.remote_storage.clone(), tenant_shard_id)
.instrument(info_span!("tenant_delete_handler",
tenant_id = %tenant_shard_id.tenant_id,
shard = %tenant_shard_id.shard_slug()
shard = tenant_shard_id.shard_slug()
))
.await?;
@@ -1179,7 +1157,6 @@ async fn put_tenant_location_config_handler(
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
let request_data: TenantLocationConfigRequest = json_request(&mut request).await?;
let flush = parse_query_param(&request, "flush_ms")?.map(Duration::from_millis);
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
@@ -1193,7 +1170,7 @@ async fn put_tenant_location_config_handler(
mgr::detach_tenant(conf, tenant_shard_id, true, &state.deletion_queue_client)
.instrument(info_span!("tenant_detach",
tenant_id = %tenant_shard_id.tenant_id,
shard = %tenant_shard_id.shard_slug()
shard = tenant_shard_id.shard_slug()
))
.await
{
@@ -1212,7 +1189,7 @@ async fn put_tenant_location_config_handler(
state
.tenant_manager
.upsert_location(tenant_shard_id, location_conf, flush, &ctx)
.upsert_location(tenant_shard_id, location_conf, &ctx)
.await
// TODO: badrequest assumes the caller was asking for something unreasonable, but in
// principle we might have hit something like concurrent API calls to the same tenant,
@@ -1848,9 +1825,6 @@ pub fn make_router(
.post("/v1/tenant/:tenant_id/detach", |r| {
api_handler(r, tenant_detach_handler)
})
.post("/v1/tenant/:tenant_shard_id/reset", |r| {
api_handler(r, tenant_reset_handler)
})
.post("/v1/tenant/:tenant_id/load", |r| {
api_handler(r, tenant_load_handler)
})

View File

@@ -7,13 +7,12 @@ use std::pin::Pin;
use std::task::{self, Poll};
use anyhow::{bail, ensure, Context, Result};
use async_compression::tokio::bufread::ZstdDecoder;
use async_compression::{tokio::write::ZstdEncoder, zstd::CParameter, Level};
use bytes::Bytes;
use camino::Utf8Path;
use futures::StreamExt;
use nix::NixPath;
use tokio::io::{AsyncBufRead, AsyncRead, AsyncReadExt, AsyncWrite, AsyncWriteExt};
use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, AsyncWriteExt};
use tokio_tar::Archive;
use tokio_tar::Builder;
use tokio_tar::HeaderMode;
@@ -733,13 +732,3 @@ pub async fn create_tar_zst(pgdata_path: &Utf8Path) -> Result<Vec<u8>> {
}
Ok(compressed.buf)
}
pub async fn extract_tar_zst(
pgdata_path: &Utf8Path,
tar_zst: impl AsyncBufRead + Unpin,
) -> Result<()> {
let tar = Box::pin(ZstdDecoder::new(tar_zst));
let mut archive = Archive::new(tar);
archive.unpack(pgdata_path).await?;
Ok(())
}

View File

@@ -186,6 +186,13 @@ pub struct InitializationOrder {
/// Each initial tenant load task carries this until completion.
pub initial_tenant_load: Option<utils::completion::Completion>,
/// Barrier for when we can start initial logical size calculations.
pub initial_logical_size_can_start: utils::completion::Barrier,
/// Each timeline owns a clone of this to be consumed on the initial logical size calculation
/// attempt. It is important to drop this once the attempt has completed.
pub initial_logical_size_attempt: Option<utils::completion::Completion>,
/// Barrier for when we can start any background jobs.
///
/// This can be broken up later on, but right now there is just one class of a background job.
@@ -205,7 +212,7 @@ async fn timed<Fut: std::future::Future>(
match tokio::time::timeout(warn_at, &mut fut).await {
Ok(ret) => {
tracing::info!(
stage = name,
task = name,
elapsed_ms = started.elapsed().as_millis(),
"completed"
);
@@ -213,7 +220,7 @@ async fn timed<Fut: std::future::Future>(
}
Err(_) => {
tracing::info!(
stage = name,
task = name,
elapsed_ms = started.elapsed().as_millis(),
"still waiting, taking longer than expected..."
);
@@ -222,7 +229,7 @@ async fn timed<Fut: std::future::Future>(
// this has a global allowed_errors
tracing::warn!(
stage = name,
task = name,
elapsed_ms = started.elapsed().as_millis(),
"completed, took longer than expected"
);

View File

@@ -7,7 +7,6 @@ use metrics::{
HistogramVec, IntCounter, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
};
use once_cell::sync::Lazy;
use pageserver_api::shard::TenantShardId;
use strum::{EnumCount, IntoEnumIterator, VariantNames};
use strum_macros::{EnumVariantNames, IntoStaticStr};
use utils::id::{TenantId, TimelineId};
@@ -403,129 +402,6 @@ static CURRENT_LOGICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
.expect("failed to define current logical size metric")
});
pub(crate) mod initial_logical_size {
use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec};
use once_cell::sync::Lazy;
pub(crate) struct StartCalculation(IntCounterVec);
pub(crate) static START_CALCULATION: Lazy<StartCalculation> = Lazy::new(|| {
StartCalculation(
register_int_counter_vec!(
"pageserver_initial_logical_size_start_calculation",
"Incremented each time we start an initial logical size calculation attempt. \
The `circumstances` label provides some additional details.",
&["attempt", "circumstances"]
)
.unwrap(),
)
});
struct DropCalculation {
first: IntCounter,
retry: IntCounter,
}
static DROP_CALCULATION: Lazy<DropCalculation> = Lazy::new(|| {
let vec = register_int_counter_vec!(
"pageserver_initial_logical_size_drop_calculation",
"Incremented each time we abort a started size calculation attmpt.",
&["attempt"]
)
.unwrap();
DropCalculation {
first: vec.with_label_values(&["first"]),
retry: vec.with_label_values(&["retry"]),
}
});
pub(crate) struct Calculated {
pub(crate) births: IntCounter,
pub(crate) deaths: IntCounter,
}
pub(crate) static CALCULATED: Lazy<Calculated> = Lazy::new(|| Calculated {
births: register_int_counter!(
"pageserver_initial_logical_size_finish_calculation",
"Incremented every time we finish calculation of initial logical size.\
If everything is working well, this should happen at most once per Timeline object."
)
.unwrap(),
deaths: register_int_counter!(
"pageserver_initial_logical_size_drop_finished_calculation",
"Incremented when we drop a finished initial logical size calculation result.\
Mainly useful to turn pageserver_initial_logical_size_finish_calculation into a gauge."
)
.unwrap(),
});
pub(crate) struct OngoingCalculationGuard {
inc_drop_calculation: Option<IntCounter>,
}
#[derive(strum_macros::IntoStaticStr)]
pub(crate) enum StartCircumstances {
EmptyInitial,
SkippedConcurrencyLimiter,
AfterBackgroundTasksRateLimit,
}
impl StartCalculation {
pub(crate) fn first(&self, circumstances: StartCircumstances) -> OngoingCalculationGuard {
let circumstances_label: &'static str = circumstances.into();
self.0.with_label_values(&["first", circumstances_label]);
OngoingCalculationGuard {
inc_drop_calculation: Some(DROP_CALCULATION.first.clone()),
}
}
pub(crate) fn retry(&self, circumstances: StartCircumstances) -> OngoingCalculationGuard {
let circumstances_label: &'static str = circumstances.into();
self.0.with_label_values(&["retry", circumstances_label]);
OngoingCalculationGuard {
inc_drop_calculation: Some(DROP_CALCULATION.retry.clone()),
}
}
}
impl Drop for OngoingCalculationGuard {
fn drop(&mut self) {
if let Some(counter) = self.inc_drop_calculation.take() {
counter.inc();
}
}
}
impl OngoingCalculationGuard {
pub(crate) fn calculation_result_saved(mut self) -> FinishedCalculationGuard {
drop(self.inc_drop_calculation.take());
CALCULATED.births.inc();
FinishedCalculationGuard {
inc_on_drop: CALCULATED.deaths.clone(),
}
}
}
pub(crate) struct FinishedCalculationGuard {
inc_on_drop: IntCounter,
}
impl Drop for FinishedCalculationGuard {
fn drop(&mut self) {
self.inc_on_drop.inc();
}
}
// context: https://github.com/neondatabase/neon/issues/5963
pub(crate) static TIMELINES_WHERE_WALRECEIVER_GOT_APPROXIMATE_SIZE: Lazy<IntCounter> =
Lazy::new(|| {
register_int_counter!(
"pageserver_initial_logical_size_timelines_where_walreceiver_got_approximate_size",
"Counter for the following event: walreceiver calls\
Timeline::get_current_logical_size() and it returns `Approximate` for the first time."
)
.unwrap()
});
}
pub(crate) static TENANT_STATE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
"pageserver_tenant_states_count",
@@ -1376,20 +1252,9 @@ pub(crate) static WAL_REDO_RECORD_COUNTER: Lazy<IntCounter> = Lazy::new(|| {
.unwrap()
});
pub(crate) static WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"pageserver_wal_redo_process_launch_duration",
"Histogram of the duration of successful WalRedoProcess::launch calls",
redo_histogram_time_buckets!(),
)
.expect("failed to define a metric")
});
pub(crate) struct WalRedoProcessCounters {
pub(crate) started: IntCounter,
pub(crate) killed_by_cause: enum_map::EnumMap<WalRedoKillCause, IntCounter>,
pub(crate) active_stderr_logger_tasks_started: IntCounter,
pub(crate) active_stderr_logger_tasks_finished: IntCounter,
}
#[derive(Debug, enum_map::Enum, strum_macros::IntoStaticStr)]
@@ -1413,19 +1278,6 @@ impl Default for WalRedoProcessCounters {
&["cause"],
)
.unwrap();
let active_stderr_logger_tasks_started = register_int_counter!(
"pageserver_walredo_stderr_logger_tasks_started_total",
"Number of active walredo stderr logger tasks that have started",
)
.unwrap();
let active_stderr_logger_tasks_finished = register_int_counter!(
"pageserver_walredo_stderr_logger_tasks_finished_total",
"Number of active walredo stderr logger tasks that have finished",
)
.unwrap();
Self {
started,
killed_by_cause: EnumMap::from_array(std::array::from_fn(|i| {
@@ -1433,8 +1285,6 @@ impl Default for WalRedoProcessCounters {
let cause_str: &'static str = cause.into();
killed.with_label_values(&[cause_str])
})),
active_stderr_logger_tasks_started,
active_stderr_logger_tasks_finished,
}
}
}
@@ -1721,9 +1571,9 @@ pub struct RemoteTimelineClientMetrics {
}
impl RemoteTimelineClientMetrics {
pub fn new(tenant_shard_id: &TenantShardId, timeline_id: &TimelineId) -> Self {
pub fn new(tenant_id: &TenantId, timeline_id: &TimelineId) -> Self {
RemoteTimelineClientMetrics {
tenant_id: tenant_shard_id.tenant_id.to_string(),
tenant_id: tenant_id.to_string(),
timeline_id: timeline_id.to_string(),
calls_unfinished_gauge: Mutex::new(HashMap::default()),
bytes_started_counter: Mutex::new(HashMap::default()),
@@ -2094,8 +1944,6 @@ pub fn preinitialize_metrics() {
// Tenant manager stats
Lazy::force(&TENANT_MANAGER);
Lazy::force(&crate::tenant::storage_layer::layer::LAYER_IMPL_METRICS);
// countervecs
[&BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT]
.into_iter()
@@ -2113,7 +1961,6 @@ pub fn preinitialize_metrics() {
&WAL_REDO_TIME,
&WAL_REDO_RECORDS_HISTOGRAM,
&WAL_REDO_BYTES_HISTOGRAM,
&WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM,
]
.into_iter()
.for_each(|h| {

View File

@@ -53,23 +53,21 @@ use crate::context::{DownloadBehavior, RequestContext};
use crate::import_datadir::import_wal_from_tar;
use crate::metrics;
use crate::metrics::LIVE_CONNECTIONS_COUNT;
use crate::pgdatadir_mapping::rel_block_to_key;
use crate::task_mgr;
use crate::task_mgr::TaskKind;
use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
use crate::tenant::mgr;
use crate::tenant::mgr::get_active_tenant_with_timeout;
use crate::tenant::mgr::GetActiveTenantError;
use crate::tenant::mgr::ShardSelector;
use crate::tenant::Timeline;
use crate::trace::Tracer;
use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
use postgres_ffi::BLCKSZ;
// How long we may wait for a [`TenantSlot::InProgress`]` and/or a [`Tenant`] which
// How long we may block waiting for a [`TenantSlot::InProgress`]` and/or a [`Tenant`] which
// is not yet in state [`TenantState::Active`].
const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(30000);
const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(5000);
/// Read the end of a tar archive.
///
@@ -401,25 +399,18 @@ impl PageServerHandler {
{
debug_assert_current_span_has_tenant_and_timeline_id();
// Note that since one connection may contain getpage requests that target different
// shards (e.g. during splitting when the compute is not yet aware of the split), the tenant
// that we look up here may not be the one that serves all the actual requests: we will double
// check the mapping of key->shard later before calling into Timeline for getpage requests.
// Make request tracer if needed
let tenant = mgr::get_active_tenant_with_timeout(
tenant_id,
ShardSelector::First,
ACTIVE_TENANT_TIMEOUT,
&task_mgr::shutdown_token(),
)
.await?;
// Make request tracer if needed
let mut tracer = if tenant.get_trace_read_requests() {
let connection_id = ConnectionId::generate();
let path =
tenant
.conf
.trace_path(&tenant.tenant_shard_id(), &timeline_id, &connection_id);
let path = tenant
.conf
.trace_path(&tenant_id, &timeline_id, &connection_id);
Some(Tracer::new(path))
} else {
None
@@ -571,7 +562,6 @@ impl PageServerHandler {
info!("creating new timeline");
let tenant = get_active_tenant_with_timeout(
tenant_id,
ShardSelector::Zero,
ACTIVE_TENANT_TIMEOUT,
&task_mgr::shutdown_token(),
)
@@ -634,7 +624,7 @@ impl PageServerHandler {
debug_assert_current_span_has_tenant_and_timeline_id();
let timeline = self
.get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero)
.get_active_tenant_timeline(tenant_id, timeline_id)
.await?;
let last_record_lsn = timeline.get_last_record_lsn();
if last_record_lsn != start_lsn {
@@ -813,49 +803,9 @@ impl PageServerHandler {
}
*/
let key = rel_block_to_key(req.rel, req.blkno);
let page = if timeline.get_shard_identity().is_key_local(&key) {
timeline
.get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest, ctx)
.await?
} else {
// The Tenant shard we looked up at connection start does not hold this particular
// key: look for other shards in this tenant. This scenario occurs if a pageserver
// has multiple shards for the same tenant.
//
// TODO: optimize this (https://github.com/neondatabase/neon/pull/6037)
let timeline = match self
.get_active_tenant_timeline(
timeline.tenant_shard_id.tenant_id,
timeline.timeline_id,
ShardSelector::Page(key),
)
.await
{
Ok(t) => t,
Err(GetActiveTimelineError::Tenant(GetActiveTenantError::NotFound(_))) => {
// We already know this tenant exists in general, because we resolved it at
// start of connection. Getting a NotFound here indicates that the shard containing
// the requested page is not present on this node.
// TODO: this should be some kind of structured error that the client will understand,
// so that it can block until its config is updated: this error is expected in the case
// that the Tenant's shards' placements are being updated and the client hasn't been
// informed yet.
//
// https://github.com/neondatabase/neon/issues/6038
return Err(anyhow::anyhow!("Request routed to wrong shard"));
}
Err(e) => return Err(e.into()),
};
// Take a GateGuard for the duration of this request. If we were using our main Timeline object,
// the GateGuard was already held over the whole connection.
let _timeline_guard = timeline.gate.enter().map_err(|_| QueryError::Shutdown)?;
timeline
.get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest, ctx)
.await?
};
let page = timeline
.get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest, ctx)
.await?;
Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse {
page,
@@ -884,7 +834,7 @@ impl PageServerHandler {
// check that the timeline exists
let timeline = self
.get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero)
.get_active_tenant_timeline(tenant_id, timeline_id)
.await?;
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
if let Some(lsn) = lsn {
@@ -990,11 +940,9 @@ impl PageServerHandler {
&self,
tenant_id: TenantId,
timeline_id: TimelineId,
selector: ShardSelector,
) -> Result<Arc<Timeline>, GetActiveTimelineError> {
let tenant = get_active_tenant_with_timeout(
tenant_id,
selector,
ACTIVE_TENANT_TIMEOUT,
&task_mgr::shutdown_token(),
)
@@ -1168,7 +1116,7 @@ where
self.check_permission(Some(tenant_id))?;
let timeline = self
.get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero)
.get_active_tenant_timeline(tenant_id, timeline_id)
.await?;
let end_of_timeline = timeline.get_last_record_rlsn();
@@ -1355,7 +1303,6 @@ where
let tenant = get_active_tenant_with_timeout(
tenant_id,
ShardSelector::Zero,
ACTIVE_TENANT_TIMEOUT,
&task_mgr::shutdown_token(),
)

View File

@@ -13,7 +13,6 @@ use crate::repository::*;
use crate::walrecord::NeonWalRecord;
use anyhow::Context;
use bytes::{Buf, Bytes};
use pageserver_api::key::is_rel_block_key;
use pageserver_api::reltag::{RelTag, SlruKind};
use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
use postgres_ffi::BLCKSZ;
@@ -283,10 +282,6 @@ impl Timeline {
}
/// Get a list of all existing relations in given tablespace and database.
///
/// # Cancel-Safety
///
/// This method is cancellation-safe.
pub async fn list_rels(
&self,
spcnode: Oid,
@@ -635,10 +630,6 @@ impl Timeline {
///
/// Only relation blocks are counted currently. That excludes metadata,
/// SLRUs, twophase files etc.
///
/// # Cancel-Safety
///
/// This method is cancellation-safe.
pub async fn get_current_logical_size_non_incremental(
&self,
lsn: Lsn,
@@ -1323,7 +1314,7 @@ impl<'a> DatadirModification<'a> {
// Flush relation and SLRU data blocks, keep metadata.
let mut retained_pending_updates = HashMap::new();
for (key, value) in self.pending_updates.drain() {
if is_rel_block_key(&key) || is_slru_block_key(key) {
if is_rel_block_key(key) || is_slru_block_key(key) {
// This bails out on first error without modifying pending_updates.
// That's Ok, cf this function's doc comment.
writer.put(key, self.lsn, &value, ctx).await?;
@@ -1579,7 +1570,7 @@ fn rel_dir_to_key(spcnode: Oid, dbnode: Oid) -> Key {
}
}
pub(crate) fn rel_block_to_key(rel: RelTag, blknum: BlockNumber) -> Key {
fn rel_block_to_key(rel: RelTag, blknum: BlockNumber) -> Key {
Key {
field1: 0x00,
field2: rel.spcnode,
@@ -1778,6 +1769,10 @@ pub fn key_to_rel_block(key: Key) -> anyhow::Result<(RelTag, BlockNumber)> {
})
}
fn is_rel_block_key(key: Key) -> bool {
key.field1 == 0x00 && key.field4 != 0
}
pub fn is_rel_fsm_block_key(key: Key) -> bool {
key.field1 == 0x00 && key.field4 != 0 && key.field5 == FSM_FORKNUM && key.field6 != 0xffffffff
}

File diff suppressed because it is too large Load Diff

View File

@@ -8,12 +8,10 @@
//! We cannot use global or default config instead, because wrong settings
//! may lead to a data loss.
//!
use anyhow::bail;
use anyhow::Context;
use pageserver_api::models;
use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize};
use serde::de::IntoDeserializer;
use serde::{Deserialize, Serialize};
use serde_json::Value;
use std::num::NonZeroU64;
use std::time::Duration;
use utils::generation::Generation;
@@ -523,49 +521,105 @@ impl Default for TenantConf {
}
}
// Helper function to standardize the error messages we produce on bad durations
//
// Intended to be used with anyhow's `with_context`, e.g.:
//
// let value = result.with_context(bad_duration("name", &value))?;
//
fn bad_duration<'a>(field_name: &'static str, value: &'a str) -> impl 'a + Fn() -> String {
move || format!("Cannot parse `{field_name}` duration {value:?}")
}
impl TryFrom<&'_ models::TenantConfig> for TenantConfOpt {
type Error = anyhow::Error;
fn try_from(request_data: &'_ models::TenantConfig) -> Result<Self, Self::Error> {
// Convert the request_data to a JSON Value
let json_value: Value = serde_json::to_value(request_data)?;
let mut tenant_conf = TenantConfOpt::default();
// Create a Deserializer from the JSON Value
let deserializer = json_value.into_deserializer();
if let Some(gc_period) = &request_data.gc_period {
tenant_conf.gc_period = Some(
humantime::parse_duration(gc_period)
.with_context(bad_duration("gc_period", gc_period))?,
);
}
tenant_conf.gc_horizon = request_data.gc_horizon;
tenant_conf.image_creation_threshold = request_data.image_creation_threshold;
// Use serde_path_to_error to deserialize the JSON Value into TenantConfOpt
let tenant_conf: TenantConfOpt = serde_path_to_error::deserialize(deserializer)?;
if let Some(pitr_interval) = &request_data.pitr_interval {
tenant_conf.pitr_interval = Some(
humantime::parse_duration(pitr_interval)
.with_context(bad_duration("pitr_interval", pitr_interval))?,
);
}
if let Some(walreceiver_connect_timeout) = &request_data.walreceiver_connect_timeout {
tenant_conf.walreceiver_connect_timeout = Some(
humantime::parse_duration(walreceiver_connect_timeout).with_context(
bad_duration("walreceiver_connect_timeout", walreceiver_connect_timeout),
)?,
);
}
if let Some(lagging_wal_timeout) = &request_data.lagging_wal_timeout {
tenant_conf.lagging_wal_timeout = Some(
humantime::parse_duration(lagging_wal_timeout)
.with_context(bad_duration("lagging_wal_timeout", lagging_wal_timeout))?,
);
}
if let Some(max_lsn_wal_lag) = request_data.max_lsn_wal_lag {
tenant_conf.max_lsn_wal_lag = Some(max_lsn_wal_lag);
}
if let Some(trace_read_requests) = request_data.trace_read_requests {
tenant_conf.trace_read_requests = Some(trace_read_requests);
}
tenant_conf.checkpoint_distance = request_data.checkpoint_distance;
if let Some(checkpoint_timeout) = &request_data.checkpoint_timeout {
tenant_conf.checkpoint_timeout = Some(
humantime::parse_duration(checkpoint_timeout)
.with_context(bad_duration("checkpoint_timeout", checkpoint_timeout))?,
);
}
tenant_conf.compaction_target_size = request_data.compaction_target_size;
tenant_conf.compaction_threshold = request_data.compaction_threshold;
if let Some(compaction_period) = &request_data.compaction_period {
tenant_conf.compaction_period = Some(
humantime::parse_duration(compaction_period)
.with_context(bad_duration("compaction_period", compaction_period))?,
);
}
if let Some(eviction_policy) = &request_data.eviction_policy {
tenant_conf.eviction_policy = Some(
serde::Deserialize::deserialize(eviction_policy)
.context("parse field `eviction_policy`")?,
);
}
tenant_conf.min_resident_size_override = request_data.min_resident_size_override;
if let Some(evictions_low_residence_duration_metric_threshold) =
&request_data.evictions_low_residence_duration_metric_threshold
{
tenant_conf.evictions_low_residence_duration_metric_threshold = Some(
humantime::parse_duration(evictions_low_residence_duration_metric_threshold)
.with_context(bad_duration(
"evictions_low_residence_duration_metric_threshold",
evictions_low_residence_duration_metric_threshold,
))?,
);
}
tenant_conf.gc_feedback = request_data.gc_feedback;
Ok(tenant_conf)
}
}
impl TryFrom<toml_edit::Item> for TenantConfOpt {
type Error = anyhow::Error;
fn try_from(item: toml_edit::Item) -> Result<Self, Self::Error> {
match item {
toml_edit::Item::Value(value) => {
let d = value.into_deserializer();
return serde_path_to_error::deserialize(d)
.map_err(|e| anyhow::anyhow!("{}: {}", e.path(), e.inner().message()));
}
toml_edit::Item::Table(table) => {
let deserializer = toml_edit::de::Deserializer::new(table.into());
return serde_path_to_error::deserialize(deserializer)
.map_err(|e| anyhow::anyhow!("{}: {}", e.path(), e.inner().message()));
}
_ => {
bail!("expected non-inline table but found {item}")
}
}
}
}
#[cfg(test)]
mod tests {
use super::*;
use models::TenantConfig;
#[test]
fn de_serializing_pageserver_config_omits_empty_values() {
@@ -582,38 +636,4 @@ mod tests {
assert_eq!(json_form, "{\"gc_horizon\":42}");
assert_eq!(small_conf, serde_json::from_str(&json_form).unwrap());
}
#[test]
fn test_try_from_models_tenant_config_err() {
let tenant_config = models::TenantConfig {
lagging_wal_timeout: Some("5a".to_string()),
..TenantConfig::default()
};
let tenant_conf_opt = TenantConfOpt::try_from(&tenant_config);
assert!(
tenant_conf_opt.is_err(),
"Suceeded to convert TenantConfig to TenantConfOpt"
);
let expected_error_str =
"lagging_wal_timeout: invalid value: string \"5a\", expected a duration";
assert_eq!(tenant_conf_opt.unwrap_err().to_string(), expected_error_str);
}
#[test]
fn test_try_from_models_tenant_config_success() {
let tenant_config = models::TenantConfig {
lagging_wal_timeout: Some("5s".to_string()),
..TenantConfig::default()
};
let tenant_conf_opt = TenantConfOpt::try_from(&tenant_config).unwrap();
assert_eq!(
tenant_conf_opt.lagging_wal_timeout,
Some(Duration::from_secs(5))
);
}
}

View File

@@ -2,19 +2,22 @@ use std::sync::Arc;
use anyhow::Context;
use camino::{Utf8Path, Utf8PathBuf};
use pageserver_api::{models::TenantState, shard::TenantShardId};
use pageserver_api::models::TenantState;
use remote_storage::{GenericRemoteStorage, RemotePath};
use tokio::sync::OwnedMutexGuard;
use tokio_util::sync::CancellationToken;
use tracing::{error, instrument, Instrument, Span};
use tracing::{error, instrument, warn, Instrument, Span};
use utils::{backoff, completion, crashsafe, fs_ext, id::TimelineId};
use utils::{
backoff, completion, crashsafe, fs_ext,
id::{TenantId, TimelineId},
};
use crate::{
config::PageServerConf,
context::RequestContext,
task_mgr::{self, TaskKind},
tenant::mgr::{TenantSlot, TenantsMapRemoveResult},
InitializationOrder,
};
use super::{
@@ -56,10 +59,10 @@ type DeletionGuard = tokio::sync::OwnedMutexGuard<DeleteTenantFlow>;
fn remote_tenant_delete_mark_path(
conf: &PageServerConf,
tenant_shard_id: &TenantShardId,
tenant_id: &TenantId,
) -> anyhow::Result<RemotePath> {
let tenant_remote_path = conf
.tenant_path(tenant_shard_id)
.tenant_path(tenant_id)
.strip_prefix(&conf.workdir)
.context("Failed to strip workdir prefix")
.and_then(RemotePath::new)
@@ -70,9 +73,9 @@ fn remote_tenant_delete_mark_path(
async fn create_remote_delete_mark(
conf: &PageServerConf,
remote_storage: &GenericRemoteStorage,
tenant_shard_id: &TenantShardId,
tenant_id: &TenantId,
) -> Result<(), DeleteTenantError> {
let remote_mark_path = remote_tenant_delete_mark_path(conf, tenant_shard_id)?;
let remote_mark_path = remote_tenant_delete_mark_path(conf, tenant_id)?;
let data: &[u8] = &[];
backoff::retry(
@@ -96,9 +99,9 @@ async fn create_remote_delete_mark(
async fn create_local_delete_mark(
conf: &PageServerConf,
tenant_shard_id: &TenantShardId,
tenant_id: &TenantId,
) -> Result<(), DeleteTenantError> {
let marker_path = conf.tenant_deleted_mark_file_path(tenant_shard_id);
let marker_path = conf.tenant_deleted_mark_file_path(tenant_id);
// Note: we're ok to replace existing file.
let _ = std::fs::OpenOptions::new()
@@ -167,10 +170,10 @@ async fn ensure_timelines_dir_empty(timelines_path: &Utf8Path) -> Result<(), Del
async fn remove_tenant_remote_delete_mark(
conf: &PageServerConf,
remote_storage: Option<&GenericRemoteStorage>,
tenant_shard_id: &TenantShardId,
tenant_id: &TenantId,
) -> Result<(), DeleteTenantError> {
if let Some(remote_storage) = remote_storage {
let path = remote_tenant_delete_mark_path(conf, tenant_shard_id)?;
let path = remote_tenant_delete_mark_path(conf, tenant_id)?;
backoff::retry(
|| async { remote_storage.delete(&path).await },
|_e| false,
@@ -189,7 +192,7 @@ async fn remove_tenant_remote_delete_mark(
// Cleanup fs traces: tenant config, timelines dir local delete mark, tenant dir
async fn cleanup_remaining_fs_traces(
conf: &PageServerConf,
tenant_shard_id: &TenantShardId,
tenant_id: &TenantId,
) -> Result<(), DeleteTenantError> {
let rm = |p: Utf8PathBuf, is_dir: bool| async move {
if is_dir {
@@ -201,8 +204,8 @@ async fn cleanup_remaining_fs_traces(
.with_context(|| format!("failed to delete {p}"))
};
rm(conf.tenant_config_path(tenant_shard_id), false).await?;
rm(conf.tenant_location_config_path(tenant_shard_id), false).await?;
rm(conf.tenant_config_path(tenant_id), false).await?;
rm(conf.tenant_location_config_path(tenant_id), false).await?;
fail::fail_point!("tenant-delete-before-remove-timelines-dir", |_| {
Err(anyhow::anyhow!(
@@ -210,7 +213,7 @@ async fn cleanup_remaining_fs_traces(
))?
});
rm(conf.timelines_path(tenant_shard_id), true).await?;
rm(conf.timelines_path(tenant_id), true).await?;
fail::fail_point!("tenant-delete-before-remove-deleted-mark", |_| {
Err(anyhow::anyhow!(
@@ -224,14 +227,14 @@ async fn cleanup_remaining_fs_traces(
// to be reordered later and thus missed if a crash occurs.
// Note that we dont need to sync after mark file is removed
// because we can tolerate the case when mark file reappears on startup.
let tenant_path = &conf.tenant_path(tenant_shard_id);
let tenant_path = &conf.tenant_path(tenant_id);
if tenant_path.exists() {
crashsafe::fsync_async(&conf.tenant_path(tenant_shard_id))
crashsafe::fsync_async(&conf.tenant_path(tenant_id))
.await
.context("fsync_pre_mark_remove")?;
}
rm(conf.tenant_deleted_mark_file_path(tenant_shard_id), false).await?;
rm(conf.tenant_deleted_mark_file_path(tenant_id), false).await?;
fail::fail_point!("tenant-delete-before-remove-tenant-dir", |_| {
Err(anyhow::anyhow!(
@@ -239,7 +242,7 @@ async fn cleanup_remaining_fs_traces(
))?
});
rm(conf.tenant_path(tenant_shard_id), true).await?;
rm(conf.tenant_path(tenant_id), true).await?;
Ok(())
}
@@ -284,8 +287,6 @@ impl DeleteTenantFlow {
) -> Result<(), DeleteTenantError> {
span::debug_assert_current_span_has_tenant_id();
pausable_failpoint!("tenant-delete-before-run");
let mut guard = Self::prepare(&tenant).await?;
if let Err(e) = Self::run_inner(&mut guard, conf, remote_storage.as_ref(), &tenant).await {
@@ -320,7 +321,7 @@ impl DeleteTenantFlow {
// Though sounds scary, different mark name?
// Detach currently uses remove_dir_all so in case of a crash we can end up in a weird state.
if let Some(remote_storage) = &remote_storage {
create_remote_delete_mark(conf, remote_storage, &tenant.tenant_shard_id)
create_remote_delete_mark(conf, remote_storage, &tenant.tenant_id)
.await
.context("remote_mark")?
}
@@ -331,7 +332,7 @@ impl DeleteTenantFlow {
))?
});
create_local_delete_mark(conf, &tenant.tenant_shard_id)
create_local_delete_mark(conf, &tenant.tenant_id)
.await
.context("local delete mark")?;
@@ -373,11 +374,9 @@ impl DeleteTenantFlow {
return Ok(acquire(tenant));
}
let tenant_id = tenant.tenant_id;
// Check local mark first, if its there there is no need to go to s3 to check whether remote one exists.
if conf
.tenant_deleted_mark_file_path(&tenant.tenant_shard_id)
.exists()
{
if conf.tenant_deleted_mark_file_path(&tenant_id).exists() {
Ok(acquire(tenant))
} else {
Ok(None)
@@ -389,6 +388,7 @@ impl DeleteTenantFlow {
tenant: &Arc<Tenant>,
preload: Option<TenantPreload>,
tenants: &'static std::sync::RwLock<TenantsMap>,
init_order: Option<InitializationOrder>,
ctx: &RequestContext,
) -> Result<(), DeleteTenantError> {
let (_, progress) = completion::channel();
@@ -398,7 +398,10 @@ impl DeleteTenantFlow {
.await
.expect("cant be stopping or broken");
tenant.attach(preload, ctx).await.context("attach")?;
tenant
.attach(init_order, preload, ctx)
.await
.context("attach")?;
Self::background(
guard,
@@ -456,12 +459,12 @@ impl DeleteTenantFlow {
tenants: &'static std::sync::RwLock<TenantsMap>,
tenant: Arc<Tenant>,
) {
let tenant_shard_id = tenant.tenant_shard_id;
let tenant_id = tenant.tenant_id;
task_mgr::spawn(
task_mgr::BACKGROUND_RUNTIME.handle(),
TaskKind::TimelineDeletionWorker,
Some(tenant_shard_id.tenant_id),
Some(tenant_id),
None,
"tenant_delete",
false,
@@ -475,7 +478,7 @@ impl DeleteTenantFlow {
Ok(())
}
.instrument({
let span = tracing::info_span!(parent: None, "delete_tenant", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug());
let span = tracing::info_span!(parent: None, "delete_tenant", tenant_id=%tenant_id);
span.follows_from(Span::current());
span
}),
@@ -513,7 +516,7 @@ impl DeleteTenantFlow {
}
}
let timelines_path = conf.timelines_path(&tenant.tenant_shard_id);
let timelines_path = conf.timelines_path(&tenant.tenant_id);
// May not exist if we fail in cleanup_remaining_fs_traces after removing it
if timelines_path.exists() {
// sanity check to guard against layout changes
@@ -522,8 +525,7 @@ impl DeleteTenantFlow {
.context("timelines dir not empty")?;
}
remove_tenant_remote_delete_mark(conf, remote_storage.as_ref(), &tenant.tenant_shard_id)
.await?;
remove_tenant_remote_delete_mark(conf, remote_storage.as_ref(), &tenant.tenant_id).await?;
fail::fail_point!("tenant-delete-before-cleanup-remaining-fs-traces", |_| {
Err(anyhow::anyhow!(
@@ -531,73 +533,21 @@ impl DeleteTenantFlow {
))?
});
cleanup_remaining_fs_traces(conf, &tenant.tenant_shard_id)
cleanup_remaining_fs_traces(conf, &tenant.tenant_id)
.await
.context("cleanup_remaining_fs_traces")?;
{
pausable_failpoint!("tenant-delete-before-map-remove");
let mut locked = tenants.write().unwrap();
if locked.remove(&tenant.tenant_id).is_none() {
warn!("Tenant got removed from tenants map during deletion");
};
// This block is simply removing the TenantSlot for this tenant. It requires a loop because
// we might conflict with a TenantSlot::InProgress marker and need to wait for it.
//
// This complexity will go away when we simplify how deletion works:
// https://github.com/neondatabase/neon/issues/5080
loop {
// Under the TenantMap lock, try to remove the tenant. We usually succeed, but if
// we encounter an InProgress marker, yield the barrier it contains and wait on it.
let barrier = {
let mut locked = tenants.write().unwrap();
let removed = locked.remove(&tenant.tenant_shard_id.tenant_id);
// FIXME: we should not be modifying this from outside of mgr.rs.
// This will go away when we simplify deletion (https://github.com/neondatabase/neon/issues/5080)
crate::metrics::TENANT_MANAGER
.tenant_slots
.set(locked.len() as u64);
match removed {
TenantsMapRemoveResult::Occupied(TenantSlot::Attached(tenant)) => {
match tenant.current_state() {
TenantState::Stopping { .. } | TenantState::Broken { .. } => {
// Expected: we put the tenant into stopping state before we start deleting it
}
state => {
// Unexpected state
tracing::warn!(
"Tenant in unexpected state {state} after deletion"
);
}
}
break;
}
TenantsMapRemoveResult::Occupied(TenantSlot::Secondary) => {
// This is unexpected: this secondary tenants should not have been created, and we
// are not in a position to shut it down from here.
tracing::warn!("Tenant transitioned to secondary mode while deleting!");
break;
}
TenantsMapRemoveResult::Occupied(TenantSlot::InProgress(_)) => {
unreachable!("TenantsMap::remove handles InProgress separately, should never return it here");
}
TenantsMapRemoveResult::Vacant => {
tracing::warn!(
"Tenant removed from TenantsMap before deletion completed"
);
break;
}
TenantsMapRemoveResult::InProgress(barrier) => {
// An InProgress entry was found, we must wait on its barrier
barrier
}
}
};
tracing::info!(
"Waiting for competing operation to complete before deleting state for tenant"
);
barrier.wait().await;
}
// FIXME: we should not be modifying this from outside of mgr.rs.
// This will go away when we simplify deletion (https://github.com/neondatabase/neon/issues/5080)
crate::metrics::TENANT_MANAGER
.tenant_slots
.set(locked.len() as u64);
}
*guard = Self::Finished;

View File

@@ -7,19 +7,18 @@ use crate::page_cache::{self, PAGE_SZ};
use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReader};
use crate::virtual_file::VirtualFile;
use camino::Utf8PathBuf;
use pageserver_api::shard::TenantShardId;
use std::cmp::min;
use std::fs::OpenOptions;
use std::io::{self, ErrorKind};
use std::ops::DerefMut;
use std::sync::atomic::AtomicU64;
use tracing::*;
use utils::id::TimelineId;
use utils::id::{TenantId, TimelineId};
pub struct EphemeralFile {
page_cache_file_id: page_cache::FileId,
_tenant_shard_id: TenantShardId,
_tenant_id: TenantId,
_timeline_id: TimelineId,
file: VirtualFile,
len: u64,
@@ -32,7 +31,7 @@ pub struct EphemeralFile {
impl EphemeralFile {
pub async fn create(
conf: &PageServerConf,
tenant_shard_id: TenantShardId,
tenant_id: TenantId,
timeline_id: TimelineId,
) -> Result<EphemeralFile, io::Error> {
static NEXT_FILENAME: AtomicU64 = AtomicU64::new(1);
@@ -40,7 +39,7 @@ impl EphemeralFile {
NEXT_FILENAME.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
let filename = conf
.timeline_path(&tenant_shard_id, &timeline_id)
.timeline_path(&tenant_id, &timeline_id)
.join(Utf8PathBuf::from(format!(
"ephemeral-{filename_disambiguator}"
)));
@@ -53,7 +52,7 @@ impl EphemeralFile {
Ok(EphemeralFile {
page_cache_file_id: page_cache::next_file_id(),
_tenant_shard_id: tenant_shard_id,
_tenant_id: tenant_id,
_timeline_id: timeline_id,
file,
len: 0,
@@ -283,7 +282,7 @@ mod tests {
) -> Result<
(
&'static PageServerConf,
TenantShardId,
TenantId,
TimelineId,
RequestContext,
),
@@ -296,13 +295,13 @@ mod tests {
// OK in a test.
let conf: &'static PageServerConf = Box::leak(Box::new(conf));
let tenant_shard_id = TenantShardId::from_str("11000000000000000000000000000000").unwrap();
let tenant_id = TenantId::from_str("11000000000000000000000000000000").unwrap();
let timeline_id = TimelineId::from_str("22000000000000000000000000000000").unwrap();
fs::create_dir_all(conf.timeline_path(&tenant_shard_id, &timeline_id))?;
fs::create_dir_all(conf.timeline_path(&tenant_id, &timeline_id))?;
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
Ok((conf, tenant_shard_id, timeline_id, ctx))
Ok((conf, tenant_id, timeline_id, ctx))
}
#[tokio::test]

View File

@@ -11,12 +11,15 @@
use std::io::{self};
use anyhow::{ensure, Context};
use pageserver_api::shard::TenantShardId;
use serde::{de::Error, Deserialize, Serialize, Serializer};
use thiserror::Error;
use utils::bin_ser::SerializeError;
use utils::crashsafe::path_with_suffix_extension;
use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn};
use utils::{
bin_ser::BeSer,
id::{TenantId, TimelineId},
lsn::Lsn,
};
use crate::config::PageServerConf;
use crate::virtual_file::VirtualFile;
@@ -269,14 +272,14 @@ impl Serialize for TimelineMetadata {
}
/// Save timeline metadata to file
#[tracing::instrument(skip_all, fields(%tenant_id=tenant_shard_id.tenant_id, %shard_id=tenant_shard_id.shard_slug(), %timeline_id))]
#[tracing::instrument(skip_all, fields(%tenant_id, %timeline_id))]
pub async fn save_metadata(
conf: &'static PageServerConf,
tenant_shard_id: &TenantShardId,
tenant_id: &TenantId,
timeline_id: &TimelineId,
data: &TimelineMetadata,
) -> anyhow::Result<()> {
let path = conf.metadata_path(tenant_shard_id, timeline_id);
let path = conf.metadata_path(tenant_id, timeline_id);
let temp_path = path_with_suffix_extension(&path, TEMP_FILE_SUFFIX);
let metadata_bytes = data.to_bytes().context("serialize metadata")?;
VirtualFile::crashsafe_overwrite(&path, &temp_path, &metadata_bytes)
@@ -296,10 +299,10 @@ pub enum LoadMetadataError {
pub fn load_metadata(
conf: &'static PageServerConf,
tenant_shard_id: &TenantShardId,
tenant_id: &TenantId,
timeline_id: &TimelineId,
) -> Result<TimelineMetadata, LoadMetadataError> {
let metadata_path = conf.metadata_path(tenant_shard_id, timeline_id);
let metadata_path = conf.metadata_path(tenant_id, timeline_id);
let metadata_bytes = std::fs::read(metadata_path)?;
Ok(TimelineMetadata::from_bytes(&metadata_bytes)?)

View File

@@ -2,8 +2,7 @@
//! page server.
use camino::{Utf8DirEntry, Utf8Path, Utf8PathBuf};
use pageserver_api::key::Key;
use pageserver_api::shard::{ShardIdentity, ShardNumber, TenantShardId};
use pageserver_api::shard::TenantShardId;
use rand::{distributions::Alphanumeric, Rng};
use std::borrow::Cow;
use std::collections::{BTreeMap, HashMap};
@@ -30,9 +29,7 @@ use crate::control_plane_client::{
use crate::deletion_queue::DeletionQueueClient;
use crate::metrics::TENANT_MANAGER as METRICS;
use crate::task_mgr::{self, TaskKind};
use crate::tenant::config::{
AttachedLocationConfig, AttachmentMode, LocationConf, LocationMode, TenantConfOpt,
};
use crate::tenant::config::{AttachmentMode, LocationConf, LocationMode, TenantConfOpt};
use crate::tenant::delete::DeleteTenantFlow;
use crate::tenant::span::debug_assert_current_span_has_tenant_id;
use crate::tenant::{create_tenant_files, AttachedTenantConf, SpawnMode, Tenant, TenantState};
@@ -125,24 +122,6 @@ fn exactly_one_or_none<'a>(
}
}
pub(crate) enum TenantsMapRemoveResult {
Occupied(TenantSlot),
Vacant,
InProgress(utils::completion::Barrier),
}
/// When resolving a TenantId to a shard, we may be looking for the 0th
/// shard, or we might be looking for whichever shard holds a particular page.
pub(crate) enum ShardSelector {
/// Only return the 0th shard, if it is present. If a non-0th shard is present,
/// ignore it.
Zero,
/// Pick the first shard we find for the TenantId
First,
/// Pick the shard that holds this key
Page(Key),
}
impl TenantsMap {
/// Convenience function for typical usage, where we want to get a `Tenant` object, for
/// working with attached tenants. If the TenantId is in the map but in Secondary state,
@@ -157,71 +136,12 @@ impl TenantsMap {
}
}
/// A page service client sends a TenantId, and to look up the correct Tenant we must
/// resolve this to a fully qualified TenantShardId.
fn resolve_shard(
&self,
tenant_id: &TenantId,
selector: ShardSelector,
) -> Option<TenantShardId> {
let mut want_shard = None;
pub(crate) fn remove(&mut self, tenant_id: &TenantId) -> Option<TenantSlot> {
match self {
TenantsMap::Initializing => None,
TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => {
for slot in m.range(TenantShardId::tenant_range(*tenant_id)) {
match selector {
ShardSelector::First => return Some(*slot.0),
ShardSelector::Zero if slot.0.shard_number == ShardNumber(0) => {
return Some(*slot.0)
}
ShardSelector::Page(key) => {
if let Some(tenant) = slot.1.get_attached() {
// First slot we see for this tenant, calculate the expected shard number
// for the key: we will use this for checking if this and subsequent
// slots contain the key, rather than recalculating the hash each time.
if want_shard.is_none() {
want_shard = Some(tenant.shard_identity.get_shard_number(&key));
}
if Some(tenant.shard_identity.number) == want_shard {
return Some(*slot.0);
}
} else {
continue;
}
}
_ => continue,
}
}
// Fall through: we didn't find an acceptable shard
None
}
}
}
/// Only for use from DeleteTenantFlow. This method directly removes a TenantSlot from the map.
///
/// The normal way to remove a tenant is using a SlotGuard, which will gracefully remove the guarded
/// slot if the enclosed tenant is shutdown.
pub(crate) fn remove(&mut self, tenant_id: &TenantId) -> TenantsMapRemoveResult {
use std::collections::btree_map::Entry;
match self {
TenantsMap::Initializing => TenantsMapRemoveResult::Vacant,
TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => {
let key = exactly_one_or_none(m, tenant_id).map(|(k, _)| *k);
match key {
Some(key) => match m.entry(key) {
Entry::Occupied(entry) => match entry.get() {
TenantSlot::InProgress(barrier) => {
TenantsMapRemoveResult::InProgress(barrier.clone())
}
_ => TenantsMapRemoveResult::Occupied(entry.remove()),
},
Entry::Vacant(_entry) => TenantsMapRemoveResult::Vacant,
},
None => TenantsMapRemoveResult::Vacant,
}
key.and_then(|key| m.remove(&key))
}
}
}
@@ -270,6 +190,49 @@ async fn safe_rename_tenant_dir(path: impl AsRef<Utf8Path>) -> std::io::Result<U
static TENANTS: Lazy<std::sync::RwLock<TenantsMap>> =
Lazy::new(|| std::sync::RwLock::new(TenantsMap::Initializing));
/// Create a directory, including parents. This does no fsyncs and makes
/// no guarantees about the persistence of the resulting metadata: for
/// use when creating dirs for use as cache.
async fn unsafe_create_dir_all(path: &Utf8PathBuf) -> std::io::Result<()> {
let mut dirs_to_create = Vec::new();
let mut path: &Utf8Path = path.as_ref();
// Figure out which directories we need to create.
loop {
let meta = tokio::fs::metadata(path).await;
match meta {
Ok(metadata) if metadata.is_dir() => break,
Ok(_) => {
return Err(std::io::Error::new(
std::io::ErrorKind::AlreadyExists,
format!("non-directory found in path: {path}"),
));
}
Err(ref e) if e.kind() == std::io::ErrorKind::NotFound => {}
Err(e) => return Err(e),
}
dirs_to_create.push(path);
match path.parent() {
Some(parent) => path = parent,
None => {
return Err(std::io::Error::new(
std::io::ErrorKind::InvalidInput,
format!("can't find parent of path '{path}'"),
));
}
}
}
// Create directories from parent to child.
for &path in dirs_to_create.iter().rev() {
tokio::fs::create_dir(path).await?;
}
Ok(())
}
/// The TenantManager is responsible for storing and mutating the collection of all tenants
/// that this pageserver process has state for. Every Tenant and SecondaryTenant instance
/// lives inside the TenantManager.
@@ -287,8 +250,8 @@ pub struct TenantManager {
}
fn emergency_generations(
tenant_confs: &HashMap<TenantShardId, anyhow::Result<LocationConf>>,
) -> HashMap<TenantShardId, Generation> {
tenant_confs: &HashMap<TenantId, anyhow::Result<LocationConf>>,
) -> HashMap<TenantId, Generation> {
tenant_confs
.iter()
.filter_map(|(tid, lc)| {
@@ -308,10 +271,10 @@ fn emergency_generations(
async fn init_load_generations(
conf: &'static PageServerConf,
tenant_confs: &HashMap<TenantShardId, anyhow::Result<LocationConf>>,
tenant_confs: &HashMap<TenantId, anyhow::Result<LocationConf>>,
resources: &TenantSharedResources,
cancel: &CancellationToken,
) -> anyhow::Result<Option<HashMap<TenantShardId, Generation>>> {
) -> anyhow::Result<Option<HashMap<TenantId, Generation>>> {
let generations = if conf.control_plane_emergency_mode {
error!(
"Emergency mode! Tenants will be attached unsafely using their last known generation"
@@ -354,7 +317,7 @@ async fn init_load_generations(
fn load_tenant_config(
conf: &'static PageServerConf,
dentry: Utf8DirEntry,
) -> anyhow::Result<Option<(TenantShardId, anyhow::Result<LocationConf>)>> {
) -> anyhow::Result<Option<(TenantId, anyhow::Result<LocationConf>)>> {
let tenant_dir_path = dentry.path().to_path_buf();
if crate::is_temporary(&tenant_dir_path) {
info!("Found temporary tenant directory, removing: {tenant_dir_path}");
@@ -390,10 +353,10 @@ fn load_tenant_config(
return Ok(None);
}
let tenant_shard_id = match tenant_dir_path
let tenant_id = match tenant_dir_path
.file_name()
.unwrap_or_default()
.parse::<TenantShardId>()
.parse::<TenantId>()
{
Ok(id) => id,
Err(_) => {
@@ -403,8 +366,8 @@ fn load_tenant_config(
};
Ok(Some((
tenant_shard_id,
Tenant::load_tenant_config(conf, &tenant_shard_id),
tenant_id,
Tenant::load_tenant_config(conf, &tenant_id),
)))
}
@@ -415,7 +378,7 @@ fn load_tenant_config(
/// seconds even on reasonably fast drives.
async fn init_load_tenant_configs(
conf: &'static PageServerConf,
) -> anyhow::Result<HashMap<TenantShardId, anyhow::Result<LocationConf>>> {
) -> anyhow::Result<HashMap<TenantId, anyhow::Result<LocationConf>>> {
let tenants_dir = conf.tenants_path();
let dentries = tokio::task::spawn_blocking(move || -> anyhow::Result<Vec<Utf8DirEntry>> {
@@ -465,19 +428,19 @@ pub async fn init_tenant_mgr(
init_load_generations(conf, &tenant_configs, &resources, &cancel).await?;
// Construct `Tenant` objects and start them running
for (tenant_shard_id, location_conf) in tenant_configs {
let tenant_dir_path = conf.tenant_path(&tenant_shard_id);
for (tenant_id, location_conf) in tenant_configs {
let tenant_dir_path = conf.tenant_path(&tenant_id);
let mut location_conf = match location_conf {
Ok(l) => l,
Err(e) => {
warn!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Marking tenant broken, failed to {e:#}");
warn!(%tenant_id, "Marking tenant broken, failed to {e:#}");
tenants.insert(
tenant_shard_id,
TenantShardId::unsharded(tenant_id),
TenantSlot::Attached(Tenant::create_broken_tenant(
conf,
tenant_shard_id,
tenant_id,
format!("{}", e),
)),
);
@@ -488,7 +451,7 @@ pub async fn init_tenant_mgr(
let generation = if let Some(generations) = &tenant_generations {
// We have a generation map: treat it as the authority for whether
// this tenant is really attached.
if let Some(gen) = generations.get(&tenant_shard_id) {
if let Some(gen) = generations.get(&tenant_id) {
*gen
} else {
match &location_conf.mode {
@@ -496,8 +459,8 @@ pub async fn init_tenant_mgr(
// We do not require the control plane's permission for secondary mode
// tenants, because they do no remote writes and hence require no
// generation number
info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Loaded tenant in secondary mode");
tenants.insert(tenant_shard_id, TenantSlot::Secondary);
info!(%tenant_id, "Loaded tenant in secondary mode");
tenants.insert(TenantShardId::unsharded(tenant_id), TenantSlot::Secondary);
}
LocationMode::Attached(_) => {
// TODO: augment re-attach API to enable the control plane to
@@ -505,9 +468,9 @@ pub async fn init_tenant_mgr(
// away local state, we can gracefully fall back to secondary here, if the control
// plane tells us so.
// (https://github.com/neondatabase/neon/issues/5377)
info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Detaching tenant, control plane omitted it in re-attach response");
info!(%tenant_id, "Detaching tenant, control plane omitted it in re-attach response");
if let Err(e) = safe_remove_tenant_dir_all(&tenant_dir_path).await {
error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
error!(%tenant_id,
"Failed to remove detached tenant directory '{tenant_dir_path}': {e:?}",
);
}
@@ -519,23 +482,21 @@ pub async fn init_tenant_mgr(
} else {
// Legacy mode: no generation information, any tenant present
// on local disk may activate
info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Starting tenant in legacy mode, no generation",);
info!(%tenant_id, "Starting tenant in legacy mode, no generation",);
Generation::none()
};
// Presence of a generation number implies attachment: attach the tenant
// if it wasn't already, and apply the generation number.
location_conf.attach_in_generation(generation);
Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await?;
Tenant::persist_tenant_config(conf, &tenant_id, &location_conf).await?;
let shard_identity = location_conf.shard;
match tenant_spawn(
conf,
tenant_shard_id,
tenant_id,
&tenant_dir_path,
resources.clone(),
AttachedTenantConf::try_from(location_conf)?,
shard_identity,
Some(init_order.clone()),
&TENANTS,
SpawnMode::Normal,
@@ -548,7 +509,7 @@ pub async fn init_tenant_mgr(
);
}
Err(e) => {
error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Failed to start tenant: {e:#}");
error!(%tenant_id, "Failed to start tenant: {e:#}");
}
}
}
@@ -572,11 +533,10 @@ pub async fn init_tenant_mgr(
#[allow(clippy::too_many_arguments)]
pub(crate) fn tenant_spawn(
conf: &'static PageServerConf,
tenant_shard_id: TenantShardId,
tenant_id: TenantId,
tenant_path: &Utf8Path,
resources: TenantSharedResources,
location_conf: AttachedTenantConf,
shard_identity: ShardIdentity,
init_order: Option<InitializationOrder>,
tenants: &'static std::sync::RwLock<TenantsMap>,
mode: SpawnMode,
@@ -597,25 +557,18 @@ pub(crate) fn tenant_spawn(
"Cannot load tenant from empty directory {tenant_path:?}"
);
let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_shard_id);
let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_id);
anyhow::ensure!(
!conf.tenant_ignore_mark_file_path(&tenant_shard_id).exists(),
!conf.tenant_ignore_mark_file_path(&tenant_id).exists(),
"Cannot load tenant, ignore mark found at {tenant_ignore_mark:?}"
);
info!(
tenant_id = %tenant_shard_id.tenant_id,
shard_id = %tenant_shard_id.shard_slug(),
generation = ?location_conf.location.generation,
attach_mode = ?location_conf.location.attach_mode,
"Attaching tenant"
);
info!("Attaching tenant {tenant_id}");
let tenant = match Tenant::spawn(
conf,
tenant_shard_id,
tenant_id,
resources,
location_conf,
shard_identity,
init_order,
tenants,
mode,
@@ -623,8 +576,8 @@ pub(crate) fn tenant_spawn(
) {
Ok(tenant) => tenant,
Err(e) => {
error!("Failed to spawn tenant {tenant_shard_id}, reason: {e:#}");
Tenant::create_broken_tenant(conf, tenant_shard_id, format!("{e:#}"))
error!("Failed to spawn tenant {tenant_id}, reason: {e:#}");
Tenant::create_broken_tenant(conf, tenant_id, format!("{e:#}"))
}
};
@@ -779,20 +732,19 @@ pub(crate) async fn create_tenant(
ctx: &RequestContext,
) -> Result<Arc<Tenant>, TenantMapInsertError> {
let location_conf = LocationConf::attached_single(tenant_conf, generation);
info!("Creating tenant at location {location_conf:?}");
let slot_guard =
tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustNotExist)?;
let tenant_path = super::create_tenant_files(conf, &location_conf, &tenant_shard_id).await?;
// TODO(sharding): make local paths shard-aware
let tenant_path =
super::create_tenant_files(conf, &location_conf, &tenant_shard_id.tenant_id).await?;
let shard_identity = location_conf.shard;
let created_tenant = tenant_spawn(
conf,
tenant_shard_id,
tenant_shard_id.tenant_id,
&tenant_path,
resources,
AttachedTenantConf::try_from(location_conf)?,
shard_identity,
None,
&TENANTS,
SpawnMode::Create,
@@ -829,9 +781,8 @@ pub(crate) async fn set_new_tenant_config(
// API to use is the location_config/ endpoint, which lets the caller provide
// the full LocationConf.
let location_conf = LocationConf::attached_single(new_tenant_conf, tenant.generation);
let tenant_shard_id = TenantShardId::unsharded(tenant_id);
Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf)
Tenant::persist_tenant_config(conf, &tenant_id, &location_conf)
.await
.map_err(SetNewTenantConfigError::Persist)?;
tenant.set_new_tenant_config(new_tenant_conf);
@@ -885,12 +836,10 @@ impl TenantManager {
Ok(())
}
#[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))]
pub(crate) async fn upsert_location(
&self,
tenant_shard_id: TenantShardId,
new_location_config: LocationConf,
flush: Option<Duration>,
ctx: &RequestContext,
) -> Result<(), anyhow::Error> {
debug_assert_current_span_has_tenant_id();
@@ -899,7 +848,7 @@ impl TenantManager {
// Special case fast-path for updates to Tenant: if our upsert is only updating configuration,
// then we do not need to set the slot to InProgress, we can just call into the
// existng tenant.
let modify_tenant = {
{
let locked = self.tenants.read().unwrap();
let peek_slot =
tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Write)?;
@@ -910,50 +859,22 @@ impl TenantManager {
// take our fast path and just provide the updated configuration
// to the tenant.
tenant.set_new_location_config(AttachedTenantConf::try_from(
new_location_config.clone(),
new_location_config,
)?);
Some(tenant.clone())
// Persist the new config in the background, to avoid holding up any
// locks while we do so.
// TODO
return Ok(());
} else {
// Different generations, fall through to general case
None
}
}
_ => {
// Not an Attached->Attached transition, fall through to general case
None
}
}
};
// Fast-path continued: having dropped out of the self.tenants lock, do the async
// phase of waiting for flush, before returning.
if let Some(tenant) = modify_tenant {
// Transition to AttachedStale means we may well hold a valid generation
// still, and have been requested to go stale as part of a migration. If
// the caller set `flush`, then flush to remote storage.
if let LocationMode::Attached(AttachedLocationConfig {
generation: _,
attach_mode: AttachmentMode::Stale,
}) = &new_location_config.mode
{
if let Some(flush_timeout) = flush {
match tokio::time::timeout(flush_timeout, tenant.flush_remote()).await {
Ok(Err(e)) => {
return Err(e);
}
Ok(Ok(_)) => return Ok(()),
Err(_) => {
tracing::warn!(
timeout_ms = flush_timeout.as_millis(),
"Timed out waiting for flush to remote storage, proceeding anyway."
)
}
}
}
}
return Ok(());
}
// General case for upserts to TenantsMap, excluding the case above: we will substitute an
@@ -992,44 +913,55 @@ impl TenantManager {
slot_guard.drop_old_value().expect("We just shut it down");
}
let tenant_path = self.conf.tenant_path(&tenant_shard_id);
// TODO(sharding): make local paths sharding-aware
let tenant_path = self.conf.tenant_path(&tenant_shard_id.tenant_id);
let new_slot = match &new_location_config.mode {
LocationMode::Secondary(_) => {
// Directory doesn't need to be fsync'd because if we crash it can
// safely be recreated next time this tenant location is configured.
tokio::fs::create_dir_all(&tenant_path)
unsafe_create_dir_all(&tenant_path)
.await
.with_context(|| format!("Creating {tenant_path}"))?;
Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config)
.await
.map_err(SetNewTenantConfigError::Persist)?;
// TODO(sharding): make local paths sharding-aware
Tenant::persist_tenant_config(
self.conf,
&tenant_shard_id.tenant_id,
&new_location_config,
)
.await
.map_err(SetNewTenantConfigError::Persist)?;
TenantSlot::Secondary
}
LocationMode::Attached(_attach_config) => {
let timelines_path = self.conf.timelines_path(&tenant_shard_id);
// TODO(sharding): make local paths sharding-aware
let timelines_path = self.conf.timelines_path(&tenant_shard_id.tenant_id);
// Directory doesn't need to be fsync'd because we do not depend on
// it to exist after crashes: it may be recreated when tenant is
// re-attached, see https://github.com/neondatabase/neon/issues/5550
tokio::fs::create_dir_all(&tenant_path)
unsafe_create_dir_all(&timelines_path)
.await
.with_context(|| format!("Creating {timelines_path}"))?;
Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config)
.await
.map_err(SetNewTenantConfigError::Persist)?;
// TODO(sharding): make local paths sharding-aware
Tenant::persist_tenant_config(
self.conf,
&tenant_shard_id.tenant_id,
&new_location_config,
)
.await
.map_err(SetNewTenantConfigError::Persist)?;
let shard_identity = new_location_config.shard;
// TODO(sharding): make spawn sharding-aware
let tenant = tenant_spawn(
self.conf,
tenant_shard_id,
tenant_shard_id.tenant_id,
&tenant_path,
self.resources.clone(),
AttachedTenantConf::try_from(new_location_config)?,
shard_identity,
None,
self.tenants,
SpawnMode::Normal,
@@ -1044,81 +976,6 @@ impl TenantManager {
Ok(())
}
/// Resetting a tenant is equivalent to detaching it, then attaching it again with the same
/// LocationConf that was last used to attach it. Optionally, the local file cache may be
/// dropped before re-attaching.
///
/// This is not part of a tenant's normal lifecycle: it is used for debug/support, in situations
/// where an issue is identified that would go away with a restart of the tenant.
///
/// This does not have any special "force" shutdown of a tenant: it relies on the tenant's tasks
/// to respect the cancellation tokens used in normal shutdown().
#[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %drop_cache))]
pub(crate) async fn reset_tenant(
&self,
tenant_shard_id: TenantShardId,
drop_cache: bool,
ctx: RequestContext,
) -> anyhow::Result<()> {
let mut slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?;
let Some(old_slot) = slot_guard.get_old_value() else {
anyhow::bail!("Tenant not found when trying to reset");
};
let Some(tenant) = old_slot.get_attached() else {
slot_guard.revert();
anyhow::bail!("Tenant is not in attached state");
};
let (_guard, progress) = utils::completion::channel();
match tenant.shutdown(progress, false).await {
Ok(()) => {
slot_guard.drop_old_value()?;
}
Err(_barrier) => {
slot_guard.revert();
anyhow::bail!("Cannot reset Tenant, already shutting down");
}
}
let tenant_path = self.conf.tenant_path(&tenant_shard_id);
let timelines_path = self.conf.timelines_path(&tenant_shard_id);
let config = Tenant::load_tenant_config(self.conf, &tenant_shard_id)?;
if drop_cache {
tracing::info!("Dropping local file cache");
match tokio::fs::read_dir(&timelines_path).await {
Err(e) => {
tracing::warn!("Failed to list timelines while dropping cache: {}", e);
}
Ok(mut entries) => {
while let Some(entry) = entries.next_entry().await? {
tokio::fs::remove_dir_all(entry.path()).await?;
}
}
}
}
let shard_identity = config.shard;
let tenant = tenant_spawn(
self.conf,
tenant_shard_id,
&tenant_path,
self.resources.clone(),
AttachedTenantConf::try_from(config)?,
shard_identity,
None,
self.tenants,
SpawnMode::Normal,
&ctx,
)?;
slot_guard.upsert(TenantSlot::Attached(tenant))?;
Ok(())
}
}
#[derive(Debug, thiserror::Error)]
@@ -1203,7 +1060,6 @@ pub(crate) enum GetActiveTenantError {
/// then wait for up to `timeout` (minus however long we waited for the slot).
pub(crate) async fn get_active_tenant_with_timeout(
tenant_id: TenantId,
shard_selector: ShardSelector,
timeout: Duration,
cancel: &CancellationToken,
) -> Result<Arc<Tenant>, GetActiveTenantError> {
@@ -1212,17 +1068,15 @@ pub(crate) async fn get_active_tenant_with_timeout(
Tenant(Arc<Tenant>),
}
// TODO(sharding): make page service interface sharding-aware (page service should apply ShardIdentity to the key
// to decide which shard services the request)
let tenant_shard_id = TenantShardId::unsharded(tenant_id);
let wait_start = Instant::now();
let deadline = wait_start + timeout;
let (wait_for, tenant_shard_id) = {
let wait_for = {
let locked = TENANTS.read().unwrap();
// Resolve TenantId to TenantShardId
let tenant_shard_id = locked.resolve_shard(&tenant_id, shard_selector).ok_or(
GetActiveTenantError::NotFound(GetTenantError::NotFound(tenant_id)),
)?;
let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read)
.map_err(GetTenantError::MapState)?;
match peek_slot {
@@ -1232,7 +1086,7 @@ pub(crate) async fn get_active_tenant_with_timeout(
// Fast path: we don't need to do any async waiting.
return Ok(tenant.clone());
}
_ => (WaitFor::Tenant(tenant.clone()), tenant_shard_id),
_ => WaitFor::Tenant(tenant.clone()),
}
}
Some(TenantSlot::Secondary) => {
@@ -1240,9 +1094,7 @@ pub(crate) async fn get_active_tenant_with_timeout(
tenant_id,
)))
}
Some(TenantSlot::InProgress(barrier)) => {
(WaitFor::Barrier(barrier.clone()), tenant_shard_id)
}
Some(TenantSlot::InProgress(barrier)) => WaitFor::Barrier(barrier.clone()),
None => {
return Err(GetActiveTenantError::NotFound(GetTenantError::NotFound(
tenant_id,
@@ -1327,7 +1179,8 @@ pub(crate) async fn delete_tenant(
// See https://github.com/neondatabase/neon/issues/5080
// TODO(sharding): make delete API sharding-aware
let slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustExist)?;
let mut slot_guard =
tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustExist)?;
// unwrap is safe because we used MustExist mode when acquiring
let tenant = match slot_guard.get_old_value().as_ref().unwrap() {
@@ -1407,7 +1260,8 @@ async fn detach_tenant0(
deletion_queue_client: &DeletionQueueClient,
) -> Result<Utf8PathBuf, TenantStateError> {
let tenant_dir_rename_operation = |tenant_id_to_clean: TenantShardId| async move {
let local_tenant_directory = conf.tenant_path(&tenant_id_to_clean);
// TODO(sharding): make local path helpers shard-aware
let local_tenant_directory = conf.tenant_path(&tenant_id_to_clean.tenant_id);
safe_rename_tenant_dir(&local_tenant_directory)
.await
.with_context(|| format!("local tenant directory {local_tenant_directory:?} rename"))
@@ -1432,7 +1286,8 @@ async fn detach_tenant0(
Err(TenantStateError::SlotError(TenantSlotError::NotFound(_)))
)
{
let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_shard_id);
// TODO(sharding): make local paths sharding-aware
let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_shard_id.tenant_id);
if tenant_ignore_mark.exists() {
info!("Detaching an ignored tenant");
let tmp_path = tenant_dir_rename_operation(tenant_shard_id)
@@ -1461,9 +1316,9 @@ pub(crate) async fn load_tenant(
let slot_guard =
tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustNotExist)?;
let tenant_path = conf.tenant_path(&tenant_shard_id);
let tenant_path = conf.tenant_path(&tenant_id);
let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_shard_id);
let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_id);
if tenant_ignore_mark.exists() {
std::fs::remove_file(&tenant_ignore_mark).with_context(|| {
format!(
@@ -1479,19 +1334,17 @@ pub(crate) async fn load_tenant(
};
let mut location_conf =
Tenant::load_tenant_config(conf, &tenant_shard_id).map_err(TenantMapInsertError::Other)?;
Tenant::load_tenant_config(conf, &tenant_id).map_err(TenantMapInsertError::Other)?;
location_conf.attach_in_generation(generation);
Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await?;
Tenant::persist_tenant_config(conf, &tenant_id, &location_conf).await?;
let shard_identity = location_conf.shard;
let new_tenant = tenant_spawn(
conf,
tenant_shard_id,
tenant_id,
&tenant_path,
resources,
AttachedTenantConf::try_from(location_conf)?,
shard_identity,
None,
&TENANTS,
SpawnMode::Normal,
@@ -1519,7 +1372,7 @@ async fn ignore_tenant0(
let tenant_shard_id = TenantShardId::unsharded(tenant_id);
remove_tenant_from_memory(tenants, tenant_shard_id, async {
let ignore_mark_file = conf.tenant_ignore_mark_file_path(&tenant_shard_id);
let ignore_mark_file = conf.tenant_ignore_mark_file_path(&tenant_id);
fs::File::create(&ignore_mark_file)
.await
.context("Failed to create ignore mark file")
@@ -1577,18 +1430,16 @@ pub(crate) async fn attach_tenant(
let slot_guard =
tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustNotExist)?;
let location_conf = LocationConf::attached_single(tenant_conf, generation);
let tenant_dir = create_tenant_files(conf, &location_conf, &tenant_shard_id).await?;
let tenant_dir = create_tenant_files(conf, &location_conf, &tenant_id).await?;
// TODO: tenant directory remains on disk if we bail out from here on.
// See https://github.com/neondatabase/neon/issues/4233
let shard_identity = location_conf.shard;
let attached_tenant = tenant_spawn(
conf,
tenant_shard_id,
tenant_id,
&tenant_dir,
resources,
AttachedTenantConf::try_from(location_conf)?,
shard_identity,
None,
&TENANTS,
SpawnMode::Normal,
@@ -1654,10 +1505,9 @@ pub enum TenantSlotUpsertError {
MapState(#[from] TenantMapError),
}
#[derive(Debug, thiserror::Error)]
#[derive(Debug)]
enum TenantSlotDropError {
/// It is only legal to drop a TenantSlot if its contents are fully shut down
#[error("Tenant was not shut down")]
NotShutdown,
}
@@ -1717,9 +1567,9 @@ impl SlotGuard {
}
}
/// Get any value that was present in the slot before we acquired ownership
/// Take any value that was present in the slot before we acquired ownership
/// of it: in state transitions, this will be the old state.
fn get_old_value(&self) -> &Option<TenantSlot> {
fn get_old_value(&mut self) -> &Option<TenantSlot> {
&self.old_value
}
@@ -1937,7 +1787,7 @@ fn tenant_map_acquire_slot_impl(
METRICS.tenant_slot_writes.inc();
let mut locked = tenants.write().unwrap();
let span = tracing::info_span!("acquire_slot", tenant_id=%tenant_shard_id.tenant_id, shard = %tenant_shard_id.shard_slug());
let span = tracing::info_span!("acquire_slot", tenant_id=%tenant_shard_id.tenant_id, shard=tenant_shard_id.shard_slug());
let _guard = span.enter();
let m = match &mut *locked {
@@ -2102,9 +1952,6 @@ pub(crate) async fn immediate_gc(
.with_context(|| format!("tenant {tenant_id}"))
.map_err(|e| ApiError::NotFound(e.into()))?;
// TODO(sharding): make callers of this function shard-aware
let tenant_shard_id = TenantShardId::unsharded(tenant_id);
let gc_horizon = gc_req.gc_horizon.unwrap_or_else(|| tenant.get_gc_horizon());
// Use tenant's pitr setting
let pitr = tenant.get_pitr_interval();
@@ -2126,7 +1973,7 @@ pub(crate) async fn immediate_gc(
#[allow(unused_mut)]
let mut result = tenant
.gc_iteration(Some(timeline_id), gc_horizon, pitr, &cancel, &ctx)
.instrument(info_span!("manual_gc", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id))
.instrument(info_span!("manual_gc", %tenant_id, %timeline_id))
.await;
// FIXME: `gc_iteration` can return an error for multiple reasons; we should handle it
// better once the types support it.

View File

@@ -188,8 +188,7 @@ use anyhow::Context;
use camino::Utf8Path;
use chrono::{NaiveDateTime, Utc};
pub(crate) use download::download_initdb_tar_zst;
use pageserver_api::shard::{ShardIndex, TenantShardId};
use pageserver_api::shard::ShardIndex;
use scopeguard::ScopeGuard;
use tokio_util::sync::CancellationToken;
pub(crate) use upload::upload_initdb_dir;
@@ -302,7 +301,7 @@ pub struct RemoteTimelineClient {
runtime: tokio::runtime::Handle,
tenant_shard_id: TenantShardId,
tenant_id: TenantId,
timeline_id: TimelineId,
generation: Generation,
@@ -326,7 +325,7 @@ impl RemoteTimelineClient {
remote_storage: GenericRemoteStorage,
deletion_queue_client: DeletionQueueClient,
conf: &'static PageServerConf,
tenant_shard_id: TenantShardId,
tenant_id: TenantId,
timeline_id: TimelineId,
generation: Generation,
) -> RemoteTimelineClient {
@@ -338,16 +337,13 @@ impl RemoteTimelineClient {
} else {
BACKGROUND_RUNTIME.handle().clone()
},
tenant_shard_id,
tenant_id,
timeline_id,
generation,
storage_impl: remote_storage,
deletion_queue_client,
upload_queue: Mutex::new(UploadQueue::Uninitialized),
metrics: Arc::new(RemoteTimelineClientMetrics::new(
&tenant_shard_id,
&timeline_id,
)),
metrics: Arc::new(RemoteTimelineClientMetrics::new(&tenant_id, &timeline_id)),
}
}
@@ -407,6 +403,11 @@ impl RemoteTimelineClient {
Ok(())
}
pub(crate) fn get_shard_index(&self) -> ShardIndex {
// TODO: carry this on the struct
ShardIndex::unsharded()
}
pub fn remote_consistent_lsn_projected(&self) -> Option<Lsn> {
match &mut *self.upload_queue.lock().unwrap() {
UploadQueue::Uninitialized => None,
@@ -468,13 +469,14 @@ impl RemoteTimelineClient {
let index_part = download::download_index_part(
&self.storage_impl,
&self.tenant_shard_id,
&self.tenant_id,
&self.timeline_id,
self.get_shard_index(),
self.generation,
cancel,
)
.measure_remote_op(
self.tenant_shard_id.tenant_id,
self.tenant_id,
self.timeline_id,
RemoteOpFileKind::Index,
RemoteOpKind::Download,
@@ -510,13 +512,13 @@ impl RemoteTimelineClient {
download::download_layer_file(
self.conf,
&self.storage_impl,
self.tenant_shard_id,
self.tenant_id,
self.timeline_id,
layer_file_name,
layer_metadata,
)
.measure_remote_op(
self.tenant_shard_id.tenant_id,
self.tenant_id,
self.timeline_id,
RemoteOpFileKind::Layer,
RemoteOpKind::Download,
@@ -964,8 +966,9 @@ impl RemoteTimelineClient {
|| {
upload::upload_index_part(
&self.storage_impl,
&self.tenant_shard_id,
&self.tenant_id,
&self.timeline_id,
self.get_shard_index(),
self.generation,
&index_part_with_deleted_at,
)
@@ -1022,7 +1025,7 @@ impl RemoteTimelineClient {
.drain()
.map(|(file_name, meta)| {
remote_layer_path(
&self.tenant_shard_id.tenant_id,
&self.tenant_id,
&self.timeline_id,
meta.shard,
&file_name,
@@ -1037,7 +1040,7 @@ impl RemoteTimelineClient {
// Do not delete index part yet, it is needed for possible retry. If we remove it first
// and retry will arrive to different pageserver there wont be any traces of it on remote storage
let timeline_storage_path = remote_timeline_path(&self.tenant_shard_id, &self.timeline_id);
let timeline_storage_path = remote_timeline_path(&self.tenant_id, &self.timeline_id);
// Execute all pending deletions, so that when we proceed to do a list_prefixes below, we aren't
// taking the burden of listing all the layers that we already know we should delete.
@@ -1073,22 +1076,17 @@ impl RemoteTimelineClient {
.unwrap_or(
// No generation-suffixed indices, assume we are dealing with
// a legacy index.
remote_index_path(&self.tenant_shard_id, &self.timeline_id, Generation::none()),
remote_index_path(
&self.tenant_id,
&self.timeline_id,
self.get_shard_index(),
Generation::none(),
),
);
let remaining_layers: Vec<RemotePath> = remaining
.into_iter()
.filter(|p| {
if p == &latest_index {
return false;
}
if let Some(name) = p.object_name() {
if name == INITDB_PATH {
return false;
}
}
true
})
.filter(|p| p!= &latest_index)
.inspect(|path| {
if let Some(name) = path.object_name() {
info!(%name, "deleting a file not referenced from index_part.json");
@@ -1215,12 +1213,12 @@ impl RemoteTimelineClient {
// Spawn task to perform the task
let self_rc = Arc::clone(self);
let tenant_shard_id = self.tenant_shard_id;
let tenant_id = self.tenant_id;
let timeline_id = self.timeline_id;
task_mgr::spawn(
&self.runtime,
TaskKind::RemoteUploadTask,
Some(self.tenant_shard_id.tenant_id),
Some(self.tenant_id),
Some(self.timeline_id),
"remote upload",
false,
@@ -1228,7 +1226,7 @@ impl RemoteTimelineClient {
self_rc.perform_upload_task(task).await;
Ok(())
}
.instrument(info_span!(parent: None, "remote_upload", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id, %upload_task_id)),
.instrument(info_span!(parent: None, "remote_upload", %tenant_id, %timeline_id, %upload_task_id)),
);
// Loop back to process next task
@@ -1271,17 +1269,16 @@ impl RemoteTimelineClient {
let upload_result: anyhow::Result<()> = match &task.op {
UploadOp::UploadLayer(ref layer, ref layer_metadata) => {
let path = layer.local_path_from_id(&self.tenant_shard_id, &self.timeline_id);
let path = layer.local_path();
upload::upload_timeline_layer(
self.conf,
&self.storage_impl,
&path,
path,
layer_metadata,
self.generation,
)
.measure_remote_op(
self.tenant_shard_id.tenant_id,
self.tenant_id,
self.timeline_id,
RemoteOpFileKind::Layer,
RemoteOpKind::Upload,
@@ -1301,13 +1298,14 @@ impl RemoteTimelineClient {
let res = upload::upload_index_part(
&self.storage_impl,
&self.tenant_shard_id,
&self.tenant_id,
&self.timeline_id,
self.get_shard_index(),
self.generation,
index_part,
)
.measure_remote_op(
self.tenant_shard_id.tenant_id,
self.tenant_id,
self.timeline_id,
RemoteOpFileKind::Index,
RemoteOpKind::Upload,
@@ -1327,7 +1325,7 @@ impl RemoteTimelineClient {
pausable_failpoint!("before-delete-layer-pausable");
self.deletion_queue_client
.push_layers(
self.tenant_shard_id,
self.tenant_id,
self.timeline_id,
self.generation,
delete.layers.clone(),
@@ -1446,7 +1444,7 @@ impl RemoteTimelineClient {
// data safety guarantees (see docs/rfcs/025-generation-numbers.md)
self.deletion_queue_client
.update_remote_consistent_lsn(
self.tenant_shard_id,
self.tenant_id,
self.timeline_id,
self.generation,
lsn,
@@ -1604,21 +1602,15 @@ impl RemoteTimelineClient {
}
}
pub fn remote_timelines_path(tenant_shard_id: &TenantShardId) -> RemotePath {
let path = format!("tenants/{tenant_shard_id}/{TIMELINES_SEGMENT_NAME}");
pub fn remote_timelines_path(tenant_id: &TenantId) -> RemotePath {
let path = format!("tenants/{tenant_id}/{TIMELINES_SEGMENT_NAME}");
RemotePath::from_string(&path).expect("Failed to construct path")
}
pub fn remote_timeline_path(
tenant_shard_id: &TenantShardId,
timeline_id: &TimelineId,
) -> RemotePath {
remote_timelines_path(tenant_shard_id).join(Utf8Path::new(&timeline_id.to_string()))
pub fn remote_timeline_path(tenant_id: &TenantId, timeline_id: &TimelineId) -> RemotePath {
remote_timelines_path(tenant_id).join(Utf8Path::new(&timeline_id.to_string()))
}
/// Note that the shard component of a remote layer path is _not_ always the same
/// as in the TenantShardId of the caller: tenants may reference layers from a different
/// ShardIndex. Use the ShardIndex from the layer's metadata.
pub fn remote_layer_path(
tenant_id: &TenantId,
timeline_id: &TimelineId,
@@ -1645,12 +1637,14 @@ pub fn remote_initdb_archive_path(tenant_id: &TenantId, timeline_id: &TimelineId
}
pub fn remote_index_path(
tenant_shard_id: &TenantShardId,
tenant_id: &TenantId,
timeline_id: &TimelineId,
shard: ShardIndex,
generation: Generation,
) -> RemotePath {
RemotePath::from_string(&format!(
"tenants/{tenant_shard_id}/{TIMELINES_SEGMENT_NAME}/{timeline_id}/{0}{1}",
"tenants/{tenant_id}{0}/{TIMELINES_SEGMENT_NAME}/{timeline_id}/{1}{2}",
shard.get_suffix(),
IndexPart::FILE_NAME,
generation.get_suffix()
))
@@ -1792,14 +1786,14 @@ mod tests {
Arc::new(RemoteTimelineClient {
conf: self.harness.conf,
runtime: tokio::runtime::Handle::current(),
tenant_shard_id: self.harness.tenant_shard_id,
tenant_id: self.harness.tenant_id,
timeline_id: TIMELINE_ID,
generation,
storage_impl: self.harness.remote_storage.clone(),
deletion_queue_client: self.harness.deletion_queue.new_client(),
upload_queue: Mutex::new(UploadQueue::Uninitialized),
metrics: Arc::new(RemoteTimelineClientMetrics::new(
&self.harness.tenant_shard_id,
&self.harness.tenant_id,
&TIMELINE_ID,
)),
})
@@ -2106,7 +2100,11 @@ mod tests {
assert_eq!(actual_c, expected_c);
}
async fn inject_index_part(test_state: &TestSetup, generation: Generation) -> IndexPart {
async fn inject_index_part(
test_state: &TestSetup,
generation: Generation,
shard: ShardIndex,
) -> IndexPart {
// An empty IndexPart, just sufficient to ensure deserialization will succeed
let example_metadata = TimelineMetadata::example();
let example_index_part = IndexPart::new(
@@ -2128,8 +2126,9 @@ mod tests {
let index_path = test_state.harness.remote_fs_dir.join(
remote_index_path(
&test_state.harness.tenant_shard_id,
&test_state.harness.tenant_id,
&TIMELINE_ID,
shard,
generation,
)
.get_path(),
@@ -2169,7 +2168,12 @@ mod tests {
// Simple case: we are in generation N, load the index from generation N - 1
let generation_n = 5;
let injected = inject_index_part(&test_state, Generation::new(generation_n - 1)).await;
let injected = inject_index_part(
&test_state,
Generation::new(generation_n - 1),
ShardIndex::unsharded(),
)
.await;
assert_got_index_part(&test_state, Generation::new(generation_n), &injected).await;
@@ -2187,22 +2191,34 @@ mod tests {
// A generation-less IndexPart exists in the bucket, we should find it
let generation_n = 5;
let injected_none = inject_index_part(&test_state, Generation::none()).await;
let injected_none =
inject_index_part(&test_state, Generation::none(), ShardIndex::unsharded()).await;
assert_got_index_part(&test_state, Generation::new(generation_n), &injected_none).await;
// If a more recent-than-none generation exists, we should prefer to load that
let injected_1 = inject_index_part(&test_state, Generation::new(1)).await;
let injected_1 =
inject_index_part(&test_state, Generation::new(1), ShardIndex::unsharded()).await;
assert_got_index_part(&test_state, Generation::new(generation_n), &injected_1).await;
// If a more-recent-than-me generation exists, we should ignore it.
let _injected_10 = inject_index_part(&test_state, Generation::new(10)).await;
let _injected_10 =
inject_index_part(&test_state, Generation::new(10), ShardIndex::unsharded()).await;
assert_got_index_part(&test_state, Generation::new(generation_n), &injected_1).await;
// If a directly previous generation exists, _and_ an index exists in my own
// generation, I should prefer my own generation.
let _injected_prev =
inject_index_part(&test_state, Generation::new(generation_n - 1)).await;
let injected_current = inject_index_part(&test_state, Generation::new(generation_n)).await;
let _injected_prev = inject_index_part(
&test_state,
Generation::new(generation_n - 1),
ShardIndex::unsharded(),
)
.await;
let injected_current = inject_index_part(
&test_state,
Generation::new(generation_n),
ShardIndex::unsharded(),
)
.await;
assert_got_index_part(
&test_state,
Generation::new(generation_n),

View File

@@ -8,12 +8,11 @@ use std::future::Future;
use std::time::Duration;
use anyhow::{anyhow, Context};
use camino::{Utf8Path, Utf8PathBuf};
use pageserver_api::shard::TenantShardId;
use tokio::fs::{self, File, OpenOptions};
use tokio::io::{AsyncSeekExt, AsyncWriteExt};
use camino::Utf8Path;
use pageserver_api::shard::ShardIndex;
use tokio::fs;
use tokio::io::AsyncWriteExt;
use tokio_util::sync::CancellationToken;
use tracing::warn;
use utils::{backoff, crashsafe};
use crate::config::PageServerConf;
@@ -21,15 +20,14 @@ use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_
use crate::tenant::storage_layer::LayerFileName;
use crate::tenant::timeline::span::debug_assert_current_span_has_tenant_and_timeline_id;
use crate::tenant::Generation;
use crate::TEMP_FILE_SUFFIX;
use remote_storage::{DownloadError, GenericRemoteStorage, ListingMode};
use utils::crashsafe::path_with_suffix_extension;
use utils::id::TimelineId;
use utils::id::{TenantId, TimelineId};
use super::index::{IndexPart, LayerFileMetadata};
use super::{
parse_remote_index_path, remote_index_path, remote_initdb_archive_path,
FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, INITDB_PATH,
parse_remote_index_path, remote_index_path, FAILED_DOWNLOAD_WARN_THRESHOLD,
FAILED_REMOTE_OP_RETRIES,
};
static MAX_DOWNLOAD_DURATION: Duration = Duration::from_secs(120);
@@ -42,7 +40,7 @@ static MAX_DOWNLOAD_DURATION: Duration = Duration::from_secs(120);
pub async fn download_layer_file<'a>(
conf: &'static PageServerConf,
storage: &'a GenericRemoteStorage,
tenant_shard_id: TenantShardId,
tenant_id: TenantId,
timeline_id: TimelineId,
layer_file_name: &'a LayerFileName,
layer_metadata: &'a LayerFileMetadata,
@@ -50,11 +48,11 @@ pub async fn download_layer_file<'a>(
debug_assert_current_span_has_tenant_and_timeline_id();
let local_path = conf
.timeline_path(&tenant_shard_id, &timeline_id)
.timeline_path(&tenant_id, &timeline_id)
.join(layer_file_name.file_name());
let remote_path = remote_layer_path(
&tenant_shard_id.tenant_id,
&tenant_id,
&timeline_id,
layer_metadata.shard,
layer_file_name,
@@ -173,10 +171,10 @@ pub fn is_temp_download_file(path: &Utf8Path) -> bool {
/// List timelines of given tenant in remote storage
pub async fn list_remote_timelines(
storage: &GenericRemoteStorage,
tenant_shard_id: TenantShardId,
tenant_id: TenantId,
cancel: CancellationToken,
) -> anyhow::Result<(HashSet<TimelineId>, HashSet<String>)> {
let remote_path = remote_timelines_path(&tenant_shard_id);
let remote_path = remote_timelines_path(&tenant_id);
fail::fail_point!("storage-sync-list-remote-timelines", |_| {
anyhow::bail!("storage-sync-list-remote-timelines");
@@ -184,7 +182,7 @@ pub async fn list_remote_timelines(
let listing = download_retry_forever(
|| storage.list(Some(&remote_path), ListingMode::WithDelimiter),
&format!("list timelines for {tenant_shard_id}"),
&format!("list timelines for {tenant_id}"),
cancel,
)
.await?;
@@ -194,7 +192,7 @@ pub async fn list_remote_timelines(
for timeline_remote_storage_key in listing.prefixes {
let object_name = timeline_remote_storage_key.object_name().ok_or_else(|| {
anyhow::anyhow!("failed to get timeline id for remote tenant {tenant_shard_id}")
anyhow::anyhow!("failed to get timeline id for remote tenant {tenant_id}")
})?;
match object_name.parse::<TimelineId>() {
@@ -215,12 +213,13 @@ pub async fn list_remote_timelines(
async fn do_download_index_part(
storage: &GenericRemoteStorage,
tenant_shard_id: &TenantShardId,
tenant_id: &TenantId,
timeline_id: &TimelineId,
shard: ShardIndex,
index_generation: Generation,
cancel: CancellationToken,
) -> Result<IndexPart, DownloadError> {
let remote_path = remote_index_path(tenant_shard_id, timeline_id, index_generation);
let remote_path = remote_index_path(tenant_id, timeline_id, shard, index_generation);
let index_part_bytes = download_retry_forever(
|| async {
@@ -256,8 +255,9 @@ async fn do_download_index_part(
#[tracing::instrument(skip_all, fields(generation=?my_generation))]
pub(super) async fn download_index_part(
storage: &GenericRemoteStorage,
tenant_shard_id: &TenantShardId,
tenant_id: &TenantId,
timeline_id: &TimelineId,
shard: ShardIndex,
my_generation: Generation,
cancel: CancellationToken,
) -> Result<IndexPart, DownloadError> {
@@ -267,8 +267,9 @@ pub(super) async fn download_index_part(
// Operating without generations: just fetch the generation-less path
return do_download_index_part(
storage,
tenant_shard_id,
tenant_id,
timeline_id,
shard,
my_generation,
cancel,
)
@@ -281,8 +282,9 @@ pub(super) async fn download_index_part(
// This is an optimization to avoid doing the listing for the general case below.
let res = do_download_index_part(
storage,
tenant_shard_id,
tenant_id,
timeline_id,
shard,
my_generation,
cancel.clone(),
)
@@ -308,8 +310,9 @@ pub(super) async fn download_index_part(
// This is an optimization to avoid doing the listing for the general case below.
let res = do_download_index_part(
storage,
tenant_shard_id,
tenant_id,
timeline_id,
shard,
my_generation.previous(),
cancel.clone(),
)
@@ -332,7 +335,7 @@ pub(super) async fn download_index_part(
// General case/fallback: if there is no index at my_generation or prev_generation, then list all index_part.json
// objects, and select the highest one with a generation <= my_generation. Constructing the prefix is equivalent
// to constructing a full index path with no generation, because the generation is a suffix.
let index_prefix = remote_index_path(tenant_shard_id, timeline_id, Generation::none());
let index_prefix = remote_index_path(tenant_id, timeline_id, shard, Generation::none());
let indices = backoff::retry(
|| async { storage.list_files(Some(&index_prefix)).await },
|_| false,
@@ -358,16 +361,17 @@ pub(super) async fn download_index_part(
match max_previous_generation {
Some(g) => {
tracing::debug!("Found index_part in generation {g:?}");
do_download_index_part(storage, tenant_shard_id, timeline_id, g, cancel).await
do_download_index_part(storage, tenant_id, timeline_id, shard, g, cancel).await
}
None => {
// Migration from legacy pre-generation state: we have a generation but no prior
// attached pageservers did. Try to load from a no-generation path.
tracing::debug!("No index_part.json* found");
tracing::info!("No index_part.json* found");
do_download_index_part(
storage,
tenant_shard_id,
tenant_id,
timeline_id,
shard,
Generation::none(),
cancel,
)
@@ -376,69 +380,6 @@ pub(super) async fn download_index_part(
}
}
pub(crate) async fn download_initdb_tar_zst(
conf: &'static PageServerConf,
storage: &GenericRemoteStorage,
tenant_shard_id: &TenantShardId,
timeline_id: &TimelineId,
) -> Result<(Utf8PathBuf, File), DownloadError> {
debug_assert_current_span_has_tenant_and_timeline_id();
let remote_path = remote_initdb_archive_path(&tenant_shard_id.tenant_id, timeline_id);
let timeline_path = conf.timelines_path(tenant_shard_id);
if !timeline_path.exists() {
tokio::fs::create_dir_all(&timeline_path)
.await
.with_context(|| format!("timeline dir creation {timeline_path}"))
.map_err(DownloadError::Other)?;
}
let temp_path = timeline_path.join(format!("{INITDB_PATH}-{timeline_id}.{TEMP_FILE_SUFFIX}"));
let file = download_retry(
|| async {
let mut file = OpenOptions::new()
.create(true)
.truncate(true)
.read(true)
.write(true)
.open(&temp_path)
.await
.with_context(|| format!("tempfile creation {temp_path}"))
.map_err(DownloadError::Other)?;
let mut download = storage.download(&remote_path).await?;
tokio::io::copy(&mut download.download_stream, &mut file)
.await
.with_context(|| format!("download initdb.tar.zst at {remote_path:?}"))
.map_err(DownloadError::Other)?;
file.seek(std::io::SeekFrom::Start(0))
.await
.with_context(|| format!("rewinding initdb.tar.zst at: {remote_path:?}"))
.map_err(DownloadError::Other)?;
Ok(file)
},
&format!("download {remote_path}"),
)
.await
.map_err(|e| {
if temp_path.exists() {
// Do a best-effort attempt at deleting the temporary file upon encountering an error.
// We don't have async here nor do we want to pile on any extra errors.
if let Err(e) = std::fs::remove_file(&temp_path) {
warn!("error deleting temporary file {temp_path}: {e}");
}
}
e
})?;
Ok((temp_path, file))
}
/// Helper function to handle retries for a download operation.
///
/// Remote operations can fail due to rate limits (IAM, S3), spurious network

View File

@@ -4,7 +4,7 @@ use anyhow::{bail, Context};
use bytes::Bytes;
use camino::Utf8Path;
use fail::fail_point;
use pageserver_api::shard::TenantShardId;
use pageserver_api::shard::ShardIndex;
use std::io::ErrorKind;
use tokio::fs;
@@ -25,8 +25,9 @@ use tracing::info;
/// Serializes and uploads the given index part data to the remote storage.
pub(super) async fn upload_index_part<'a>(
storage: &'a GenericRemoteStorage,
tenant_shard_id: &TenantShardId,
tenant_id: &TenantId,
timeline_id: &TimelineId,
shard: ShardIndex,
generation: Generation,
index_part: &'a IndexPart,
) -> anyhow::Result<()> {
@@ -43,11 +44,11 @@ pub(super) async fn upload_index_part<'a>(
let index_part_size = index_part_bytes.len();
let index_part_bytes = tokio::io::BufReader::new(std::io::Cursor::new(index_part_bytes));
let remote_path = remote_index_path(tenant_shard_id, timeline_id, generation);
let remote_path = remote_index_path(tenant_id, timeline_id, shard, generation);
storage
.upload_storage_object(Box::new(index_part_bytes), index_part_size, &remote_path)
.await
.with_context(|| format!("upload index part for '{tenant_shard_id} / {timeline_id}'"))
.with_context(|| format!("upload index part for '{tenant_id} / {timeline_id}'"))
}
/// Attempts to upload given layer files.

View File

@@ -2,9 +2,9 @@
pub mod delta_layer;
mod filename;
pub mod image_layer;
mod image_layer;
mod inmemory_layer;
pub(crate) mod layer;
mod layer;
mod layer_desc;
use crate::context::{AccessStatsBehavior, RequestContext};
@@ -24,7 +24,10 @@ use tracing::warn;
use utils::history_buffer::HistoryBufferWithDropCounter;
use utils::rate_limit::RateLimit;
use utils::lsn::Lsn;
use utils::{
id::{TenantId, TimelineId},
lsn::Lsn,
};
pub use delta_layer::{DeltaLayer, DeltaLayerWriter, ValueRef};
pub use filename::{DeltaFileName, ImageFileName, LayerFileName};
@@ -305,13 +308,25 @@ pub mod tests {
impl From<DeltaFileName> for PersistentLayerDesc {
fn from(value: DeltaFileName) -> Self {
PersistentLayerDesc::new_delta(value.key_range, value.lsn_range, 233)
PersistentLayerDesc::new_delta(
TenantId::from_array([0; 16]),
TimelineId::from_array([0; 16]),
value.key_range,
value.lsn_range,
233,
)
}
}
impl From<ImageFileName> for PersistentLayerDesc {
fn from(value: ImageFileName) -> Self {
PersistentLayerDesc::new_img(value.key_range, value.lsn, 233)
PersistentLayerDesc::new_img(
TenantId::from_array([0; 16]),
TimelineId::from_array([0; 16]),
value.key_range,
value.lsn,
233,
)
}
}

View File

@@ -42,7 +42,6 @@ use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
use anyhow::{bail, ensure, Context, Result};
use camino::{Utf8Path, Utf8PathBuf};
use pageserver_api::models::LayerAccessKind;
use pageserver_api::shard::TenantShardId;
use rand::{distributions::Alphanumeric, Rng};
use serde::{Deserialize, Serialize};
use std::fs::File;
@@ -70,13 +69,13 @@ use super::{AsLayerDesc, LayerAccessStats, PersistentLayerDesc, ResidentLayer};
#[derive(Debug, Serialize, Deserialize, PartialEq, Eq)]
pub struct Summary {
/// Magic value to identify this as a neon delta file. Always DELTA_FILE_MAGIC.
pub magic: u16,
pub format_version: u16,
magic: u16,
format_version: u16,
pub tenant_id: TenantId,
pub timeline_id: TimelineId,
pub key_range: Range<Key>,
pub lsn_range: Range<Lsn>,
tenant_id: TenantId,
timeline_id: TimelineId,
key_range: Range<Key>,
lsn_range: Range<Lsn>,
/// Block number where the 'index' part of the file begins.
pub index_start_blk: u32,
@@ -84,6 +83,17 @@ pub struct Summary {
pub index_root_blk: u32,
}
impl From<&DeltaLayer> for Summary {
fn from(layer: &DeltaLayer) -> Self {
Self::expected(
layer.desc.tenant_id,
layer.desc.timeline_id,
layer.desc.key_range.clone(),
layer.desc.lsn_range.clone(),
)
}
}
impl Summary {
pub(super) fn expected(
tenant_id: TenantId,
@@ -238,7 +248,7 @@ impl DeltaLayer {
fn temp_path_for(
conf: &PageServerConf,
tenant_shard_id: &TenantShardId,
tenant_id: &TenantId,
timeline_id: &TimelineId,
key_start: Key,
lsn_range: &Range<Lsn>,
@@ -249,15 +259,14 @@ impl DeltaLayer {
.map(char::from)
.collect();
conf.timeline_path(tenant_shard_id, timeline_id)
.join(format!(
"{}-XXX__{:016X}-{:016X}.{}.{}",
key_start,
u64::from(lsn_range.start),
u64::from(lsn_range.end),
rand_string,
TEMP_FILE_SUFFIX,
))
conf.timeline_path(tenant_id, timeline_id).join(format!(
"{}-XXX__{:016X}-{:016X}.{}.{}",
key_start,
u64::from(lsn_range.start),
u64::from(lsn_range.end),
rand_string,
TEMP_FILE_SUFFIX,
))
}
///
@@ -312,6 +321,8 @@ impl DeltaLayer {
Ok(DeltaLayer {
path: path.to_path_buf(),
desc: PersistentLayerDesc::new_delta(
summary.tenant_id,
summary.timeline_id,
summary.key_range,
summary.lsn_range,
metadata.len(),
@@ -342,7 +353,7 @@ struct DeltaLayerWriterInner {
conf: &'static PageServerConf,
pub path: Utf8PathBuf,
timeline_id: TimelineId,
tenant_shard_id: TenantShardId,
tenant_id: TenantId,
key_start: Key,
lsn_range: Range<Lsn>,
@@ -359,7 +370,7 @@ impl DeltaLayerWriterInner {
async fn new(
conf: &'static PageServerConf,
timeline_id: TimelineId,
tenant_shard_id: TenantShardId,
tenant_id: TenantId,
key_start: Key,
lsn_range: Range<Lsn>,
) -> anyhow::Result<Self> {
@@ -369,8 +380,7 @@ impl DeltaLayerWriterInner {
//
// Note: This overwrites any existing file. There shouldn't be any.
// FIXME: throw an error instead?
let path =
DeltaLayer::temp_path_for(conf, &tenant_shard_id, &timeline_id, key_start, &lsn_range);
let path = DeltaLayer::temp_path_for(conf, &tenant_id, &timeline_id, key_start, &lsn_range);
let mut file = VirtualFile::create(&path).await?;
// make room for the header block
@@ -385,7 +395,7 @@ impl DeltaLayerWriterInner {
conf,
path,
timeline_id,
tenant_shard_id,
tenant_id,
key_start,
lsn_range,
tree: tree_builder,
@@ -447,7 +457,7 @@ impl DeltaLayerWriterInner {
let summary = Summary {
magic: DELTA_FILE_MAGIC,
format_version: STORAGE_FORMAT_VERSION,
tenant_id: self.tenant_shard_id.tenant_id,
tenant_id: self.tenant_id,
timeline_id: self.timeline_id,
key_range: self.key_start..key_end,
lsn_range: self.lsn_range.clone(),
@@ -488,6 +498,8 @@ impl DeltaLayerWriterInner {
// set inner.file here. The first read will have to re-open it.
let desc = PersistentLayerDesc::new_delta(
self.tenant_id,
self.timeline_id,
self.key_start..key_end,
self.lsn_range.clone(),
metadata.len(),
@@ -498,7 +510,7 @@ impl DeltaLayerWriterInner {
let layer = Layer::finish_creating(self.conf, timeline, desc, &self.path)?;
trace!("created delta layer {}", self.path);
trace!("created delta layer {}", layer.local_path());
Ok(layer)
}
@@ -537,20 +549,14 @@ impl DeltaLayerWriter {
pub async fn new(
conf: &'static PageServerConf,
timeline_id: TimelineId,
tenant_shard_id: TenantShardId,
tenant_id: TenantId,
key_start: Key,
lsn_range: Range<Lsn>,
) -> anyhow::Result<Self> {
Ok(Self {
inner: Some(
DeltaLayerWriterInner::new(
conf,
timeline_id,
tenant_shard_id,
key_start,
lsn_range,
)
.await?,
DeltaLayerWriterInner::new(conf, timeline_id, tenant_id, key_start, lsn_range)
.await?,
),
})
}
@@ -605,61 +611,6 @@ impl Drop for DeltaLayerWriter {
}
}
#[derive(thiserror::Error, Debug)]
pub enum RewriteSummaryError {
#[error("magic mismatch")]
MagicMismatch,
#[error(transparent)]
Other(#[from] anyhow::Error),
}
impl From<std::io::Error> for RewriteSummaryError {
fn from(e: std::io::Error) -> Self {
Self::Other(anyhow::anyhow!(e))
}
}
impl DeltaLayer {
pub async fn rewrite_summary<F>(
path: &Utf8Path,
rewrite: F,
ctx: &RequestContext,
) -> Result<(), RewriteSummaryError>
where
F: Fn(Summary) -> Summary,
{
let file = VirtualFile::open_with_options(
path,
&*std::fs::OpenOptions::new().read(true).write(true),
)
.await
.with_context(|| format!("Failed to open file '{}'", path))?;
let file = FileBlockReader::new(file);
let summary_blk = file.read_blk(0, ctx).await?;
let actual_summary = Summary::des_prefix(summary_blk.as_ref()).context("deserialize")?;
let mut file = file.file;
if actual_summary.magic != DELTA_FILE_MAGIC {
return Err(RewriteSummaryError::MagicMismatch);
}
let new_summary = rewrite(actual_summary);
let mut buf = smallvec::SmallVec::<[u8; PAGE_SZ]>::new();
Summary::ser_into(&new_summary, &mut buf).context("serialize")?;
if buf.spilled() {
// The code in DeltaLayerWriterInner just warn!()s for this.
// It should probably error out as well.
return Err(RewriteSummaryError::Other(anyhow::anyhow!(
"Used more than one page size for summary buffer: {}",
buf.len()
)));
}
file.seek(SeekFrom::Start(0)).await?;
file.write_all(&buf).await?;
Ok(())
}
}
impl DeltaLayerInner {
/// Returns nested result following Result<Result<_, OpErr>, Critical>:
/// - inner has the success or transient failure

View File

@@ -41,7 +41,6 @@ use bytes::Bytes;
use camino::{Utf8Path, Utf8PathBuf};
use hex;
use pageserver_api::models::LayerAccessKind;
use pageserver_api::shard::TenantShardId;
use rand::{distributions::Alphanumeric, Rng};
use serde::{Deserialize, Serialize};
use std::fs::File;
@@ -68,23 +67,34 @@ use super::{AsLayerDesc, Layer, PersistentLayerDesc, ResidentLayer};
/// the 'index' starts at the block indicated by 'index_start_blk'
///
#[derive(Debug, Serialize, Deserialize, PartialEq, Eq)]
pub struct Summary {
pub(super) struct Summary {
/// Magic value to identify this as a neon image file. Always IMAGE_FILE_MAGIC.
pub magic: u16,
pub format_version: u16,
magic: u16,
format_version: u16,
pub tenant_id: TenantId,
pub timeline_id: TimelineId,
pub key_range: Range<Key>,
pub lsn: Lsn,
tenant_id: TenantId,
timeline_id: TimelineId,
key_range: Range<Key>,
lsn: Lsn,
/// Block number where the 'index' part of the file begins.
pub index_start_blk: u32,
index_start_blk: u32,
/// Block within the 'index', where the B-tree root page is stored
pub index_root_blk: u32,
index_root_blk: u32,
// the 'values' part starts after the summary header, on block 1.
}
impl From<&ImageLayer> for Summary {
fn from(layer: &ImageLayer) -> Self {
Self::expected(
layer.desc.tenant_id,
layer.desc.timeline_id,
layer.desc.key_range.clone(),
layer.lsn,
)
}
}
impl Summary {
pub(super) fn expected(
tenant_id: TenantId,
@@ -207,7 +217,7 @@ impl ImageLayer {
fn temp_path_for(
conf: &PageServerConf,
timeline_id: TimelineId,
tenant_shard_id: TenantShardId,
tenant_id: TenantId,
fname: &ImageFileName,
) -> Utf8PathBuf {
let rand_string: String = rand::thread_rng()
@@ -216,7 +226,7 @@ impl ImageLayer {
.map(char::from)
.collect();
conf.timeline_path(&tenant_shard_id, &timeline_id)
conf.timeline_path(&tenant_id, &timeline_id)
.join(format!("{fname}.{rand_string}.{TEMP_FILE_SUFFIX}"))
}
@@ -266,10 +276,15 @@ impl ImageLayer {
let metadata = file
.metadata()
.context("get file metadata to determine size")?;
Ok(ImageLayer {
path: path.to_path_buf(),
desc: PersistentLayerDesc::new_img(summary.key_range, summary.lsn, metadata.len()), // Now we assume image layer ALWAYS covers the full range. This may change in the future.
desc: PersistentLayerDesc::new_img(
summary.tenant_id,
summary.timeline_id,
summary.key_range,
summary.lsn,
metadata.len(),
), // Now we assume image layer ALWAYS covers the full range. This may change in the future.
lsn: summary.lsn,
access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
inner: OnceCell::new(),
@@ -281,61 +296,6 @@ impl ImageLayer {
}
}
#[derive(thiserror::Error, Debug)]
pub enum RewriteSummaryError {
#[error("magic mismatch")]
MagicMismatch,
#[error(transparent)]
Other(#[from] anyhow::Error),
}
impl From<std::io::Error> for RewriteSummaryError {
fn from(e: std::io::Error) -> Self {
Self::Other(anyhow::anyhow!(e))
}
}
impl ImageLayer {
pub async fn rewrite_summary<F>(
path: &Utf8Path,
rewrite: F,
ctx: &RequestContext,
) -> Result<(), RewriteSummaryError>
where
F: Fn(Summary) -> Summary,
{
let file = VirtualFile::open_with_options(
path,
&*std::fs::OpenOptions::new().read(true).write(true),
)
.await
.with_context(|| format!("Failed to open file '{}'", path))?;
let file = FileBlockReader::new(file);
let summary_blk = file.read_blk(0, ctx).await?;
let actual_summary = Summary::des_prefix(summary_blk.as_ref()).context("deserialize")?;
let mut file = file.file;
if actual_summary.magic != IMAGE_FILE_MAGIC {
return Err(RewriteSummaryError::MagicMismatch);
}
let new_summary = rewrite(actual_summary);
let mut buf = smallvec::SmallVec::<[u8; PAGE_SZ]>::new();
Summary::ser_into(&new_summary, &mut buf).context("serialize")?;
if buf.spilled() {
// The code in ImageLayerWriterInner just warn!()s for this.
// It should probably error out as well.
return Err(RewriteSummaryError::Other(anyhow::anyhow!(
"Used more than one page size for summary buffer: {}",
buf.len()
)));
}
file.seek(SeekFrom::Start(0)).await?;
file.write_all(&buf).await?;
Ok(())
}
}
impl ImageLayerInner {
/// Returns nested result following Result<Result<_, OpErr>, Critical>:
/// - inner has the success or transient failure
@@ -440,7 +400,7 @@ struct ImageLayerWriterInner {
conf: &'static PageServerConf,
path: Utf8PathBuf,
timeline_id: TimelineId,
tenant_shard_id: TenantShardId,
tenant_id: TenantId,
key_range: Range<Key>,
lsn: Lsn,
@@ -455,7 +415,7 @@ impl ImageLayerWriterInner {
async fn new(
conf: &'static PageServerConf,
timeline_id: TimelineId,
tenant_shard_id: TenantShardId,
tenant_id: TenantId,
key_range: &Range<Key>,
lsn: Lsn,
) -> anyhow::Result<Self> {
@@ -464,7 +424,7 @@ impl ImageLayerWriterInner {
let path = ImageLayer::temp_path_for(
conf,
timeline_id,
tenant_shard_id,
tenant_id,
&ImageFileName {
key_range: key_range.clone(),
lsn,
@@ -488,7 +448,7 @@ impl ImageLayerWriterInner {
conf,
path,
timeline_id,
tenant_shard_id,
tenant_id,
key_range: key_range.clone(),
lsn,
tree: tree_builder,
@@ -535,7 +495,7 @@ impl ImageLayerWriterInner {
let summary = Summary {
magic: IMAGE_FILE_MAGIC,
format_version: STORAGE_FORMAT_VERSION,
tenant_id: self.tenant_shard_id.tenant_id,
tenant_id: self.tenant_id,
timeline_id: self.timeline_id,
key_range: self.key_range.clone(),
lsn: self.lsn,
@@ -560,7 +520,13 @@ impl ImageLayerWriterInner {
.await
.context("get metadata to determine file size")?;
let desc = PersistentLayerDesc::new_img(self.key_range.clone(), self.lsn, metadata.len());
let desc = PersistentLayerDesc::new_img(
self.tenant_id,
self.timeline_id,
self.key_range.clone(),
self.lsn,
metadata.len(),
);
// Note: Because we open the file in write-only mode, we cannot
// reuse the same VirtualFile for reading later. That's why we don't
@@ -572,7 +538,7 @@ impl ImageLayerWriterInner {
// FIXME: why not carry the virtualfile here, it supports renaming?
let layer = Layer::finish_creating(self.conf, timeline, desc, &self.path)?;
trace!("created image layer {}", self.path);
trace!("created image layer {}", layer.local_path());
Ok(layer)
}
@@ -611,14 +577,13 @@ impl ImageLayerWriter {
pub async fn new(
conf: &'static PageServerConf,
timeline_id: TimelineId,
tenant_shard_id: TenantShardId,
tenant_id: TenantId,
key_range: &Range<Key>,
lsn: Lsn,
) -> anyhow::Result<ImageLayerWriter> {
Ok(Self {
inner: Some(
ImageLayerWriterInner::new(conf, timeline_id, tenant_shard_id, key_range, lsn)
.await?,
ImageLayerWriterInner::new(conf, timeline_id, tenant_id, key_range, lsn).await?,
),
})
}

View File

@@ -14,11 +14,15 @@ use crate::tenant::Timeline;
use crate::walrecord;
use anyhow::{ensure, Result};
use pageserver_api::models::InMemoryLayerInfo;
use pageserver_api::shard::TenantShardId;
use std::collections::HashMap;
use std::sync::{Arc, OnceLock};
use tracing::*;
use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn, vec_map::VecMap};
use utils::{
bin_ser::BeSer,
id::{TenantId, TimelineId},
lsn::Lsn,
vec_map::VecMap,
};
// avoid binding to Write (conflicts with std::io::Write)
// while being able to use std::fmt::Write's methods
use std::fmt::Write as _;
@@ -29,7 +33,7 @@ use super::{DeltaLayerWriter, ResidentLayer};
pub struct InMemoryLayer {
conf: &'static PageServerConf,
tenant_shard_id: TenantShardId,
tenant_id: TenantId,
timeline_id: TimelineId,
/// This layer contains all the changes from 'start_lsn'. The
@@ -222,17 +226,17 @@ impl InMemoryLayer {
pub async fn create(
conf: &'static PageServerConf,
timeline_id: TimelineId,
tenant_shard_id: TenantShardId,
tenant_id: TenantId,
start_lsn: Lsn,
) -> Result<InMemoryLayer> {
trace!("initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}");
let file = EphemeralFile::create(conf, tenant_shard_id, timeline_id).await?;
let file = EphemeralFile::create(conf, tenant_id, timeline_id).await?;
Ok(InMemoryLayer {
conf,
timeline_id,
tenant_shard_id,
tenant_id,
start_lsn,
end_lsn: OnceLock::new(),
inner: RwLock::new(InMemoryLayerInner {
@@ -331,7 +335,7 @@ impl InMemoryLayer {
let mut delta_layer_writer = DeltaLayerWriter::new(
self.conf,
self.timeline_id,
self.tenant_shard_id,
self.tenant_id,
Key::MIN,
self.start_lsn..end_lsn,
)

View File

@@ -3,15 +3,13 @@ use camino::{Utf8Path, Utf8PathBuf};
use pageserver_api::models::{
HistoricLayerInfo, LayerAccessKind, LayerResidenceEventReason, LayerResidenceStatus,
};
use pageserver_api::shard::{ShardIndex, TenantShardId};
use pageserver_api::shard::ShardIndex;
use std::ops::Range;
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
use std::sync::{Arc, Weak};
use std::time::SystemTime;
use tracing::Instrument;
use utils::id::TimelineId;
use utils::lsn::Lsn;
use utils::sync::gate::GateError;
use utils::sync::heavier_once_cell;
use crate::config::PageServerConf;
@@ -83,7 +81,12 @@ impl Layer {
file_name: LayerFileName,
metadata: LayerFileMetadata,
) -> Self {
let desc = PersistentLayerDesc::from_filename(file_name, metadata.file_size());
let desc = PersistentLayerDesc::from_filename(
timeline.tenant_id,
timeline.timeline_id,
file_name,
metadata.file_size(),
);
let access_stats = LayerAccessStats::for_loading_layer(LayerResidenceStatus::Evicted);
@@ -97,7 +100,7 @@ impl Layer {
metadata.shard,
)));
debug_assert!(owner.0.needs_download_blocking(timeline).unwrap().is_some());
debug_assert!(owner.0.needs_download_blocking().unwrap().is_some());
owner
}
@@ -109,7 +112,12 @@ impl Layer {
file_name: LayerFileName,
metadata: LayerFileMetadata,
) -> ResidentLayer {
let desc = PersistentLayerDesc::from_filename(file_name, metadata.file_size());
let desc = PersistentLayerDesc::from_filename(
timeline.tenant_id,
timeline.timeline_id,
file_name,
metadata.file_size(),
);
let access_stats = LayerAccessStats::for_loading_layer(LayerResidenceStatus::Resident);
@@ -136,7 +144,7 @@ impl Layer {
let downloaded = resident.expect("just initialized");
debug_assert!(owner.0.needs_download_blocking(timeline).unwrap().is_none());
debug_assert!(owner.0.needs_download_blocking().unwrap().is_none());
timeline
.metrics
@@ -181,7 +189,7 @@ impl Layer {
let downloaded = resident.expect("just initialized");
// if the rename works, the path is as expected
std::fs::rename(temp_path, owner.local_path(timeline))
std::fs::rename(temp_path, owner.local_path())
.with_context(|| format!("rename temporary file as correct path for {owner}"))?;
Ok(ResidentLayer { downloaded, owner })
@@ -214,18 +222,14 @@ impl Layer {
///
/// [gc]: [`RemoteTimelineClient::schedule_gc_update`]
/// [compaction]: [`RemoteTimelineClient::schedule_compaction_update`]
pub(crate) fn delete_on_drop(&self) {
self.0.delete_on_drop();
pub(crate) fn garbage_collect_on_drop(&self) {
self.0.garbage_collect_on_drop();
}
/// Return data needed to reconstruct given page at LSN.
///
/// It is up to the caller to collect more data from the previous layer and
/// perform WAL redo, if necessary.
///
/// # Cancellation-Safety
///
/// This method is cancellation-safe.
pub(crate) async fn get_value_reconstruct_data(
&self,
key: Key,
@@ -301,12 +305,8 @@ impl Layer {
&self.0.access_stats
}
fn local_path(&self, timeline: &Timeline) -> Utf8PathBuf {
self.0.local_path(timeline)
}
pub(crate) fn filename(&self) -> LayerFileName {
self.0.desc.filename()
pub(crate) fn local_path(&self) -> &Utf8Path {
&self.0.path
}
pub(crate) fn metadata(&self) -> LayerFileMetadata {
@@ -327,10 +327,10 @@ impl Layer {
Ok(())
}
/// Waits until this layer has been dropped (and if needed, local file deletion and remote
/// Waits until this layer has been dropped (and if needed, local garbage collection and remote
/// deletion scheduling has completed).
///
/// Does not start local deletion, use [`Self::delete_on_drop`] for that
/// Does not start garbage collection, use [`Self::garbage_collect_on_drop`] for that
/// separatedly.
#[cfg(feature = "testing")]
pub(crate) fn wait_drop(&self) -> impl std::future::Future<Output = ()> + 'static {
@@ -398,9 +398,13 @@ impl ResidentOrWantedEvicted {
}
struct LayerInner {
/// Only needed to check ondemand_download_behavior_treat_error_as_warn and in [`Self::local_path_from_id`]
/// Only needed to check ondemand_download_behavior_treat_error_as_warn and creation of
/// [`Self::path`].
conf: &'static PageServerConf,
/// Full path to the file; unclear if this should exist anymore.
path: Utf8PathBuf,
desc: PersistentLayerDesc,
/// Timeline access is needed for remote timeline client and metrics.
@@ -415,8 +419,8 @@ struct LayerInner {
/// Initialization and deinitialization are done while holding a permit.
inner: heavier_once_cell::OnceCell<ResidentOrWantedEvicted>,
/// Do we want to delete locally and remotely this when `LayerInner` is dropped
wanted_deleted: AtomicBool,
/// Do we want to garbage collect this when `LayerInner` is dropped
wanted_garbage_collected: AtomicBool,
/// Do we want to evict this layer as soon as possible? After being set to `true`, all accesses
/// will try to downgrade [`ResidentOrWantedEvicted`], which will eventually trigger
@@ -430,6 +434,10 @@ struct LayerInner {
version: AtomicUsize,
/// Allow subscribing to when the layer actually gets evicted.
///
/// If in future we need to implement "wait until layer instances are gone and done", carrying
/// this over to the gc spawn_blocking from LayerInner::drop will do the trick, and adding a
/// method for "wait_gc" which will wait to this being closed.
status: tokio::sync::broadcast::Sender<Status>,
/// Counter for exponential backoff with the download
@@ -471,39 +479,19 @@ enum Status {
impl Drop for LayerInner {
fn drop(&mut self) {
if !*self.wanted_deleted.get_mut() {
if !*self.wanted_garbage_collected.get_mut() {
// should we try to evict if the last wish was for eviction?
// feels like there's some hazard of overcrowding near shutdown near by, but we don't
// run drops during shutdown (yet)
return;
}
// We will only do I/O on drop if our Timeline still exists. Otherwise, we may safely
// leave garbage layers behind to be cleaned up the next time this Timeline is instantiated.
let Some(timeline) = self.timeline.upgrade() else {
// no need to nag that timeline is gone: under normal situation on
// task_mgr::remove_tenant_from_memory the timeline is gone before we get dropped.
LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::TimelineGone);
return;
};
// We will only do I/O during drop if our Timeline's layer_gate is open: this avoids
// the risk that we would race with Timeline::shutdown and end up doing I/O to a timeline
// path for which the Timeline object has been torn down already.
let _gate_guard = match timeline.layer_gate.enter() {
Ok(g) => g,
Err(GateError::GateClosed) => {
LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::TimelineGone);
return;
}
};
// If timeline is alive, we can construct a span with IDs for this function.
let span = tracing::info_span!(parent: None, "layer_delete", tenant_id = %timeline.tenant_shard_id.tenant_id, shard_id=%timeline.tenant_shard_id.shard_slug(), timeline_id = %timeline.timeline_id);
let path = self.local_path(&timeline);
let span = tracing::info_span!(parent: None, "layer_gc", tenant_id = %self.layer_desc().tenant_id, timeline_id = %self.layer_desc().timeline_id);
let path = std::mem::take(&mut self.path);
let file_name = self.layer_desc().filename();
let file_size = self.layer_desc().file_size;
let timeline = self.timeline.clone();
let meta = self.metadata();
let status = self.status.clone();
@@ -525,32 +513,38 @@ impl Drop for LayerInner {
false
}
Err(e) => {
tracing::error!("failed to remove wanted deleted layer: {e}");
LAYER_IMPL_METRICS.inc_delete_removes_failed();
tracing::error!("failed to remove garbage collected layer: {e}");
LAYER_IMPL_METRICS.inc_gc_removes_failed();
false
}
};
if removed {
timeline.metrics.resident_physical_size_sub(file_size);
}
if let Some(remote_client) = timeline.remote_client.as_ref() {
let res = remote_client.schedule_deletion_of_unlinked(vec![(file_name, meta)]);
if let Err(e) = res {
// test_timeline_deletion_with_files_stuck_in_upload_queue is good at
// demonstrating this deadlock (without spawn_blocking): stop will drop
// queued items, which will have ResidentLayer's, and those drops would try
// to re-entrantly lock the RemoteTimelineClient inner state.
if !timeline.is_active() {
tracing::info!("scheduling deletion on drop failed: {e:#}");
} else {
tracing::warn!("scheduling deletion on drop failed: {e:#}");
}
LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::DeleteSchedulingFailed);
} else {
LAYER_IMPL_METRICS.inc_completed_deletes();
if let Some(timeline) = timeline.upgrade() {
if removed {
timeline.metrics.resident_physical_size_sub(file_size);
}
if let Some(remote_client) = timeline.remote_client.as_ref() {
let res = remote_client.schedule_deletion_of_unlinked(vec![(file_name, meta)]);
if let Err(e) = res {
// test_timeline_deletion_with_files_stuck_in_upload_queue is good at
// demonstrating this deadlock (without spawn_blocking): stop will drop
// queued items, which will have ResidentLayer's, and those drops would try
// to re-entrantly lock the RemoteTimelineClient inner state.
if !timeline.is_active() {
tracing::info!("scheduling deletion on drop failed: {e:#}");
} else {
tracing::warn!("scheduling deletion on drop failed: {e:#}");
}
LAYER_IMPL_METRICS.inc_gcs_failed(GcFailed::DeleteSchedulingFailed);
} else {
LAYER_IMPL_METRICS.inc_completed_gcs();
}
}
} else {
// no need to nag that timeline is gone: under normal situation on
// task_mgr::remove_tenant_from_memory the timeline is gone before we get dropped.
LAYER_IMPL_METRICS.inc_gcs_failed(GcFailed::TimelineGone);
}
});
}
@@ -566,6 +560,10 @@ impl LayerInner {
generation: Generation,
shard: ShardIndex,
) -> Self {
let path = conf
.timeline_path(&timeline.tenant_id, &timeline.timeline_id)
.join(desc.filename().to_string());
let (inner, version) = if let Some(inner) = downloaded {
let version = inner.version;
let resident = ResidentOrWantedEvicted::Resident(inner);
@@ -576,11 +574,12 @@ impl LayerInner {
LayerInner {
conf,
path,
desc,
timeline: Arc::downgrade(timeline),
have_remote_client: timeline.remote_client.is_some(),
access_stats,
wanted_deleted: AtomicBool::new(false),
wanted_garbage_collected: AtomicBool::new(false),
wanted_evicted: AtomicBool::new(false),
inner,
version: AtomicUsize::new(version),
@@ -591,32 +590,16 @@ impl LayerInner {
}
}
/// All call sites that need this function should already have a Timeline (e.g. from
/// upgrading the Self::timeline weak pointer) -- it doesn't make sense to try and
/// do anything with the local file if the Timeline isn't still alive.
fn local_path(&self, timeline: &Timeline) -> Utf8PathBuf {
self.local_path_from_id(&timeline.tenant_shard_id, &timeline.timeline_id)
}
/// Use this instead of `local_path` if you don't have a Timeline but do have its ID: this
/// is used by external callers such as [`crate::tenant::RemoteTimelineClient`]
pub(crate) fn local_path_from_id(
&self,
tenant_shard_id: &TenantShardId,
timeline_id: &TimelineId,
) -> Utf8PathBuf {
self.conf
.timeline_path(tenant_shard_id, timeline_id)
.join(self.desc.filename().to_string())
}
fn delete_on_drop(&self) {
let res =
self.wanted_deleted
.compare_exchange(false, true, Ordering::Release, Ordering::Relaxed);
fn garbage_collect_on_drop(&self) {
let res = self.wanted_garbage_collected.compare_exchange(
false,
true,
Ordering::Release,
Ordering::Relaxed,
);
if res.is_ok() {
LAYER_IMPL_METRICS.inc_started_deletes();
LAYER_IMPL_METRICS.inc_started_gcs();
}
}
@@ -684,10 +667,6 @@ impl LayerInner {
// disable any scheduled but not yet running eviction deletions for this
let next_version = 1 + self.version.fetch_add(1, Ordering::Relaxed);
// count cancellations, which currently remain largely unexpected
let init_cancelled =
scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled());
// no need to make the evict_and_wait wait for the actual download to complete
drop(self.status.send(Status::Downloaded));
@@ -696,14 +675,12 @@ impl LayerInner {
.upgrade()
.ok_or_else(|| DownloadError::TimelineShutdown)?;
// FIXME: grab a gate
let can_ever_evict = timeline.remote_client.as_ref().is_some();
// check if we really need to be downloaded; could have been already downloaded by a
// cancelled previous attempt.
let needs_download = self
.needs_download(&timeline)
.needs_download()
.await
.map_err(DownloadError::PreStatFailed)?;
@@ -758,8 +735,6 @@ impl LayerInner {
tracing::info!(waiters, "completing the on-demand download for other tasks");
}
scopeguard::ScopeGuard::into_inner(init_cancelled);
Ok((ResidentOrWantedEvicted::Resident(res), permit))
};
@@ -853,13 +828,12 @@ impl LayerInner {
// block tenant::mgr::remove_tenant_from_memory.
let this: Arc<Self> = self.clone();
let timeline_clone = timeline.clone();
crate::task_mgr::spawn(
&tokio::runtime::Handle::current(),
crate::task_mgr::TaskKind::RemoteDownloadTask,
Some(timeline.tenant_shard_id.tenant_id),
Some(timeline.timeline_id),
Some(self.desc.tenant_id),
Some(self.desc.timeline_id),
&task_name,
false,
async move {
@@ -889,13 +863,14 @@ impl LayerInner {
match res {
(Ok(()), _) => {
// our caller is cancellation safe so this is fine; if someone
// else requests the layer, they'll find it already downloaded.
// else requests the layer, they'll find it already downloaded
// or redownload.
//
// See counter [`LayerImplMetrics::inc_init_needed_no_download`]
//
// FIXME(#6028): however, could be that we should consider marking the
// layer for eviction? alas, cannot: because only DownloadedLayer will
// handle that.
// however, could be that we should consider marking the layer
// for eviction? alas, cannot: because only DownloadedLayer
// will handle that.
tracing::info!("layer file download completed after requester had cancelled");
LAYER_IMPL_METRICS.inc_download_completed_without_requester();
},
(Err(e), _) => {
// our caller is cancellation safe, but we might be racing with
@@ -915,7 +890,7 @@ impl LayerInner {
match rx.await {
Ok((Ok(()), permit)) => {
if let Some(reason) = self
.needs_download(&timeline_clone)
.needs_download()
.await
.map_err(DownloadError::PostStatFailed)?
{
@@ -950,26 +925,16 @@ impl LayerInner {
}
}
async fn needs_download(
&self,
timeline: &Timeline,
) -> Result<Option<NeedsDownload>, std::io::Error> {
let path = self.local_path(timeline);
match tokio::fs::metadata(path).await {
async fn needs_download(&self) -> Result<Option<NeedsDownload>, std::io::Error> {
match tokio::fs::metadata(&self.path).await {
Ok(m) => Ok(self.is_file_present_and_good_size(&m).err()),
Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(Some(NeedsDownload::NotFound)),
Err(e) => Err(e),
}
}
fn needs_download_blocking(
&self,
timeline: &Timeline,
) -> Result<Option<NeedsDownload>, std::io::Error> {
let path = self.local_path(timeline);
match path.metadata() {
fn needs_download_blocking(&self) -> Result<Option<NeedsDownload>, std::io::Error> {
match self.path.metadata() {
Ok(m) => Ok(self.is_file_present_and_good_size(&m).err()),
Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(Some(NeedsDownload::NotFound)),
Err(e) => Err(e),
@@ -1025,20 +990,14 @@ impl LayerInner {
/// `DownloadedLayer` is being dropped, so it calls this method.
fn on_downloaded_layer_drop(self: Arc<LayerInner>, version: usize) {
let delete = self.wanted_deleted.load(Ordering::Acquire);
let gc = self.wanted_garbage_collected.load(Ordering::Acquire);
let evict = self.wanted_evicted.load(Ordering::Acquire);
let can_evict = self.have_remote_client;
if delete {
// do nothing now, only in LayerInner::drop -- this was originally implemented because
// we could had already scheduled the deletion at the time.
//
// FIXME: this is not true anymore, we can safely evict wanted deleted files.
if gc {
// do nothing now, only in LayerInner::drop
} else if can_evict && evict {
// If timeline is alive, we can construct a span with IDs for this function.
let span = self.timeline.upgrade().map(|timeline| {
tracing::info_span!(parent: None, "layer_evict", tenant_id = %timeline.tenant_shard_id.tenant_id, shard_id=%timeline.tenant_shard_id.shard_slug(), timeline_id = %timeline.timeline_id)
});
let span = tracing::info_span!(parent: None, "layer_evict", tenant_id = %self.desc.tenant_id, timeline_id = %self.desc.timeline_id, layer=%self, %version);
// downgrade for queueing, in case there's a tear down already ongoing we should not
// hold it alive.
@@ -1049,9 +1008,9 @@ impl LayerInner {
// drop while the `self.inner` is being locked, leading to a deadlock.
crate::task_mgr::BACKGROUND_RUNTIME.spawn_blocking(move || {
let _g = span.map(|s| s.entered());
let _g = span.entered();
// if LayerInner is already dropped here, do nothing because the delete on drop
// if LayerInner is already dropped here, do nothing because the garbage collection
// has already ran while we were in queue
let Some(this) = this.upgrade() else {
LAYER_IMPL_METRICS.inc_eviction_cancelled(EvictionCancelled::LayerGone);
@@ -1109,9 +1068,7 @@ impl LayerInner {
LayerResidenceEventReason::ResidenceChange,
);
let local_path = self.local_path(&timeline);
let res = match capture_mtime_and_remove(&local_path) {
let res = match capture_mtime_and_remove(&self.path) {
Ok(local_layer_mtime) => {
let duration = SystemTime::now().duration_since(local_layer_mtime);
match duration {
@@ -1263,11 +1220,6 @@ impl DownloadedLayer {
owner: &Arc<LayerInner>,
ctx: &RequestContext,
) -> anyhow::Result<&'a LayerKind> {
let timeline = owner
.timeline
.upgrade()
.ok_or(DownloadError::TimelineShutdown)?;
let init = || async {
assert_eq!(
Weak::as_ptr(&self.owner),
@@ -1277,23 +1229,23 @@ impl DownloadedLayer {
let res = if owner.desc.is_delta {
let summary = Some(delta_layer::Summary::expected(
timeline.tenant_shard_id.tenant_id,
timeline.timeline_id,
owner.desc.tenant_id,
owner.desc.timeline_id,
owner.desc.key_range.clone(),
owner.desc.lsn_range.clone(),
));
delta_layer::DeltaLayerInner::load(&owner.local_path(&timeline), summary, ctx)
delta_layer::DeltaLayerInner::load(&owner.path, summary, ctx)
.await
.map(|res| res.map(LayerKind::Delta))
} else {
let lsn = owner.desc.image_layer_lsn();
let summary = Some(image_layer::Summary::expected(
timeline.tenant_shard_id.tenant_id,
timeline.timeline_id,
owner.desc.tenant_id,
owner.desc.timeline_id,
owner.desc.key_range.clone(),
lsn,
));
image_layer::ImageLayerInner::load(&owner.local_path(&timeline), lsn, summary, ctx)
image_layer::ImageLayerInner::load(&owner.path, lsn, summary, ctx)
.await
.map(|res| res.map(LayerKind::Image))
};
@@ -1417,14 +1369,8 @@ impl ResidentLayer {
}
}
pub(crate) fn local_path_from_id(
&self,
tenant_shard_id: &TenantShardId,
timeline_id: &TimelineId,
) -> Utf8PathBuf {
self.owner
.0
.local_path_from_id(tenant_shard_id, timeline_id)
pub(crate) fn local_path(&self) -> &Utf8Path {
&self.owner.0.path
}
pub(crate) fn access_stats(&self) -> &LayerAccessStats {
@@ -1455,38 +1401,36 @@ impl From<ResidentLayer> for Layer {
}
}
use metrics::IntCounter;
use metrics::{IntCounter, IntCounterVec};
pub(crate) struct LayerImplMetrics {
struct LayerImplMetrics {
started_evictions: IntCounter,
completed_evictions: IntCounter,
cancelled_evictions: enum_map::EnumMap<EvictionCancelled, IntCounter>,
cancelled_evictions: IntCounterVec,
started_deletes: IntCounter,
completed_deletes: IntCounter,
failed_deletes: enum_map::EnumMap<DeleteFailed, IntCounter>,
started_gcs: IntCounter,
completed_gcs: IntCounter,
failed_gcs: IntCounterVec,
rare_counters: enum_map::EnumMap<RareEvent, IntCounter>,
inits_cancelled: metrics::core::GenericCounter<metrics::core::AtomicU64>,
rare_counters: IntCounterVec,
}
impl Default for LayerImplMetrics {
fn default() -> Self {
use enum_map::Enum;
// reminder: these will be pageserver_layer_* with "_total" suffix
let started_evictions = metrics::register_int_counter!(
"pageserver_layer_started_evictions",
"Evictions started in the Layer implementation"
)
.unwrap();
let completed_evictions = metrics::register_int_counter!(
"pageserver_layer_completed_evictions",
"Evictions completed in the Layer implementation"
let evictions = metrics::register_int_counter_vec!(
"pageserver_layer_evictions_count",
"Evictions started and completed in the Layer implementation",
&["state"]
)
.unwrap();
let started_evictions = evictions
.get_metric_with_label_values(&["started"])
.unwrap();
let completed_evictions = evictions
.get_metric_with_label_values(&["completed"])
.unwrap();
let cancelled_evictions = metrics::register_int_counter_vec!(
"pageserver_layer_cancelled_evictions_count",
"Different reasons for evictions to have been cancelled or failed",
@@ -1494,36 +1438,24 @@ impl Default for LayerImplMetrics {
)
.unwrap();
let cancelled_evictions = enum_map::EnumMap::from_array(std::array::from_fn(|i| {
let reason = EvictionCancelled::from_usize(i);
let s = reason.as_str();
cancelled_evictions.with_label_values(&[s])
}));
let started_deletes = metrics::register_int_counter!(
"pageserver_layer_started_deletes",
"Deletions on drop pending in the Layer implementation"
)
.unwrap();
let completed_deletes = metrics::register_int_counter!(
"pageserver_layer_completed_deletes",
"Deletions on drop completed in the Layer implementation"
// reminder: this will be pageserver_layer_gcs_count_total with "_total" suffix
let gcs = metrics::register_int_counter_vec!(
"pageserver_layer_gcs_count",
"Garbage collections started and completed in the Layer implementation",
&["state"]
)
.unwrap();
let failed_deletes = metrics::register_int_counter_vec!(
"pageserver_layer_failed_deletes_count",
"Different reasons for deletions on drop to have failed",
let started_gcs = gcs.get_metric_with_label_values(&["pending"]).unwrap();
let completed_gcs = gcs.get_metric_with_label_values(&["completed"]).unwrap();
let failed_gcs = metrics::register_int_counter_vec!(
"pageserver_layer_failed_gcs_count",
"Different reasons for garbage collections to have failed",
&["reason"]
)
.unwrap();
let failed_deletes = enum_map::EnumMap::from_array(std::array::from_fn(|i| {
let reason = DeleteFailed::from_usize(i);
let s = reason.as_str();
failed_deletes.with_label_values(&[s])
}));
let rare_counters = metrics::register_int_counter_vec!(
"pageserver_layer_assumed_rare_count",
"Times unexpected or assumed rare event happened",
@@ -1531,29 +1463,16 @@ impl Default for LayerImplMetrics {
)
.unwrap();
let rare_counters = enum_map::EnumMap::from_array(std::array::from_fn(|i| {
let event = RareEvent::from_usize(i);
let s = event.as_str();
rare_counters.with_label_values(&[s])
}));
let inits_cancelled = metrics::register_int_counter!(
"pageserver_layer_inits_cancelled_count",
"Times Layer initialization was cancelled",
)
.unwrap();
Self {
started_evictions,
completed_evictions,
cancelled_evictions,
started_deletes,
completed_deletes,
failed_deletes,
started_gcs,
completed_gcs,
failed_gcs,
rare_counters,
inits_cancelled,
}
}
}
@@ -1566,33 +1485,57 @@ impl LayerImplMetrics {
self.completed_evictions.inc();
}
fn inc_eviction_cancelled(&self, reason: EvictionCancelled) {
self.cancelled_evictions[reason].inc()
self.cancelled_evictions
.get_metric_with_label_values(&[reason.as_str()])
.unwrap()
.inc()
}
fn inc_started_deletes(&self) {
self.started_deletes.inc();
fn inc_started_gcs(&self) {
self.started_gcs.inc();
}
fn inc_completed_deletes(&self) {
self.completed_deletes.inc();
fn inc_completed_gcs(&self) {
self.completed_gcs.inc();
}
fn inc_deletes_failed(&self, reason: DeleteFailed) {
self.failed_deletes[reason].inc();
fn inc_gcs_failed(&self, reason: GcFailed) {
self.failed_gcs
.get_metric_with_label_values(&[reason.as_str()])
.unwrap()
.inc();
}
/// Counted separatedly from failed layer deletes because we will complete the layer deletion
/// attempt regardless of failure to delete local file.
fn inc_delete_removes_failed(&self) {
self.rare_counters[RareEvent::RemoveOnDropFailed].inc();
/// Counted separatedly from failed gcs because we will complete the gc attempt regardless of
/// failure to delete local file.
fn inc_gc_removes_failed(&self) {
self.rare_counters
.get_metric_with_label_values(&["gc_remove_failed"])
.unwrap()
.inc();
}
/// Expected rare because requires a race with `evict_blocking` and `get_or_maybe_download`.
/// Expected rare because requires a race with `evict_blocking` and
/// `get_or_maybe_download`.
fn inc_retried_get_or_maybe_download(&self) {
self.rare_counters[RareEvent::RetriedGetOrMaybeDownload].inc();
self.rare_counters
.get_metric_with_label_values(&["retried_gomd"])
.unwrap()
.inc();
}
/// Expected rare because cancellations are unexpected, and failures are unexpected
/// Expected rare because cancellations are unexpected
fn inc_download_completed_without_requester(&self) {
self.rare_counters
.get_metric_with_label_values(&["download_completed_without"])
.unwrap()
.inc();
}
/// Expected rare because cancellations are unexpected
fn inc_download_failed_without_requester(&self) {
self.rare_counters[RareEvent::DownloadFailedWithoutRequester].inc();
self.rare_counters
.get_metric_with_label_values(&["download_failed_without"])
.unwrap()
.inc();
}
/// The Weak in ResidentOrWantedEvicted::WantedEvicted was successfully upgraded.
@@ -1600,30 +1543,37 @@ impl LayerImplMetrics {
/// If this counter is always zero, we should replace ResidentOrWantedEvicted type with an
/// Option.
fn inc_raced_wanted_evicted_accesses(&self) {
self.rare_counters[RareEvent::UpgradedWantedEvicted].inc();
self.rare_counters
.get_metric_with_label_values(&["raced_wanted_evicted"])
.unwrap()
.inc();
}
/// These are only expected for [`Self::inc_init_cancelled`] amount when
/// These are only expected for [`Self::inc_download_completed_without_requester`] amount when
/// running with remote storage.
fn inc_init_needed_no_download(&self) {
self.rare_counters[RareEvent::InitWithoutDownload].inc();
self.rare_counters
.get_metric_with_label_values(&["init_needed_no_download"])
.unwrap()
.inc();
}
/// Expected rare because all layer files should be readable and good
fn inc_permanent_loading_failures(&self) {
self.rare_counters[RareEvent::PermanentLoadingFailure].inc();
self.rare_counters
.get_metric_with_label_values(&["permanent_loading_failure"])
.unwrap()
.inc();
}
fn inc_broadcast_lagged(&self) {
self.rare_counters[RareEvent::EvictAndWaitLagged].inc();
}
fn inc_init_cancelled(&self) {
self.inits_cancelled.inc()
self.rare_counters
.get_metric_with_label_values(&["broadcast_lagged"])
.unwrap()
.inc();
}
}
#[derive(enum_map::Enum)]
enum EvictionCancelled {
LayerGone,
TimelineGone,
@@ -1652,47 +1602,19 @@ impl EvictionCancelled {
}
}
#[derive(enum_map::Enum)]
enum DeleteFailed {
enum GcFailed {
TimelineGone,
DeleteSchedulingFailed,
}
impl DeleteFailed {
impl GcFailed {
fn as_str(&self) -> &'static str {
match self {
DeleteFailed::TimelineGone => "timeline_gone",
DeleteFailed::DeleteSchedulingFailed => "delete_scheduling_failed",
GcFailed::TimelineGone => "timeline_gone",
GcFailed::DeleteSchedulingFailed => "delete_scheduling_failed",
}
}
}
#[derive(enum_map::Enum)]
enum RareEvent {
RemoveOnDropFailed,
RetriedGetOrMaybeDownload,
DownloadFailedWithoutRequester,
UpgradedWantedEvicted,
InitWithoutDownload,
PermanentLoadingFailure,
EvictAndWaitLagged,
}
impl RareEvent {
fn as_str(&self) -> &'static str {
use RareEvent::*;
match self {
RemoveOnDropFailed => "remove_on_drop_failed",
RetriedGetOrMaybeDownload => "retried_gomd",
DownloadFailedWithoutRequester => "download_failed_without",
UpgradedWantedEvicted => "raced_wanted_evicted",
InitWithoutDownload => "init_needed_no_download",
PermanentLoadingFailure => "permanent_loading_failure",
EvictAndWaitLagged => "broadcast_lagged",
}
}
}
pub(crate) static LAYER_IMPL_METRICS: once_cell::sync::Lazy<LayerImplMetrics> =
static LAYER_IMPL_METRICS: once_cell::sync::Lazy<LayerImplMetrics> =
once_cell::sync::Lazy::new(LayerImplMetrics::default);

View File

@@ -1,6 +1,9 @@
use core::fmt::Display;
use std::ops::Range;
use utils::lsn::Lsn;
use utils::{
id::{TenantId, TimelineId},
lsn::Lsn,
};
use crate::repository::Key;
@@ -13,6 +16,8 @@ use serde::{Deserialize, Serialize};
/// a unified way to generate layer information like file name.
#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)]
pub struct PersistentLayerDesc {
pub tenant_id: TenantId,
pub timeline_id: TimelineId,
/// Range of keys that this layer covers
pub key_range: Range<Key>,
/// Inclusive start, exclusive end of the LSN range that this layer holds.
@@ -51,6 +56,8 @@ impl PersistentLayerDesc {
#[cfg(test)]
pub fn new_test(key_range: Range<Key>) -> Self {
Self {
tenant_id: TenantId::generate(),
timeline_id: TimelineId::generate(),
key_range,
lsn_range: Lsn(0)..Lsn(1),
is_delta: false,
@@ -58,8 +65,16 @@ impl PersistentLayerDesc {
}
}
pub fn new_img(key_range: Range<Key>, lsn: Lsn, file_size: u64) -> Self {
pub fn new_img(
tenant_id: TenantId,
timeline_id: TimelineId,
key_range: Range<Key>,
lsn: Lsn,
file_size: u64,
) -> Self {
Self {
tenant_id,
timeline_id,
key_range,
lsn_range: Self::image_layer_lsn_range(lsn),
is_delta: false,
@@ -67,8 +82,16 @@ impl PersistentLayerDesc {
}
}
pub fn new_delta(key_range: Range<Key>, lsn_range: Range<Lsn>, file_size: u64) -> Self {
pub fn new_delta(
tenant_id: TenantId,
timeline_id: TimelineId,
key_range: Range<Key>,
lsn_range: Range<Lsn>,
file_size: u64,
) -> Self {
Self {
tenant_id,
timeline_id,
key_range,
lsn_range,
is_delta: true,
@@ -76,10 +99,19 @@ impl PersistentLayerDesc {
}
}
pub fn from_filename(filename: LayerFileName, file_size: u64) -> Self {
pub fn from_filename(
tenant_id: TenantId,
timeline_id: TimelineId,
filename: LayerFileName,
file_size: u64,
) -> Self {
match filename {
LayerFileName::Image(i) => Self::new_img(i.key_range, i.lsn, file_size),
LayerFileName::Delta(d) => Self::new_delta(d.key_range, d.lsn_range, file_size),
LayerFileName::Image(i) => {
Self::new_img(tenant_id, timeline_id, i.key_range, i.lsn, file_size)
}
LayerFileName::Delta(d) => {
Self::new_delta(tenant_id, timeline_id, d.key_range, d.lsn_range, file_size)
}
}
}
@@ -136,6 +168,14 @@ impl PersistentLayerDesc {
self.key_range.clone()
}
pub fn get_timeline_id(&self) -> TimelineId {
self.timeline_id
}
pub fn get_tenant_id(&self) -> TenantId {
self.tenant_id
}
/// Does this layer only contain some data for the key-range (incremental),
/// or does it contain a version of every page? This is important to know
/// for garbage collecting old layers: an incremental layer depends on
@@ -151,7 +191,9 @@ impl PersistentLayerDesc {
pub fn dump(&self) {
if self.is_delta {
println!(
"----- delta layer keys {}-{} lsn {}-{} is_incremental {} size {} ----",
"----- delta layer for ten {} tli {} keys {}-{} lsn {}-{} is_incremental {} size {} ----",
self.tenant_id,
self.timeline_id,
self.key_range.start,
self.key_range.end,
self.lsn_range.start,
@@ -161,7 +203,9 @@ impl PersistentLayerDesc {
);
} else {
println!(
"----- image layer key {}-{} at {} is_incremental {} size {} ----",
"----- image layer for ten {} tli {} key {}-{} at {} is_incremental {} size {} ----",
self.tenant_id,
self.timeline_id,
self.key_range.start,
self.key_range.end,
self.image_layer_lsn(),

View File

@@ -44,7 +44,6 @@ pub(crate) enum BackgroundLoopKind {
Eviction,
ConsumptionMetricsCollectMetrics,
ConsumptionMetricsSyntheticSizeWorker,
InitialLogicalSizeCalculation,
}
impl BackgroundLoopKind {
@@ -87,7 +86,7 @@ pub fn start_background_loops(
tenant: &Arc<Tenant>,
background_jobs_can_start: Option<&completion::Barrier>,
) {
let tenant_id = tenant.tenant_shard_id.tenant_id;
let tenant_id = tenant.tenant_id;
task_mgr::spawn(
BACKGROUND_RUNTIME.handle(),
TaskKind::Compaction,

File diff suppressed because it is too large Load Diff

View File

@@ -4,10 +4,13 @@ use std::{
};
use anyhow::Context;
use pageserver_api::{models::TimelineState, shard::TenantShardId};
use pageserver_api::models::TimelineState;
use tokio::sync::OwnedMutexGuard;
use tracing::{debug, error, info, instrument, warn, Instrument, Span};
use utils::{crashsafe, fs_ext, id::TimelineId};
use utils::{
crashsafe, fs_ext,
id::{TenantId, TimelineId},
};
use crate::{
config::PageServerConf,
@@ -21,6 +24,7 @@ use crate::{
},
CreateTimelineCause, DeleteTimelineError, Tenant,
},
InitializationOrder,
};
use super::{Timeline, TimelineResources};
@@ -43,7 +47,7 @@ async fn stop_tasks(timeline: &Timeline) -> Result<(), DeleteTimelineError> {
// Shut down the layer flush task before the remote client, as one depends on the other
task_mgr::shutdown_tasks(
Some(TaskKind::LayerFlushTask),
Some(timeline.tenant_shard_id.tenant_id),
Some(timeline.tenant_id),
Some(timeline.timeline_id),
)
.await;
@@ -69,12 +73,7 @@ async fn stop_tasks(timeline: &Timeline) -> Result<(), DeleteTimelineError> {
// NB: This and other delete_timeline calls do not run as a task_mgr task,
// so, they are not affected by this shutdown_tasks() call.
info!("waiting for timeline tasks to shutdown");
task_mgr::shutdown_tasks(
None,
Some(timeline.tenant_shard_id.tenant_id),
Some(timeline.timeline_id),
)
.await;
task_mgr::shutdown_tasks(None, Some(timeline.tenant_id), Some(timeline.timeline_id)).await;
fail::fail_point!("timeline-delete-before-index-deleted-at", |_| {
Err(anyhow::anyhow!(
@@ -126,7 +125,7 @@ async fn set_deleted_in_remote_index(timeline: &Timeline) -> Result<(), DeleteTi
// pub(super): documentation link
pub(super) async fn delete_local_layer_files(
conf: &PageServerConf,
tenant_shard_id: TenantShardId,
tenant_id: TenantId,
timeline: &Timeline,
) -> anyhow::Result<()> {
let guards = async { tokio::join!(timeline.gc_lock.lock(), timeline.compaction_lock.lock()) };
@@ -140,7 +139,7 @@ pub(super) async fn delete_local_layer_files(
// NB: storage_sync upload tasks that reference these layers have been cancelled
// by the caller.
let local_timeline_directory = conf.timeline_path(&tenant_shard_id, &timeline.timeline_id);
let local_timeline_directory = conf.timeline_path(&tenant_id, &timeline.timeline_id);
fail::fail_point!("timeline-delete-before-rm", |_| {
Err(anyhow::anyhow!("failpoint: timeline-delete-before-rm"))?
@@ -176,7 +175,7 @@ pub(super) async fn delete_local_layer_files(
return Ok(());
}
let metadata_path = conf.metadata_path(&tenant_shard_id, &timeline.timeline_id);
let metadata_path = conf.metadata_path(&tenant_id, &timeline.timeline_id);
for entry in walkdir::WalkDir::new(&local_timeline_directory).contents_first(true) {
#[cfg(feature = "testing")]
@@ -251,11 +250,11 @@ async fn delete_remote_layers_and_index(timeline: &Timeline) -> anyhow::Result<(
// (nothing can fail after its deletion)
async fn cleanup_remaining_timeline_fs_traces(
conf: &PageServerConf,
tenant_shard_id: TenantShardId,
tenant_id: TenantId,
timeline_id: TimelineId,
) -> anyhow::Result<()> {
// Remove local metadata
tokio::fs::remove_file(conf.metadata_path(&tenant_shard_id, &timeline_id))
tokio::fs::remove_file(conf.metadata_path(&tenant_id, &timeline_id))
.await
.or_else(fs_ext::ignore_not_found)
.context("remove metadata")?;
@@ -267,7 +266,7 @@ async fn cleanup_remaining_timeline_fs_traces(
});
// Remove timeline dir
tokio::fs::remove_dir(conf.timeline_path(&tenant_shard_id, &timeline_id))
tokio::fs::remove_dir(conf.timeline_path(&tenant_id, &timeline_id))
.await
.or_else(fs_ext::ignore_not_found)
.context("timeline dir")?;
@@ -282,7 +281,7 @@ async fn cleanup_remaining_timeline_fs_traces(
// to be reordered later and thus missed if a crash occurs.
// Note that we dont need to sync after mark file is removed
// because we can tolerate the case when mark file reappears on startup.
let timeline_path = conf.timelines_path(&tenant_shard_id);
let timeline_path = conf.timelines_path(&tenant_id);
crashsafe::fsync_async(timeline_path)
.await
.context("fsync_pre_mark_remove")?;
@@ -290,7 +289,7 @@ async fn cleanup_remaining_timeline_fs_traces(
// Remove delete mark
// TODO: once we are confident that no more exist in the field, remove this
// line. It cleans up a legacy marker file that might in rare cases be present.
tokio::fs::remove_file(conf.timeline_delete_mark_file_path(tenant_shard_id, timeline_id))
tokio::fs::remove_file(conf.timeline_delete_mark_file_path(tenant_id, timeline_id))
.await
.or_else(fs_ext::ignore_not_found)
.context("remove delete mark")
@@ -356,7 +355,7 @@ impl DeleteTimelineFlow {
// NB: If this fails half-way through, and is retried, the retry will go through
// all the same steps again. Make sure the code here is idempotent, and don't
// error out if some of the shutdown tasks have already been completed!
#[instrument(skip(tenant), fields(tenant_id=%tenant.tenant_shard_id.tenant_id, shard_id=%tenant.tenant_shard_id.shard_slug()))]
#[instrument(skip(tenant), fields(tenant_id=%tenant.tenant_id))]
pub async fn run(
tenant: &Arc<Tenant>,
timeline_id: TimelineId,
@@ -406,6 +405,7 @@ impl DeleteTimelineFlow {
local_metadata: &TimelineMetadata,
remote_client: Option<RemoteTimelineClient>,
deletion_queue_client: DeletionQueueClient,
init_order: Option<&InitializationOrder>,
) -> anyhow::Result<()> {
// Note: here we even skip populating layer map. Timeline is essentially uninitialized.
// RemoteTimelineClient is the only functioning part.
@@ -418,6 +418,7 @@ impl DeleteTimelineFlow {
remote_client,
deletion_queue_client,
},
init_order,
// Important. We dont pass ancestor above because it can be missing.
// Thus we need to skip the validation here.
CreateTimelineCause::Delete,
@@ -450,8 +451,7 @@ impl DeleteTimelineFlow {
timeline_id: TimelineId,
) -> anyhow::Result<()> {
let r =
cleanup_remaining_timeline_fs_traces(tenant.conf, tenant.tenant_shard_id, timeline_id)
.await;
cleanup_remaining_timeline_fs_traces(tenant.conf, tenant.tenant_id, timeline_id).await;
info!("Done");
r
}
@@ -522,13 +522,13 @@ impl DeleteTimelineFlow {
tenant: Arc<Tenant>,
timeline: Arc<Timeline>,
) {
let tenant_shard_id = timeline.tenant_shard_id;
let tenant_id = timeline.tenant_id;
let timeline_id = timeline.timeline_id;
task_mgr::spawn(
task_mgr::BACKGROUND_RUNTIME.handle(),
TaskKind::TimelineDeletionWorker,
Some(tenant_shard_id.tenant_id),
Some(tenant_id),
Some(timeline_id),
"timeline_delete",
false,
@@ -541,7 +541,7 @@ impl DeleteTimelineFlow {
}
.instrument({
let span =
tracing::info_span!(parent: None, "delete_timeline", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),timeline_id=%timeline_id);
tracing::info_span!(parent: None, "delete_timeline", tenant_id=%tenant_id, timeline_id=%timeline_id);
span.follows_from(Span::current());
span
}),
@@ -554,14 +554,13 @@ impl DeleteTimelineFlow {
tenant: &Tenant,
timeline: &Timeline,
) -> Result<(), DeleteTimelineError> {
delete_local_layer_files(conf, tenant.tenant_shard_id, timeline).await?;
delete_local_layer_files(conf, tenant.tenant_id, timeline).await?;
delete_remote_layers_and_index(timeline).await?;
pausable_failpoint!("in_progress_delete");
cleanup_remaining_timeline_fs_traces(conf, tenant.tenant_shard_id, timeline.timeline_id)
.await?;
cleanup_remaining_timeline_fs_traces(conf, tenant.tenant_id, timeline.timeline_id).await?;
remove_timeline_from_tenant(tenant, timeline.timeline_id, &guard).await?;

View File

@@ -60,12 +60,9 @@ impl Timeline {
task_mgr::spawn(
BACKGROUND_RUNTIME.handle(),
TaskKind::Eviction,
Some(self.tenant_shard_id.tenant_id),
Some(self.tenant_id),
Some(self.timeline_id),
&format!(
"layer eviction for {}/{}",
self.tenant_shard_id, self.timeline_id
),
&format!("layer eviction for {}/{}", self.tenant_id, self.timeline_id),
false,
async move {
let cancel = task_mgr::shutdown_token();
@@ -80,7 +77,7 @@ impl Timeline {
);
}
#[instrument(skip_all, fields(tenant_id = %self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id))]
#[instrument(skip_all, fields(tenant_id = %self.tenant_id, timeline_id = %self.timeline_id))]
async fn eviction_task(self: Arc<Self>, cancel: CancellationToken) {
use crate::tenant::tasks::random_init_delay;
{
@@ -343,7 +340,7 @@ impl Timeline {
// Make one of the tenant's timelines draw the short straw and run the calculation.
// The others wait until the calculation is done so that they take into account the
// imitated accesses that the winner made.
let tenant = match crate::tenant::mgr::get_tenant(self.tenant_shard_id.tenant_id, true) {
let tenant = match crate::tenant::mgr::get_tenant(self.tenant_id, true) {
Ok(t) => t,
Err(_) => {
return ControlFlow::Break(());

View File

@@ -1,9 +1,8 @@
use anyhow::{bail, ensure, Context, Result};
use pageserver_api::shard::TenantShardId;
use std::{collections::HashMap, sync::Arc};
use tracing::trace;
use utils::{
id::TimelineId,
id::{TenantId, TimelineId},
lsn::{AtomicLsn, Lsn},
};
@@ -33,11 +32,6 @@ impl LayerManager {
}
}
pub(crate) fn clear(&mut self) {
self.layer_map = LayerMap::default();
self.layer_fmgr.clear();
}
pub(crate) fn get_from_desc(&self, desc: &PersistentLayerDesc) -> Layer {
self.layer_fmgr.get_from_desc(desc)
}
@@ -79,7 +73,7 @@ impl LayerManager {
last_record_lsn: Lsn,
conf: &'static PageServerConf,
timeline_id: TimelineId,
tenant_shard_id: TenantShardId,
tenant_id: TenantId,
) -> Result<Arc<InMemoryLayer>> {
ensure!(lsn.is_aligned());
@@ -115,8 +109,7 @@ impl LayerManager {
lsn
);
let new_layer =
InMemoryLayer::create(conf, timeline_id, tenant_shard_id, start_lsn).await?;
let new_layer = InMemoryLayer::create(conf, timeline_id, tenant_id, start_lsn).await?;
let layer = Arc::new(new_layer);
self.layer_map.open_layer = Some(layer.clone());
@@ -248,7 +241,7 @@ impl LayerManager {
// map index without actually rebuilding the index.
updates.remove_historic(desc);
mapping.remove(layer);
layer.delete_on_drop();
layer.garbage_collect_on_drop();
}
pub(crate) fn contains(&self, layer: &Layer) -> bool {
@@ -276,10 +269,6 @@ impl<T: AsLayerDesc + Clone> LayerFileManager<T> {
}
}
pub(crate) fn clear(&mut self) {
self.0.clear();
}
pub(crate) fn contains(&self, layer: &T) -> bool {
self.0.contains_key(&layer.layer_desc().key())
}

View File

@@ -1,10 +1,11 @@
use anyhow::Context;
use once_cell::sync::OnceCell;
use tokio_util::sync::CancellationToken;
use tokio::sync::Semaphore;
use utils::lsn::Lsn;
use std::sync::atomic::{AtomicBool, AtomicI64, Ordering as AtomicOrdering};
use std::sync::atomic::{AtomicI64, Ordering as AtomicOrdering};
use std::sync::Arc;
/// Internal structure to hold all data needed for logical size calculation.
///
@@ -22,17 +23,10 @@ pub(super) struct LogicalSize {
///
/// NOTE: size at a given LSN is constant, but after a restart we will calculate
/// the initial size at a different LSN.
pub initial_logical_size: OnceCell<(
u64,
crate::metrics::initial_logical_size::FinishedCalculationGuard,
)>,
pub initial_logical_size: OnceCell<u64>,
/// Cancellation for the best-effort logical size calculation.
///
/// The token is kept in a once-cell so that we can error out if a higher priority
/// request comes in *before* we have started the normal logical size calculation.
pub(crate) cancel_wait_for_background_loop_concurrency_limit_semaphore:
OnceCell<CancellationToken>,
/// Semaphore to track ongoing calculation of `initial_logical_size`.
pub initial_size_computation: Arc<tokio::sync::Semaphore>,
/// Latest Lsn that has its size uncalculated, could be absent for freshly created timelines.
pub initial_part_end: Option<Lsn>,
@@ -58,57 +52,25 @@ pub(super) struct LogicalSize {
/// see `current_logical_size_gauge`. Use the `update_current_logical_size`
/// to modify this, it will also keep the prometheus metric in sync.
pub size_added_after_initial: AtomicI64,
/// For [`crate::metrics::initial_logical_size::TIMELINES_WHERE_WALRECEIVER_GOT_APPROXIMATE_SIZE`].
pub(super) did_return_approximate_to_walreceiver: AtomicBool,
}
/// Normalized current size, that the data in pageserver occupies.
#[derive(Debug, Clone, Copy)]
pub(crate) enum CurrentLogicalSize {
pub(super) enum CurrentLogicalSize {
/// The size is not yet calculated to the end, this is an intermediate result,
/// constructed from walreceiver increments and normalized: logical data could delete some objects, hence be negative,
/// yet total logical size cannot be below 0.
Approximate(Approximate),
Approximate(u64),
// Fully calculated logical size, only other future walreceiver increments are changing it, and those changes are
// available for observation without any calculations.
Exact(Exact),
}
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub(crate) enum Accuracy {
Approximate,
Exact,
}
#[derive(Debug, Clone, Copy)]
pub(crate) struct Approximate(u64);
#[derive(Debug, Clone, Copy)]
pub(crate) struct Exact(u64);
impl From<&Approximate> for u64 {
fn from(value: &Approximate) -> Self {
value.0
}
}
impl From<&Exact> for u64 {
fn from(val: &Exact) -> Self {
val.0
}
Exact(u64),
}
impl CurrentLogicalSize {
pub(crate) fn size_dont_care_about_accuracy(&self) -> u64 {
match self {
Self::Approximate(size) => size.into(),
Self::Exact(size) => size.into(),
}
}
pub(crate) fn accuracy(&self) -> Accuracy {
match self {
Self::Approximate(_) => Accuracy::Approximate,
Self::Exact(_) => Accuracy::Exact,
pub(super) fn size(&self) -> u64 {
*match self {
Self::Approximate(size) => size,
Self::Exact(size) => size,
}
}
}
@@ -116,42 +78,36 @@ impl CurrentLogicalSize {
impl LogicalSize {
pub(super) fn empty_initial() -> Self {
Self {
initial_logical_size: OnceCell::with_value((0, {
crate::metrics::initial_logical_size::START_CALCULATION
.first(crate::metrics::initial_logical_size::StartCircumstances::EmptyInitial)
.calculation_result_saved()
})),
cancel_wait_for_background_loop_concurrency_limit_semaphore: OnceCell::new(),
initial_logical_size: OnceCell::with_value(0),
// initial_logical_size already computed, so, don't admit any calculations
initial_size_computation: Arc::new(Semaphore::new(0)),
initial_part_end: None,
size_added_after_initial: AtomicI64::new(0),
did_return_approximate_to_walreceiver: AtomicBool::new(false),
}
}
pub(super) fn deferred_initial(compute_to: Lsn) -> Self {
Self {
initial_logical_size: OnceCell::new(),
cancel_wait_for_background_loop_concurrency_limit_semaphore: OnceCell::new(),
initial_size_computation: Arc::new(Semaphore::new(1)),
initial_part_end: Some(compute_to),
size_added_after_initial: AtomicI64::new(0),
did_return_approximate_to_walreceiver: AtomicBool::new(false),
}
}
pub(super) fn current_size(&self) -> CurrentLogicalSize {
pub(super) fn current_size(&self) -> anyhow::Result<CurrentLogicalSize> {
let size_increment: i64 = self.size_added_after_initial.load(AtomicOrdering::Acquire);
// ^^^ keep this type explicit so that the casts in this function break if
// we change the type.
match self.initial_logical_size.get() {
Some((initial_size, _)) => {
CurrentLogicalSize::Exact(Exact(initial_size.checked_add_signed(size_increment)
Some(initial_size) => {
initial_size.checked_add_signed(size_increment)
.with_context(|| format!("Overflow during logical size calculation, initial_size: {initial_size}, size_increment: {size_increment}"))
.unwrap()))
.map(CurrentLogicalSize::Exact)
}
None => {
let non_negative_size_increment = u64::try_from(size_increment).unwrap_or(0);
CurrentLogicalSize::Approximate(Approximate(non_negative_size_increment))
Ok(CurrentLogicalSize::Approximate(non_negative_size_increment))
}
}
}
@@ -165,7 +121,7 @@ impl LogicalSize {
/// available for re-use. This doesn't contain the incremental part.
pub(super) fn initialized_size(&self, lsn: Lsn) -> Option<u64> {
match self.initial_part_end {
Some(v) if v == lsn => self.initial_logical_size.get().map(|(s, _)| *s),
Some(v) if v == lsn => self.initial_logical_size.get().copied(),
_ => None,
}
}

View File

@@ -43,11 +43,11 @@ impl<'t> UninitializedTimeline<'t> {
/// The caller is responsible for activating the timeline (function `.activate()`).
pub(crate) fn finish_creation(mut self) -> anyhow::Result<Arc<Timeline>> {
let timeline_id = self.timeline_id;
let tenant_shard_id = self.owning_tenant.tenant_shard_id;
let tenant_id = self.owning_tenant.tenant_id;
if self.raw_timeline.is_none() {
return Err(anyhow::anyhow!(
"No timeline for initialization found for {tenant_shard_id}/{timeline_id}"
"No timeline for initialization found for {tenant_id}/{timeline_id}"
));
}
@@ -61,13 +61,13 @@ impl<'t> UninitializedTimeline<'t> {
anyhow::ensure!(
new_disk_consistent_lsn.is_valid(),
"new timeline {tenant_shard_id}/{timeline_id} has invalid disk_consistent_lsn"
"new timeline {tenant_id}/{timeline_id} has invalid disk_consistent_lsn"
);
let mut timelines = self.owning_tenant.timelines.lock().unwrap();
match timelines.entry(timeline_id) {
Entry::Occupied(_) => anyhow::bail!(
"Found freshly initialized timeline {tenant_shard_id}/{timeline_id} in the tenant map"
"Found freshly initialized timeline {tenant_id}/{timeline_id} in the tenant map"
),
Entry::Vacant(v) => {
// after taking here should be no fallible operations, because the drop guard will not
@@ -79,7 +79,7 @@ impl<'t> UninitializedTimeline<'t> {
// this should be an assertion.
uninit_mark.remove_uninit_mark().with_context(|| {
format!(
"Failed to remove uninit mark file for timeline {tenant_shard_id}/{timeline_id}"
"Failed to remove uninit mark file for timeline {tenant_id}/{timeline_id}"
)
})?;
v.insert(Arc::clone(&new_timeline));
@@ -134,7 +134,7 @@ impl<'t> UninitializedTimeline<'t> {
.with_context(|| {
format!(
"No raw timeline {}/{} found",
self.owning_tenant.tenant_shard_id, self.timeline_id
self.owning_tenant.tenant_id, self.timeline_id
)
})?
.0)
@@ -144,7 +144,7 @@ impl<'t> UninitializedTimeline<'t> {
impl Drop for UninitializedTimeline<'_> {
fn drop(&mut self) {
if let Some((_, uninit_mark)) = self.raw_timeline.take() {
let _entered = info_span!("drop_uninitialized_timeline", tenant_id = %self.owning_tenant.tenant_shard_id.tenant_id, shard_id = %self.owning_tenant.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id).entered();
let _entered = info_span!("drop_uninitialized_timeline", tenant_id = %self.owning_tenant.tenant_id, timeline_id = %self.timeline_id).entered();
error!("Timeline got dropped without initializing, cleaning its files");
cleanup_timeline_directory(uninit_mark);
}

View File

@@ -71,7 +71,7 @@ impl WalReceiver {
mut broker_client: BrokerClientChannel,
ctx: &RequestContext,
) -> Self {
let tenant_id = timeline.tenant_shard_id.tenant_id;
let tenant_id = timeline.tenant_id;
let timeline_id = timeline.timeline_id;
let walreceiver_ctx =
ctx.detached_child(TaskKind::WalReceiverManager, DownloadBehavior::Error);

View File

@@ -75,7 +75,7 @@ pub(super) async fn connection_manager_loop_step(
}
let id = TenantTimelineId {
tenant_id: connection_manager_state.timeline.tenant_shard_id.tenant_id,
tenant_id: connection_manager_state.timeline.tenant_id,
timeline_id: connection_manager_state.timeline.timeline_id,
};
@@ -388,7 +388,7 @@ struct BrokerSkTimeline {
impl ConnectionManagerState {
pub(super) fn new(timeline: Arc<Timeline>, conf: WalReceiverConf) -> Self {
let id = TenantTimelineId {
tenant_id: timeline.tenant_shard_id.tenant_id,
tenant_id: timeline.tenant_id,
timeline_id: timeline.timeline_id,
};
Self {

View File

@@ -163,7 +163,7 @@ pub(super) async fn handle_walreceiver_connection(
task_mgr::spawn(
WALRECEIVER_RUNTIME.handle(),
TaskKind::WalReceiverConnectionPoller,
Some(timeline.tenant_shard_id.tenant_id),
Some(timeline.tenant_id),
Some(timeline.timeline_id),
"walreceiver connection",
false,
@@ -396,15 +396,11 @@ pub(super) async fn handle_walreceiver_connection(
// Send the replication feedback message.
// Regular standby_status_update fields are put into this message.
let current_timeline_size = timeline
.get_current_logical_size(
crate::tenant::timeline::GetLogicalSizePriority::User,
&ctx,
)
// FIXME: https://github.com/neondatabase/neon/issues/5963
.size_dont_care_about_accuracy();
let (timeline_logical_size, _) = timeline
.get_current_logical_size(&ctx)
.context("Status update creation failed to get current logical size")?;
let status_update = PageserverFeedback {
current_timeline_size,
current_timeline_size: timeline_logical_size,
last_received_lsn,
disk_consistent_lsn,
remote_consistent_lsn,

View File

@@ -610,11 +610,9 @@ impl Drop for VirtualFile {
slot.recently_used.store(false, Ordering::Relaxed);
// there is also operation "close-by-replace" for closes done on eviction for
// comparison.
if let Some(fd) = slot_guard.file.take() {
STORAGE_IO_TIME_METRIC
.get(StorageIoOperation::Close)
.observe_closure_duration(|| drop(fd));
}
STORAGE_IO_TIME_METRIC
.get(StorageIoOperation::Close)
.observe_closure_duration(|| drop(slot_guard.file.take()));
}
}
}

View File

@@ -98,257 +98,260 @@ impl<'a> WalIngest<'a> {
self.checkpoint_modified = true;
}
match decoded.xl_rmid {
pg_constants::RM_HEAP_ID | pg_constants::RM_HEAP2_ID => {
// Heap AM records need some special handling, because they modify VM pages
// without registering them with the standard mechanism.
self.ingest_heapam_record(&mut buf, modification, decoded, ctx)
.await?;
}
pg_constants::RM_NEON_ID => {
self.ingest_neonrmgr_record(&mut buf, modification, decoded, ctx)
.await?;
}
// Handle other special record types
pg_constants::RM_SMGR_ID => {
let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
if info == pg_constants::XLOG_SMGR_CREATE {
let create = XlSmgrCreate::decode(&mut buf);
self.ingest_xlog_smgr_create(modification, &create, ctx)
.await?;
} else if info == pg_constants::XLOG_SMGR_TRUNCATE {
let truncate = XlSmgrTruncate::decode(&mut buf);
self.ingest_xlog_smgr_truncate(modification, &truncate, ctx)
.await?;
}
}
pg_constants::RM_DBASE_ID => {
let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
debug!(%info, pg_version=%self.timeline.pg_version, "handle RM_DBASE_ID");
if self.timeline.pg_version == 14 {
if info == postgres_ffi::v14::bindings::XLOG_DBASE_CREATE {
let createdb = XlCreateDatabase::decode(&mut buf);
debug!("XLOG_DBASE_CREATE v14");
self.ingest_xlog_dbase_create(modification, &createdb, ctx)
.await?;
} else if info == postgres_ffi::v14::bindings::XLOG_DBASE_DROP {
let dropdb = XlDropDatabase::decode(&mut buf);
for tablespace_id in dropdb.tablespace_ids {
trace!("Drop db {}, {}", tablespace_id, dropdb.db_id);
modification
.drop_dbdir(tablespace_id, dropdb.db_id, ctx)
.await?;
}
}
} else if self.timeline.pg_version == 15 {
if info == postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_WAL_LOG {
debug!("XLOG_DBASE_CREATE_WAL_LOG: noop");
} else if info == postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_FILE_COPY {
// The XLOG record was renamed between v14 and v15,
// but the record format is the same.
// So we can reuse XlCreateDatabase here.
debug!("XLOG_DBASE_CREATE_FILE_COPY");
let createdb = XlCreateDatabase::decode(&mut buf);
self.ingest_xlog_dbase_create(modification, &createdb, ctx)
.await?;
} else if info == postgres_ffi::v15::bindings::XLOG_DBASE_DROP {
let dropdb = XlDropDatabase::decode(&mut buf);
for tablespace_id in dropdb.tablespace_ids {
trace!("Drop db {}, {}", tablespace_id, dropdb.db_id);
modification
.drop_dbdir(tablespace_id, dropdb.db_id, ctx)
.await?;
}
}
} else if self.timeline.pg_version == 16 {
if info == postgres_ffi::v16::bindings::XLOG_DBASE_CREATE_WAL_LOG {
debug!("XLOG_DBASE_CREATE_WAL_LOG: noop");
} else if info == postgres_ffi::v16::bindings::XLOG_DBASE_CREATE_FILE_COPY {
// The XLOG record was renamed between v14 and v15,
// but the record format is the same.
// So we can reuse XlCreateDatabase here.
debug!("XLOG_DBASE_CREATE_FILE_COPY");
let createdb = XlCreateDatabase::decode(&mut buf);
self.ingest_xlog_dbase_create(modification, &createdb, ctx)
.await?;
} else if info == postgres_ffi::v16::bindings::XLOG_DBASE_DROP {
let dropdb = XlDropDatabase::decode(&mut buf);
for tablespace_id in dropdb.tablespace_ids {
trace!("Drop db {}, {}", tablespace_id, dropdb.db_id);
modification
.drop_dbdir(tablespace_id, dropdb.db_id, ctx)
.await?;
}
}
}
}
pg_constants::RM_TBLSPC_ID => {
trace!("XLOG_TBLSPC_CREATE/DROP is not handled yet");
}
pg_constants::RM_CLOG_ID => {
let info = decoded.xl_info & !pg_constants::XLR_INFO_MASK;
if info == pg_constants::CLOG_ZEROPAGE {
let pageno = buf.get_u32_le();
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
self.put_slru_page_image(
modification,
SlruKind::Clog,
segno,
rpageno,
ZERO_PAGE.clone(),
ctx,
)
.await?;
} else {
assert!(info == pg_constants::CLOG_TRUNCATE);
let xlrec = XlClogTruncate::decode(&mut buf);
self.ingest_clog_truncate_record(modification, &xlrec, ctx)
.await?;
}
}
pg_constants::RM_XACT_ID => {
let info = decoded.xl_info & pg_constants::XLOG_XACT_OPMASK;
if info == pg_constants::XLOG_XACT_COMMIT || info == pg_constants::XLOG_XACT_ABORT {
let parsed_xact =
XlXactParsedRecord::decode(&mut buf, decoded.xl_xid, decoded.xl_info);
self.ingest_xact_record(
modification,
&parsed_xact,
info == pg_constants::XLOG_XACT_COMMIT,
ctx,
)
.await?;
} else if info == pg_constants::XLOG_XACT_COMMIT_PREPARED
|| info == pg_constants::XLOG_XACT_ABORT_PREPARED
// Heap AM records need some special handling, because they modify VM pages
// without registering them with the standard mechanism.
if decoded.xl_rmid == pg_constants::RM_HEAP_ID
|| decoded.xl_rmid == pg_constants::RM_HEAP2_ID
{
self.ingest_heapam_record(&mut buf, modification, decoded, ctx)
.await?;
}
if decoded.xl_rmid == pg_constants::RM_NEON_ID {
self.ingest_neonrmgr_record(&mut buf, modification, decoded, ctx)
.await?;
}
// Handle other special record types
if decoded.xl_rmid == pg_constants::RM_SMGR_ID
&& (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK)
== pg_constants::XLOG_SMGR_CREATE
{
let create = XlSmgrCreate::decode(&mut buf);
self.ingest_xlog_smgr_create(modification, &create, ctx)
.await?;
} else if decoded.xl_rmid == pg_constants::RM_SMGR_ID
&& (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK)
== pg_constants::XLOG_SMGR_TRUNCATE
{
let truncate = XlSmgrTruncate::decode(&mut buf);
self.ingest_xlog_smgr_truncate(modification, &truncate, ctx)
.await?;
} else if decoded.xl_rmid == pg_constants::RM_DBASE_ID {
debug!(
"handle RM_DBASE_ID for Postgres version {:?}",
self.timeline.pg_version
);
if self.timeline.pg_version == 14 {
if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK)
== postgres_ffi::v14::bindings::XLOG_DBASE_CREATE
{
let parsed_xact =
XlXactParsedRecord::decode(&mut buf, decoded.xl_xid, decoded.xl_info);
self.ingest_xact_record(
modification,
&parsed_xact,
info == pg_constants::XLOG_XACT_COMMIT_PREPARED,
ctx,
)
.await?;
// Remove twophase file. see RemoveTwoPhaseFile() in postgres code
trace!(
"Drop twophaseFile for xid {} parsed_xact.xid {} here at {}",
decoded.xl_xid,
parsed_xact.xid,
lsn,
);
modification
.drop_twophase_file(parsed_xact.xid, ctx)
.await?;
} else if info == pg_constants::XLOG_XACT_PREPARE {
modification
.put_twophase_file(decoded.xl_xid, Bytes::copy_from_slice(&buf[..]), ctx)
.await?;
}
}
pg_constants::RM_MULTIXACT_ID => {
let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
let createdb = XlCreateDatabase::decode(&mut buf);
debug!("XLOG_DBASE_CREATE v14");
if info == pg_constants::XLOG_MULTIXACT_ZERO_OFF_PAGE {
let pageno = buf.get_u32_le();
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
self.put_slru_page_image(
modification,
SlruKind::MultiXactOffsets,
segno,
rpageno,
ZERO_PAGE.clone(),
ctx,
)
.await?;
} else if info == pg_constants::XLOG_MULTIXACT_ZERO_MEM_PAGE {
let pageno = buf.get_u32_le();
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
self.put_slru_page_image(
modification,
SlruKind::MultiXactMembers,
segno,
rpageno,
ZERO_PAGE.clone(),
ctx,
)
.await?;
} else if info == pg_constants::XLOG_MULTIXACT_CREATE_ID {
let xlrec = XlMultiXactCreate::decode(&mut buf);
self.ingest_multixact_create_record(modification, &xlrec)?;
} else if info == pg_constants::XLOG_MULTIXACT_TRUNCATE_ID {
let xlrec = XlMultiXactTruncate::decode(&mut buf);
self.ingest_multixact_truncate_record(modification, &xlrec, ctx)
self.ingest_xlog_dbase_create(modification, &createdb, ctx)
.await?;
}
}
pg_constants::RM_RELMAP_ID => {
let xlrec = XlRelmapUpdate::decode(&mut buf);
self.ingest_relmap_page(modification, &xlrec, decoded, ctx)
.await?;
}
pg_constants::RM_XLOG_ID => {
let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
if info == pg_constants::XLOG_NEXTOID {
let next_oid = buf.get_u32_le();
if self.checkpoint.nextOid != next_oid {
self.checkpoint.nextOid = next_oid;
self.checkpoint_modified = true;
}
} else if info == pg_constants::XLOG_CHECKPOINT_ONLINE
|| info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN
} else if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK)
== postgres_ffi::v14::bindings::XLOG_DBASE_DROP
{
let mut checkpoint_bytes = [0u8; SIZEOF_CHECKPOINT];
buf.copy_to_slice(&mut checkpoint_bytes);
let xlog_checkpoint = CheckPoint::decode(&checkpoint_bytes)?;
trace!(
"xlog_checkpoint.oldestXid={}, checkpoint.oldestXid={}",
xlog_checkpoint.oldestXid,
self.checkpoint.oldestXid
);
if (self
.checkpoint
.oldestXid
.wrapping_sub(xlog_checkpoint.oldestXid) as i32)
< 0
{
self.checkpoint.oldestXid = xlog_checkpoint.oldestXid;
self.checkpoint_modified = true;
let dropdb = XlDropDatabase::decode(&mut buf);
for tablespace_id in dropdb.tablespace_ids {
trace!("Drop db {}, {}", tablespace_id, dropdb.db_id);
modification
.drop_dbdir(tablespace_id, dropdb.db_id, ctx)
.await?;
}
}
} else if self.timeline.pg_version == 15 {
if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK)
== postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_WAL_LOG
{
debug!("XLOG_DBASE_CREATE_WAL_LOG: noop");
} else if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK)
== postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_FILE_COPY
{
// The XLOG record was renamed between v14 and v15,
// but the record format is the same.
// So we can reuse XlCreateDatabase here.
debug!("XLOG_DBASE_CREATE_FILE_COPY");
let createdb = XlCreateDatabase::decode(&mut buf);
self.ingest_xlog_dbase_create(modification, &createdb, ctx)
.await?;
} else if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK)
== postgres_ffi::v15::bindings::XLOG_DBASE_DROP
{
let dropdb = XlDropDatabase::decode(&mut buf);
for tablespace_id in dropdb.tablespace_ids {
trace!("Drop db {}, {}", tablespace_id, dropdb.db_id);
modification
.drop_dbdir(tablespace_id, dropdb.db_id, ctx)
.await?;
}
}
} else if self.timeline.pg_version == 16 {
if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK)
== postgres_ffi::v16::bindings::XLOG_DBASE_CREATE_WAL_LOG
{
debug!("XLOG_DBASE_CREATE_WAL_LOG: noop");
} else if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK)
== postgres_ffi::v16::bindings::XLOG_DBASE_CREATE_FILE_COPY
{
// The XLOG record was renamed between v14 and v15,
// but the record format is the same.
// So we can reuse XlCreateDatabase here.
debug!("XLOG_DBASE_CREATE_FILE_COPY");
let createdb = XlCreateDatabase::decode(&mut buf);
self.ingest_xlog_dbase_create(modification, &createdb, ctx)
.await?;
} else if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK)
== postgres_ffi::v16::bindings::XLOG_DBASE_DROP
{
let dropdb = XlDropDatabase::decode(&mut buf);
for tablespace_id in dropdb.tablespace_ids {
trace!("Drop db {}, {}", tablespace_id, dropdb.db_id);
modification
.drop_dbdir(tablespace_id, dropdb.db_id, ctx)
.await?;
}
}
}
pg_constants::RM_LOGICALMSG_ID => {
let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
} else if decoded.xl_rmid == pg_constants::RM_TBLSPC_ID {
trace!("XLOG_TBLSPC_CREATE/DROP is not handled yet");
} else if decoded.xl_rmid == pg_constants::RM_CLOG_ID {
let info = decoded.xl_info & !pg_constants::XLR_INFO_MASK;
if info == pg_constants::CLOG_ZEROPAGE {
let pageno = buf.get_u32_le();
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
self.put_slru_page_image(
modification,
SlruKind::Clog,
segno,
rpageno,
ZERO_PAGE.clone(),
ctx,
)
.await?;
} else {
assert!(info == pg_constants::CLOG_TRUNCATE);
let xlrec = XlClogTruncate::decode(&mut buf);
self.ingest_clog_truncate_record(modification, &xlrec, ctx)
.await?;
}
} else if decoded.xl_rmid == pg_constants::RM_XACT_ID {
let info = decoded.xl_info & pg_constants::XLOG_XACT_OPMASK;
if info == pg_constants::XLOG_XACT_COMMIT || info == pg_constants::XLOG_XACT_ABORT {
let parsed_xact =
XlXactParsedRecord::decode(&mut buf, decoded.xl_xid, decoded.xl_info);
self.ingest_xact_record(
modification,
&parsed_xact,
info == pg_constants::XLOG_XACT_COMMIT,
ctx,
)
.await?;
} else if info == pg_constants::XLOG_XACT_COMMIT_PREPARED
|| info == pg_constants::XLOG_XACT_ABORT_PREPARED
{
let parsed_xact =
XlXactParsedRecord::decode(&mut buf, decoded.xl_xid, decoded.xl_info);
self.ingest_xact_record(
modification,
&parsed_xact,
info == pg_constants::XLOG_XACT_COMMIT_PREPARED,
ctx,
)
.await?;
// Remove twophase file. see RemoveTwoPhaseFile() in postgres code
trace!(
"Drop twophaseFile for xid {} parsed_xact.xid {} here at {}",
decoded.xl_xid,
parsed_xact.xid,
lsn,
);
modification
.drop_twophase_file(parsed_xact.xid, ctx)
.await?;
} else if info == pg_constants::XLOG_XACT_PREPARE {
modification
.put_twophase_file(decoded.xl_xid, Bytes::copy_from_slice(&buf[..]), ctx)
.await?;
}
} else if decoded.xl_rmid == pg_constants::RM_MULTIXACT_ID {
let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
if info == pg_constants::XLOG_LOGICAL_MESSAGE {
let xlrec = XlLogicalMessage::decode(&mut buf);
let prefix = std::str::from_utf8(&buf[0..xlrec.prefix_size - 1])?;
let message = &buf[xlrec.prefix_size..xlrec.prefix_size + xlrec.message_size];
if prefix == "neon-test" {
// This is a convenient way to make the WAL ingestion pause at
// particular point in the WAL. For more fine-grained control,
// we could peek into the message and only pause if it contains
// a particular string, for example, but this is enough for now.
crate::failpoint_support::sleep_millis_async!(
"wal-ingest-logical-message-sleep"
);
} else if let Some(path) = prefix.strip_prefix("neon-file:") {
modification.put_file(path, message, ctx).await?;
}
if info == pg_constants::XLOG_MULTIXACT_ZERO_OFF_PAGE {
let pageno = buf.get_u32_le();
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
self.put_slru_page_image(
modification,
SlruKind::MultiXactOffsets,
segno,
rpageno,
ZERO_PAGE.clone(),
ctx,
)
.await?;
} else if info == pg_constants::XLOG_MULTIXACT_ZERO_MEM_PAGE {
let pageno = buf.get_u32_le();
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
self.put_slru_page_image(
modification,
SlruKind::MultiXactMembers,
segno,
rpageno,
ZERO_PAGE.clone(),
ctx,
)
.await?;
} else if info == pg_constants::XLOG_MULTIXACT_CREATE_ID {
let xlrec = XlMultiXactCreate::decode(&mut buf);
self.ingest_multixact_create_record(modification, &xlrec)?;
} else if info == pg_constants::XLOG_MULTIXACT_TRUNCATE_ID {
let xlrec = XlMultiXactTruncate::decode(&mut buf);
self.ingest_multixact_truncate_record(modification, &xlrec, ctx)
.await?;
}
} else if decoded.xl_rmid == pg_constants::RM_RELMAP_ID {
let xlrec = XlRelmapUpdate::decode(&mut buf);
self.ingest_relmap_page(modification, &xlrec, decoded, ctx)
.await?;
} else if decoded.xl_rmid == pg_constants::RM_XLOG_ID {
let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
if info == pg_constants::XLOG_NEXTOID {
let next_oid = buf.get_u32_le();
if self.checkpoint.nextOid != next_oid {
self.checkpoint.nextOid = next_oid;
self.checkpoint_modified = true;
}
} else if info == pg_constants::XLOG_CHECKPOINT_ONLINE
|| info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN
{
let mut checkpoint_bytes = [0u8; SIZEOF_CHECKPOINT];
buf.copy_to_slice(&mut checkpoint_bytes);
let xlog_checkpoint = CheckPoint::decode(&checkpoint_bytes)?;
trace!(
"xlog_checkpoint.oldestXid={}, checkpoint.oldestXid={}",
xlog_checkpoint.oldestXid,
self.checkpoint.oldestXid
);
if (self
.checkpoint
.oldestXid
.wrapping_sub(xlog_checkpoint.oldestXid) as i32)
< 0
{
self.checkpoint.oldestXid = xlog_checkpoint.oldestXid;
self.checkpoint_modified = true;
}
}
_x => {
// TODO: should probably log & fail here instead of blindly
// doing something without understanding the protocol
} else if decoded.xl_rmid == pg_constants::RM_LOGICALMSG_ID {
let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
if info == pg_constants::XLOG_LOGICAL_MESSAGE {
let xlrec = XlLogicalMessage::decode(&mut buf);
let prefix = std::str::from_utf8(&buf[0..xlrec.prefix_size - 1])?;
let message = &buf[xlrec.prefix_size..xlrec.prefix_size + xlrec.message_size];
if prefix == "neon-test" {
// This is a convenient way to make the WAL ingestion pause at
// particular point in the WAL. For more fine-grained control,
// we could peek into the message and only pause if it contains
// a particular string, for example, but this is enough for now.
crate::failpoint_support::sleep_millis_async!(
"wal-ingest-logical-message-sleep"
);
} else if let Some(path) = prefix.strip_prefix("neon-file:") {
modification.put_file(path, message, ctx).await?;
}
}
}
@@ -1437,16 +1440,7 @@ impl<'a> WalIngest<'a> {
// record.
// TODO: would be nice if to be more explicit about it
let last_lsn = modification.lsn;
// Get current size and put rel creation if rel doesn't exist
//
// NOTE: we check the cache first even though get_rel_exists and get_rel_size would
// check the cache too. This is because eagerly checking the cache results in
// less work overall and 10% better performance. It's more work on cache miss
// but cache miss is rare.
let old_nblocks = if let Some(nblocks) = self.timeline.get_cached_rel_size(&rel, last_lsn) {
nblocks
} else if !self
let old_nblocks = if !self
.timeline
.get_rel_exists(rel, last_lsn, true, ctx)
.await?
@@ -2124,7 +2118,7 @@ mod tests {
.load()
.await;
let tline = tenant
.bootstrap_timeline(TIMELINE_ID, pg_version, None, &ctx)
.bootstrap_timeline(TIMELINE_ID, pg_version, &ctx)
.await
.unwrap();

View File

@@ -34,20 +34,17 @@ use std::process::{Child, ChildStdin, ChildStdout, Command};
use std::sync::{Arc, Mutex, MutexGuard, RwLock};
use std::time::Duration;
use std::time::Instant;
use tokio_util::sync::CancellationToken;
use tracing::*;
use utils::{bin_ser::BeSer, id::TenantId, lsn::Lsn, nonblock::set_nonblock};
#[cfg(feature = "testing")]
use std::sync::atomic::{AtomicUsize, Ordering};
#[cfg(feature = "testing")]
use pageserver_api::shard::TenantShardId;
use crate::config::PageServerConf;
use crate::metrics::{
WalRedoKillCause, WAL_REDO_BYTES_HISTOGRAM, WAL_REDO_PROCESS_COUNTERS,
WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM, WAL_REDO_RECORDS_HISTOGRAM,
WAL_REDO_RECORD_COUNTER, WAL_REDO_TIME,
WAL_REDO_RECORDS_HISTOGRAM, WAL_REDO_RECORD_COUNTER, WAL_REDO_TIME,
};
use crate::pgdatadir_mapping::{key_to_rel_block, key_to_slru_block};
use crate::repository::Key;
@@ -123,9 +120,7 @@ impl PostgresRedoManager {
/// The WAL redo is handled by a separate thread, so this just sends a request
/// to the thread and waits for response.
///
/// # Cancel-Safety
///
/// This method is cancellation-safe.
/// CANCEL SAFETY: NOT CANCEL SAFE.
pub async fn request_redo(
&self,
key: Key,
@@ -158,6 +153,7 @@ impl PostgresRedoManager {
self.conf.wal_redo_timeout,
pg_version,
)
.await
};
img = Some(result?);
@@ -178,6 +174,7 @@ impl PostgresRedoManager {
self.conf.wal_redo_timeout,
pg_version,
)
.await
}
}
}
@@ -215,7 +212,7 @@ impl PostgresRedoManager {
/// Process one request for WAL redo using wal-redo postgres
///
#[allow(clippy::too_many_arguments)]
fn apply_batch_postgres(
async fn apply_batch_postgres(
&self,
key: Key,
lsn: Lsn,
@@ -241,13 +238,10 @@ impl PostgresRedoManager {
let mut proc_guard = self.redo_process.write().unwrap();
match &*proc_guard {
None => {
let timer =
WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.start_timer();
let proc = Arc::new(
WalRedoProcess::launch(self.conf, self.tenant_id, pg_version)
.context("launch walredo process")?,
);
timer.observe_duration();
*proc_guard = Some(Arc::clone(&proc));
proc
}
@@ -331,7 +325,12 @@ impl PostgresRedoManager {
// than we can SIGKILL & `wait` for them to exit. By doing it the way we do here,
// we limit this risk of run-away to at most $num_runtimes * $num_executor_threads.
// This probably needs revisiting at some later point.
let mut wait_done = proc.stderr_logger_task_done.clone();
drop(proc);
wait_done
.wait_for(|v| *v)
.await
.expect("we use scopeguard to ensure we always send `true` to the channel before dropping the sender");
} else if n_attempts != 0 {
info!(n_attempts, "retried walredo succeeded");
}
@@ -643,6 +642,8 @@ struct WalRedoProcess {
child: Option<NoLeakChild>,
stdout: Mutex<ProcessOutput>,
stdin: Mutex<ProcessInput>,
stderr_logger_cancel: CancellationToken,
stderr_logger_task_done: tokio::sync::watch::Receiver<bool>,
/// Counter to separate same sized walredo inputs failing at the same millisecond.
#[cfg(feature = "testing")]
dump_sequence: AtomicUsize,
@@ -691,8 +692,6 @@ impl WalRedoProcess {
let stdin = child.stdin.take().unwrap();
let stdout = child.stdout.take().unwrap();
let stderr = child.stderr.take().unwrap();
let stderr = tokio::process::ChildStderr::from_std(stderr)
.context("convert to tokio::ChildStderr")?;
macro_rules! set_nonblock_or_log_err {
($file:ident) => {{
let res = set_nonblock($file.as_raw_fd());
@@ -704,45 +703,69 @@ impl WalRedoProcess {
}
set_nonblock_or_log_err!(stdin)?;
set_nonblock_or_log_err!(stdout)?;
set_nonblock_or_log_err!(stderr)?;
let mut stderr = tokio::io::unix::AsyncFd::new(stderr).context("AsyncFd::with_interest")?;
// all fallible operations post-spawn are complete, so get rid of the guard
let child = scopeguard::ScopeGuard::into_inner(child);
tokio::spawn(
let stderr_logger_cancel = CancellationToken::new();
let (stderr_logger_task_done_tx, stderr_logger_task_done_rx) =
tokio::sync::watch::channel(false);
tokio::spawn({
let stderr_logger_cancel = stderr_logger_cancel.clone();
async move {
scopeguard::defer! {
debug!("wal-redo-postgres stderr_logger_task finished");
crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_finished.inc();
let _ = stderr_logger_task_done_tx.send(true);
}
debug!("wal-redo-postgres stderr_logger_task started");
crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_started.inc();
use tokio::io::AsyncBufReadExt;
let mut stderr_lines = tokio::io::BufReader::new(stderr);
let mut buf = Vec::new();
let res = loop {
buf.clear();
// TODO we don't trust the process to cap its stderr length.
// Currently it can do unbounded Vec allocation.
match stderr_lines.read_until(b'\n', &mut buf).await {
Ok(0) => break Ok(()), // eof
Ok(num_bytes) => {
let output = String::from_utf8_lossy(&buf[..num_bytes]);
error!(%output, "received output");
loop {
// NB: we purposefully don't do a select! for the cancellation here.
// The cancellation would likely cause us to miss stderr messages.
// We can rely on this to return from .await because when we SIGKILL
// the child, the writing end of the stderr pipe gets closed.
match stderr.readable_mut().await {
Ok(mut guard) => {
let mut errbuf = [0; 16384];
let res = guard.try_io(|fd| {
use std::io::Read;
fd.get_mut().read(&mut errbuf)
});
match res {
Ok(Ok(0)) => {
// it closed the stderr pipe
break;
}
Ok(Ok(n)) => {
// The message might not be split correctly into lines here. But this is
// good enough, the important thing is to get the message to the log.
let output = String::from_utf8_lossy(&errbuf[0..n]).to_string();
error!(output, "received output");
},
Ok(Err(e)) => {
error!(error = ?e, "read() error, waiting for cancellation");
stderr_logger_cancel.cancelled().await;
error!(error = ?e, "read() error, cancellation complete");
break;
}
Err(e) => {
let _e: tokio::io::unix::TryIoError = e;
// the read() returned WouldBlock, that's expected
}
}
}
Err(e) => {
break Err(e);
error!(error = ?e, "read() error, waiting for cancellation");
stderr_logger_cancel.cancelled().await;
error!(error = ?e, "read() error, cancellation complete");
break;
}
}
};
match res {
Ok(()) => (),
Err(e) => {
error!(error=?e, "failed to read from walredo stderr");
}
}
}.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_id, %pg_version))
);
});
Ok(Self {
conf,
@@ -757,6 +780,8 @@ impl WalRedoProcess {
pending_responses: VecDeque::new(),
n_processed_responses: 0,
}),
stderr_logger_cancel,
stderr_logger_task_done: stderr_logger_task_done_rx,
#[cfg(feature = "testing")]
dump_sequence: AtomicUsize::default(),
})
@@ -966,11 +991,7 @@ impl WalRedoProcess {
// these files will be collected to an allure report
let filename = format!("walredo-{millis}-{}-{seq}.walredo", writebuf.len());
// TODO(sharding): update this call when WalRedoProcess gets a TenantShardId.
let path = self
.conf
.tenant_path(&TenantShardId::unsharded(self.tenant_id))
.join(&filename);
let path = self.conf.tenant_path(&self.tenant_id).join(&filename);
let res = std::fs::OpenOptions::new()
.write(true)
@@ -997,6 +1018,7 @@ impl Drop for WalRedoProcess {
.take()
.expect("we only do this once")
.kill_and_wait(WalRedoKillCause::WalRedoProcessDrop);
self.stderr_logger_cancel.cancel();
// no way to wait for stderr_logger_task from Drop because that is async only
}
}

View File

@@ -59,7 +59,6 @@
#include "replication/walsender.h"
#include "storage/bufmgr.h"
#include "storage/buf_internals.h"
#include "storage/fsm_internals.h"
#include "storage/smgr.h"
#include "storage/md.h"
#include "pgstat.h"
@@ -2723,86 +2722,6 @@ smgr_init_neon(void)
}
static void
neon_extend_rel_size(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, XLogRecPtr end_recptr)
{
BlockNumber relsize;
/* Extend the relation if we know its size */
if (get_cached_relsize(rinfo, forknum, &relsize))
{
if (relsize < blkno + 1)
{
update_cached_relsize(rinfo, forknum, blkno + 1);
SetLastWrittenLSNForRelation(end_recptr, rinfo, forknum);
}
}
else
{
/*
* Size was not cached. We populate the cache now, with the size of the
* relation measured after this WAL record is applied.
*
* This length is later reused when we open the smgr to read the block,
* which is fine and expected.
*/
NeonResponse *response;
NeonNblocksResponse *nbresponse;
NeonNblocksRequest request = {
.req = (NeonRequest) {
.lsn = end_recptr,
.latest = false,
.tag = T_NeonNblocksRequest,
},
.rinfo = rinfo,
.forknum = forknum,
};
response = page_server_request(&request);
Assert(response->tag == T_NeonNblocksResponse);
nbresponse = (NeonNblocksResponse *) response;
relsize = Max(nbresponse->n_blocks, blkno+1);
set_cached_relsize(rinfo, forknum, relsize);
SetLastWrittenLSNForRelation(end_recptr, rinfo, forknum);
elog(SmgrTrace, "Set length to %d", relsize);
}
}
#define FSM_TREE_DEPTH ((SlotsPerFSMPage >= 1626) ? 3 : 4)
/*
* TODO: May be it is better to make correspondent fgunctio from freespace.c public?
*/
static BlockNumber
get_fsm_physical_block(BlockNumber heapblk)
{
BlockNumber pages;
int leafno;
int l;
/*
* Calculate the logical page number of the first leaf page below the
* given page.
*/
leafno = heapblk / SlotsPerFSMPage;
/* Count upper level nodes required to address the leaf page */
pages = 0;
for (l = 0; l < FSM_TREE_DEPTH; l++)
{
pages += leafno + 1;
leafno /= SlotsPerFSMPage;
}
/* Turn the page count into 0-based block number */
return pages - 1;
}
/*
* Return whether we can skip the redo for this block.
*
@@ -2850,6 +2769,7 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
LWLock *partitionLock;
Buffer buffer;
bool no_redo_needed;
BlockNumber relsize;
if (old_redo_read_buffer_filter && old_redo_read_buffer_filter(record, block_id))
return true;
@@ -2899,10 +2819,49 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
LWLockRelease(partitionLock);
neon_extend_rel_size(rinfo, forknum, blkno, end_recptr);
if (forknum == MAIN_FORKNUM)
/* Extend the relation if we know its size */
if (get_cached_relsize(rinfo, forknum, &relsize))
{
neon_extend_rel_size(rinfo, FSM_FORKNUM, get_fsm_physical_block(blkno), end_recptr);
if (relsize < blkno + 1)
{
update_cached_relsize(rinfo, forknum, blkno + 1);
SetLastWrittenLSNForRelation(end_recptr, rinfo, forknum);
}
}
else
{
/*
* Size was not cached. We populate the cache now, with the size of the
* relation measured after this WAL record is applied.
*
* This length is later reused when we open the smgr to read the block,
* which is fine and expected.
*/
NeonResponse *response;
NeonNblocksResponse *nbresponse;
NeonNblocksRequest request = {
.req = (NeonRequest) {
.lsn = end_recptr,
.latest = false,
.tag = T_NeonNblocksRequest,
},
.rinfo = rinfo,
.forknum = forknum,
};
response = page_server_request(&request);
Assert(response->tag == T_NeonNblocksResponse);
nbresponse = (NeonNblocksResponse *) response;
Assert(nbresponse->n_blocks > blkno);
set_cached_relsize(rinfo, forknum, nbresponse->n_blocks);
SetLastWrittenLSNForRelation(end_recptr, rinfo, forknum);
elog(SmgrTrace, "Set length to %d", nbresponse->n_blocks);
}
return no_redo_needed;
}

20
poetry.lock generated
View File

@@ -1967,18 +1967,18 @@ pytest = [
[[package]]
name = "pytest-rerunfailures"
version = "13.0"
version = "11.1.2"
description = "pytest plugin to re-run tests to eliminate flaky failures"
optional = false
python-versions = ">=3.7"
files = [
{file = "pytest-rerunfailures-13.0.tar.gz", hash = "sha256:e132dbe420bc476f544b96e7036edd0a69707574209b6677263c950d19b09199"},
{file = "pytest_rerunfailures-13.0-py3-none-any.whl", hash = "sha256:34919cb3fcb1f8e5d4b940aa75ccdea9661bade925091873b7c6fa5548333069"},
{file = "pytest-rerunfailures-11.1.2.tar.gz", hash = "sha256:55611661e873f1cafa384c82f08d07883954f4b76435f4b8a5b470c1954573de"},
{file = "pytest_rerunfailures-11.1.2-py3-none-any.whl", hash = "sha256:d21fe2e46d9774f8ad95f1aa799544ae95cac3a223477af94aa985adfae92b7e"},
]
[package.dependencies]
packaging = ">=17.1"
pytest = ">=7"
pytest = ">=5.3"
[[package]]
name = "pytest-split"
@@ -2476,6 +2476,16 @@ files = [
{file = "wrapt-1.14.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8ad85f7f4e20964db4daadcab70b47ab05c7c1cf2a7c1e51087bfaa83831854c"},
{file = "wrapt-1.14.1-cp310-cp310-win32.whl", hash = "sha256:a9a52172be0b5aae932bef82a79ec0a0ce87288c7d132946d645eba03f0ad8a8"},
{file = "wrapt-1.14.1-cp310-cp310-win_amd64.whl", hash = "sha256:6d323e1554b3d22cfc03cd3243b5bb815a51f5249fdcbb86fda4bf62bab9e164"},
{file = "wrapt-1.14.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ecee4132c6cd2ce5308e21672015ddfed1ff975ad0ac8d27168ea82e71413f55"},
{file = "wrapt-1.14.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2020f391008ef874c6d9e208b24f28e31bcb85ccff4f335f15a3251d222b92d9"},
{file = "wrapt-1.14.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2feecf86e1f7a86517cab34ae6c2f081fd2d0dac860cb0c0ded96d799d20b335"},
{file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:240b1686f38ae665d1b15475966fe0472f78e71b1b4903c143a842659c8e4cb9"},
{file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9008dad07d71f68487c91e96579c8567c98ca4c3881b9b113bc7b33e9fd78b8"},
{file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6447e9f3ba72f8e2b985a1da758767698efa72723d5b59accefd716e9e8272bf"},
{file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:acae32e13a4153809db37405f5eba5bac5fbe2e2ba61ab227926a22901051c0a"},
{file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:49ef582b7a1152ae2766557f0550a9fcbf7bbd76f43fbdc94dd3bf07cc7168be"},
{file = "wrapt-1.14.1-cp311-cp311-win32.whl", hash = "sha256:358fe87cc899c6bb0ddc185bf3dbfa4ba646f05b1b0b9b5a27c2cb92c2cea204"},
{file = "wrapt-1.14.1-cp311-cp311-win_amd64.whl", hash = "sha256:26046cd03936ae745a502abf44dac702a5e6880b2b01c29aea8ddf3353b68224"},
{file = "wrapt-1.14.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:43ca3bbbe97af00f49efb06e352eae40434ca9d915906f77def219b88e85d907"},
{file = "wrapt-1.14.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:6b1a564e6cb69922c7fe3a678b9f9a3c54e72b469875aa8018f18b4d1dd1adf3"},
{file = "wrapt-1.14.1-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3"},
@@ -2697,4 +2707,4 @@ cffi = ["cffi (>=1.11)"]
[metadata]
lock-version = "2.0"
python-versions = "^3.9"
content-hash = "9f33b4404dbb9803ede5785469241dde1d09132427b87db8928bdbc37ccd6b7a"
content-hash = "25ffa9ed98d890a3b85e6036792296a60bb705e8f9eaa1f07336501116a58756"

View File

@@ -24,7 +24,6 @@ hostname.workspace = true
humantime.workspace = true
hyper-tungstenite.workspace = true
hyper.workspace = true
ipnet.workspace = true
itertools.workspace = true
md5.workspace = true
metrics.workspace = true
@@ -69,7 +68,6 @@ webpki-roots.workspace = true
x509-parser.workspace = true
native-tls.workspace = true
postgres-native-tls.workspace = true
smol_str.workspace = true
workspace_hack.workspace = true
tokio-util.workspace = true

View File

@@ -4,7 +4,7 @@ pub mod backend;
pub use backend::BackendType;
mod credentials;
pub use credentials::{check_peer_addr_is_in_list, ClientCredentials};
pub use credentials::ClientCredentials;
mod password_hack;
pub use password_hack::parse_endpoint_param;
@@ -56,12 +56,6 @@ pub enum AuthErrorImpl {
/// Errors produced by e.g. [`crate::stream::PqStream`].
#[error(transparent)]
Io(#[from] io::Error),
#[error(
"This IP address is not allowed to connect to this endpoint. \
Please add it to the allowed list in the Neon console."
)]
IpAddressNotAllowed,
}
#[derive(Debug, Error)]
@@ -76,10 +70,6 @@ impl AuthError {
pub fn auth_failed(user: impl Into<Box<str>>) -> Self {
AuthErrorImpl::AuthFailed(user.into()).into()
}
pub fn ip_address_not_allowed() -> Self {
AuthErrorImpl::IpAddressNotAllowed.into()
}
}
impl<E: Into<AuthErrorImpl>> From<E> for AuthError {
@@ -101,7 +91,6 @@ impl UserFacingError for AuthError {
MalformedPassword(_) => self.to_string(),
MissingEndpointName => self.to_string(),
Io(_) => "Internal error".to_string(),
IpAddressNotAllowed => self.to_string(),
}
}
}

View File

@@ -5,12 +5,7 @@ mod link;
pub use link::LinkAuthError;
use tokio_postgres::config::AuthKeys;
use crate::auth::credentials::check_peer_addr_is_in_list;
use crate::console::errors::GetAuthInfoError;
use crate::console::provider::AuthInfo;
use crate::console::AuthSecret;
use crate::proxy::{handle_try_wake, retry_after, LatencyTimer};
use crate::scram;
use crate::stream::Stream;
use crate::{
auth::{self, ClientCredentials},
@@ -25,7 +20,6 @@ use crate::{
use futures::TryFutureExt;
use std::borrow::Cow;
use std::ops::ControlFlow;
use std::sync::Arc;
use tokio::io::{AsyncRead, AsyncWrite};
use tracing::{error, info, warn};
@@ -70,7 +64,6 @@ pub enum BackendType<'a, T> {
pub trait TestBackend: Send + Sync + 'static {
fn wake_compute(&self) -> Result<CachedNodeInfo, console::errors::WakeComputeError>;
fn get_allowed_ips(&self) -> Result<Arc<Vec<String>>, console::errors::GetAuthInfoError>;
}
impl std::fmt::Display for BackendType<'_, ()> {
@@ -147,38 +140,14 @@ async fn auth_quirks_creds(
// If there's no project so far, that entails that client doesn't
// support SNI or other means of passing the endpoint (project) name.
// We now expect to see a very specific payload in the place of password.
let maybe_success = if creds.project.is_none() {
if creds.project.is_none() {
// Password will be checked by the compute node later.
Some(hacks::password_hack(creds, client, latency_timer).await?)
} else {
None
};
return hacks::password_hack(creds, client, latency_timer).await;
}
// Password hack should set the project name.
// TODO: make `creds.project` more type-safe.
assert!(creds.project.is_some());
info!("fetching user's authentication info");
// TODO(anna): this will slow down both "hacks" below; we probably need a cache.
let AuthInfo {
secret,
allowed_ips,
} = api.get_auth_info(extra, creds).await?;
// check allowed list
if !check_peer_addr_is_in_list(&creds.peer_addr.ip(), &allowed_ips) {
return Err(auth::AuthError::ip_address_not_allowed());
}
let secret = secret.unwrap_or_else(|| {
// If we don't have an authentication secret, we mock one to
// prevent malicious probing (possible due to missing protocol steps).
// This mocked secret will never lead to successful authentication.
info!("authentication info not found, mocking it");
AuthSecret::Scram(scram::ServerSecret::mock(creds.user, rand::random()))
});
if let Some(success) = maybe_success {
return Ok(success);
}
// Perform cleartext auth if we're allowed to do that.
// Currently, we use it for websocket connections (latency).
@@ -188,7 +157,7 @@ async fn auth_quirks_creds(
}
// Finally, proceed with the main auth flow (SCRAM-based).
classic::authenticate(creds, client, config, latency_timer, secret).await
classic::authenticate(api, extra, creds, client, config, latency_timer).await
}
/// True to its name, this function encapsulates our current auth trade-offs.
@@ -336,19 +305,6 @@ impl BackendType<'_, ClientCredentials<'_>> {
Ok(res)
}
pub async fn get_allowed_ips(
&self,
extra: &ConsoleReqExtra<'_>,
) -> Result<Arc<Vec<String>>, GetAuthInfoError> {
use BackendType::*;
match self {
Console(api, creds) => api.get_allowed_ips(extra, creds).await,
Postgres(api, creds) => api.get_allowed_ips(extra, creds).await,
Link(_) => Ok(Arc::new(vec![])),
Test(x) => x.get_allowed_ips(),
}
}
/// When applicable, wake the compute node, gaining its connection info in the process.
/// The link auth flow doesn't support this, so we return [`None`] in that case.
pub async fn wake_compute(

View File

@@ -3,28 +3,38 @@ use crate::{
auth::{self, AuthFlow, ClientCredentials},
compute,
config::AuthenticationConfig,
console::AuthSecret,
console::{self, AuthInfo, ConsoleReqExtra},
proxy::LatencyTimer,
sasl,
sasl, scram,
stream::{PqStream, Stream},
};
use tokio::io::{AsyncRead, AsyncWrite};
use tracing::{info, warn};
pub(super) async fn authenticate(
api: &impl console::Api,
extra: &ConsoleReqExtra<'_>,
creds: &ClientCredentials<'_>,
client: &mut PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
config: &'static AuthenticationConfig,
latency_timer: &mut LatencyTimer,
secret: AuthSecret,
) -> auth::Result<AuthSuccess<ComputeCredentials>> {
info!("fetching user's authentication info");
let info = api.get_auth_info(extra, creds).await?.unwrap_or_else(|| {
// If we don't have an authentication secret, we mock one to
// prevent malicious probing (possible due to missing protocol steps).
// This mocked secret will never lead to successful authentication.
info!("authentication info not found, mocking it");
AuthInfo::Scram(scram::ServerSecret::mock(creds.user, rand::random()))
});
let flow = AuthFlow::new(client);
let scram_keys = match secret {
AuthSecret::Md5(_) => {
let scram_keys = match info {
AuthInfo::Md5(_) => {
info!("auth endpoint chooses MD5");
return Err(auth::AuthError::bad_auth_method("MD5"));
}
AuthSecret::Scram(secret) => {
AuthInfo::Scram(secret) => {
info!("auth endpoint chooses SCRAM");
let scram = auth::Scram(&secret);

View File

@@ -106,7 +106,7 @@ pub(super) async fn authenticate(
reported_auth_ok: true,
value: NodeInfo {
config,
aux: db_info.aux,
aux: db_info.aux.into(),
allow_self_signed_compute: false, // caller may override
},
})

View File

@@ -7,12 +7,9 @@ use crate::{
};
use itertools::Itertools;
use pq_proto::StartupMessageParams;
use std::{
collections::HashSet,
net::{IpAddr, SocketAddr},
};
use std::collections::HashSet;
use thiserror::Error;
use tracing::{info, warn};
use tracing::info;
#[derive(Debug, Error, PartialEq, Eq, Clone)]
pub enum ClientCredsParseError {
@@ -47,7 +44,6 @@ pub struct ClientCredentials<'a> {
pub project: Option<String>,
pub cache_key: String,
pub peer_addr: SocketAddr,
}
impl ClientCredentials<'_> {
@@ -58,11 +54,19 @@ impl ClientCredentials<'_> {
}
impl<'a> ClientCredentials<'a> {
#[cfg(test)]
pub fn new_noop() -> Self {
ClientCredentials {
user: "",
project: None,
cache_key: "".to_string(),
}
}
pub fn parse(
params: &'a StartupMessageParams,
sni: Option<&str>,
common_names: Option<HashSet<String>>,
peer_addr: SocketAddr,
) -> Result<Self, ClientCredsParseError> {
use ClientCredsParseError::*;
@@ -149,59 +153,10 @@ impl<'a> ClientCredentials<'a> {
user,
project,
cache_key,
peer_addr,
})
}
}
pub fn check_peer_addr_is_in_list(peer_addr: &IpAddr, ip_list: &Vec<String>) -> bool {
if ip_list.is_empty() {
return true;
}
for ip in ip_list {
// We expect that all ip addresses from control plane are correct.
// However, if some of them are broken, we still can check the others.
match parse_ip_pattern(ip) {
Ok(pattern) => {
if check_ip(peer_addr, &pattern) {
return true;
}
}
Err(err) => warn!("Cannot parse ip: {}; err: {}", ip, err),
}
}
false
}
#[derive(Debug, Clone, Eq, PartialEq)]
enum IpPattern {
Subnet(ipnet::IpNet),
Range(IpAddr, IpAddr),
Single(IpAddr),
}
fn parse_ip_pattern(pattern: &str) -> anyhow::Result<IpPattern> {
if pattern.contains('/') {
let subnet: ipnet::IpNet = pattern.parse()?;
return Ok(IpPattern::Subnet(subnet));
}
if let Some((start, end)) = pattern.split_once('-') {
let start: IpAddr = start.parse()?;
let end: IpAddr = end.parse()?;
return Ok(IpPattern::Range(start, end));
}
let addr: IpAddr = pattern.parse()?;
Ok(IpPattern::Single(addr))
}
fn check_ip(ip: &IpAddr, pattern: &IpPattern) -> bool {
match pattern {
IpPattern::Subnet(subnet) => subnet.contains(ip),
IpPattern::Range(start, end) => start <= ip && ip <= end,
IpPattern::Single(addr) => addr == ip,
}
}
fn project_name_valid(name: &str) -> bool {
name.chars().all(|c| c.is_alphanumeric() || c == '-')
}
@@ -221,8 +176,8 @@ mod tests {
fn parse_bare_minimum() -> anyhow::Result<()> {
// According to postgresql, only `user` should be required.
let options = StartupMessageParams::new([("user", "john_doe")]);
let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
let creds = ClientCredentials::parse(&options, None, None, peer_addr)?;
let creds = ClientCredentials::parse(&options, None, None)?;
assert_eq!(creds.user, "john_doe");
assert_eq!(creds.project, None);
@@ -236,8 +191,8 @@ mod tests {
("database", "world"), // should be ignored
("foo", "bar"), // should be ignored
]);
let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
let creds = ClientCredentials::parse(&options, None, None, peer_addr)?;
let creds = ClientCredentials::parse(&options, None, None)?;
assert_eq!(creds.user, "john_doe");
assert_eq!(creds.project, None);
@@ -251,8 +206,7 @@ mod tests {
let sni = Some("foo.localhost");
let common_names = Some(["localhost".into()].into());
let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
let creds = ClientCredentials::parse(&options, sni, common_names, peer_addr)?;
let creds = ClientCredentials::parse(&options, sni, common_names)?;
assert_eq!(creds.user, "john_doe");
assert_eq!(creds.project.as_deref(), Some("foo"));
assert_eq!(creds.cache_key, "foo");
@@ -267,8 +221,7 @@ mod tests {
("options", "-ckey=1 project=bar -c geqo=off"),
]);
let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
let creds = ClientCredentials::parse(&options, None, None, peer_addr)?;
let creds = ClientCredentials::parse(&options, None, None)?;
assert_eq!(creds.user, "john_doe");
assert_eq!(creds.project.as_deref(), Some("bar"));
@@ -282,8 +235,7 @@ mod tests {
("options", "-ckey=1 endpoint=bar -c geqo=off"),
]);
let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
let creds = ClientCredentials::parse(&options, None, None, peer_addr)?;
let creds = ClientCredentials::parse(&options, None, None)?;
assert_eq!(creds.user, "john_doe");
assert_eq!(creds.project.as_deref(), Some("bar"));
@@ -300,8 +252,7 @@ mod tests {
),
]);
let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
let creds = ClientCredentials::parse(&options, None, None, peer_addr)?;
let creds = ClientCredentials::parse(&options, None, None)?;
assert_eq!(creds.user, "john_doe");
assert!(creds.project.is_none());
@@ -315,8 +266,7 @@ mod tests {
("options", "-ckey=1 endpoint=bar project=foo -c geqo=off"),
]);
let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
let creds = ClientCredentials::parse(&options, None, None, peer_addr)?;
let creds = ClientCredentials::parse(&options, None, None)?;
assert_eq!(creds.user, "john_doe");
assert!(creds.project.is_none());
@@ -330,8 +280,7 @@ mod tests {
let sni = Some("baz.localhost");
let common_names = Some(["localhost".into()].into());
let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
let creds = ClientCredentials::parse(&options, sni, common_names, peer_addr)?;
let creds = ClientCredentials::parse(&options, sni, common_names)?;
assert_eq!(creds.user, "john_doe");
assert_eq!(creds.project.as_deref(), Some("baz"));
@@ -344,14 +293,12 @@ mod tests {
let common_names = Some(["a.com".into(), "b.com".into()].into());
let sni = Some("p1.a.com");
let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
let creds = ClientCredentials::parse(&options, sni, common_names, peer_addr)?;
let creds = ClientCredentials::parse(&options, sni, common_names)?;
assert_eq!(creds.project.as_deref(), Some("p1"));
let common_names = Some(["a.com".into(), "b.com".into()].into());
let sni = Some("p1.b.com");
let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
let creds = ClientCredentials::parse(&options, sni, common_names, peer_addr)?;
let creds = ClientCredentials::parse(&options, sni, common_names)?;
assert_eq!(creds.project.as_deref(), Some("p1"));
Ok(())
@@ -365,9 +312,7 @@ mod tests {
let sni = Some("second.localhost");
let common_names = Some(["localhost".into()].into());
let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
let err = ClientCredentials::parse(&options, sni, common_names, peer_addr)
.expect_err("should fail");
let err = ClientCredentials::parse(&options, sni, common_names).expect_err("should fail");
match err {
InconsistentProjectNames { domain, option } => {
assert_eq!(option, "first");
@@ -384,9 +329,7 @@ mod tests {
let sni = Some("project.localhost");
let common_names = Some(["example.com".into()].into());
let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
let err = ClientCredentials::parse(&options, sni, common_names, peer_addr)
.expect_err("should fail");
let err = ClientCredentials::parse(&options, sni, common_names).expect_err("should fail");
match err {
UnknownCommonName { cn } => {
assert_eq!(cn, "localhost");
@@ -404,8 +347,7 @@ mod tests {
let sni = Some("project.localhost");
let common_names = Some(["localhost".into()].into());
let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
let creds = ClientCredentials::parse(&options, sni, common_names, peer_addr)?;
let creds = ClientCredentials::parse(&options, sni, common_names)?;
assert_eq!(creds.project.as_deref(), Some("project"));
assert_eq!(
creds.cache_key,
@@ -414,91 +356,4 @@ mod tests {
Ok(())
}
#[test]
fn test_check_peer_addr_is_in_list() {
let peer_addr = IpAddr::from([127, 0, 0, 1]);
assert!(check_peer_addr_is_in_list(&peer_addr, &vec![]));
assert!(check_peer_addr_is_in_list(
&peer_addr,
&vec!["127.0.0.1".into()]
));
assert!(!check_peer_addr_is_in_list(
&peer_addr,
&vec!["8.8.8.8".into()]
));
// If there is an incorrect address, it will be skipped.
assert!(check_peer_addr_is_in_list(
&peer_addr,
&vec!["88.8.8".into(), "127.0.0.1".into()]
));
}
#[test]
fn test_parse_ip_v4() -> anyhow::Result<()> {
let peer_addr = IpAddr::from([127, 0, 0, 1]);
// Ok
assert_eq!(parse_ip_pattern("127.0.0.1")?, IpPattern::Single(peer_addr));
assert_eq!(
parse_ip_pattern("127.0.0.1/31")?,
IpPattern::Subnet(ipnet::IpNet::new(peer_addr, 31)?)
);
assert_eq!(
parse_ip_pattern("0.0.0.0-200.0.1.2")?,
IpPattern::Range(IpAddr::from([0, 0, 0, 0]), IpAddr::from([200, 0, 1, 2]))
);
// Error
assert!(parse_ip_pattern("300.0.1.2").is_err());
assert!(parse_ip_pattern("30.1.2").is_err());
assert!(parse_ip_pattern("127.0.0.1/33").is_err());
assert!(parse_ip_pattern("127.0.0.1-127.0.3").is_err());
assert!(parse_ip_pattern("1234.0.0.1-127.0.3.0").is_err());
Ok(())
}
#[test]
fn test_check_ipv4() -> anyhow::Result<()> {
let peer_addr = IpAddr::from([127, 0, 0, 1]);
let peer_addr_next = IpAddr::from([127, 0, 0, 2]);
let peer_addr_prev = IpAddr::from([127, 0, 0, 0]);
// Success
assert!(check_ip(&peer_addr, &IpPattern::Single(peer_addr)));
assert!(check_ip(
&peer_addr,
&IpPattern::Subnet(ipnet::IpNet::new(peer_addr_prev, 31)?)
));
assert!(check_ip(
&peer_addr,
&IpPattern::Subnet(ipnet::IpNet::new(peer_addr_next, 30)?)
));
assert!(check_ip(
&peer_addr,
&IpPattern::Range(IpAddr::from([0, 0, 0, 0]), IpAddr::from([200, 0, 1, 2]))
));
assert!(check_ip(
&peer_addr,
&IpPattern::Range(peer_addr, peer_addr)
));
// Not success
assert!(!check_ip(&peer_addr, &IpPattern::Single(peer_addr_prev)));
assert!(!check_ip(
&peer_addr,
&IpPattern::Subnet(ipnet::IpNet::new(peer_addr_next, 31)?)
));
assert!(!check_ip(
&peer_addr,
&IpPattern::Range(IpAddr::from([0, 0, 0, 0]), peer_addr_prev)
));
assert!(!check_ip(
&peer_addr,
&IpPattern::Range(peer_addr_next, IpAddr::from([128, 0, 0, 0]))
));
// There is no check that for range start <= end. But it's fine as long as for all this cases the result is false.
assert!(!check_ip(
&peer_addr,
&IpPattern::Range(peer_addr, peer_addr_prev)
));
Ok(())
}
}

View File

@@ -284,5 +284,5 @@ async fn handle_client(
let client = tokio::net::TcpStream::connect(destination).await?;
let metrics_aux: MetricsAuxInfo = Default::default();
proxy::proxy::proxy_pass(tls_stream, client, metrics_aux).await
proxy::proxy::proxy_pass(tls_stream, client, &metrics_aux).await
}

View File

@@ -1,11 +1,8 @@
use futures::future::Either;
use proxy::auth;
use proxy::config::AuthenticationConfig;
use proxy::config::CacheOptions;
use proxy::config::HttpConfig;
use proxy::console;
use proxy::console::provider::AllowedIpsCache;
use proxy::console::provider::NodeInfoCache;
use proxy::http;
use proxy::rate_limiter::RateLimiterConfig;
use proxy::usage_metrics;
@@ -93,9 +90,6 @@ struct ProxyCliArgs {
/// timeout for http connections
#[clap(long, default_value = "15s", value_parser = humantime::parse_duration)]
sql_over_http_timeout: tokio::time::Duration,
/// Whether the SQL over http pool is opt-in
#[clap(long, default_value_t = true, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
sql_over_http_pool_opt_in: bool,
/// timeout for scram authentication protocol
#[clap(long, default_value = "15s", value_parser = humantime::parse_duration)]
scram_protocol_timeout: tokio::time::Duration,
@@ -103,7 +97,7 @@ struct ProxyCliArgs {
#[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
require_client_ip: bool,
/// Disable dynamic rate limiter and store the metrics to ensure its production behaviour.
#[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
#[clap(long, default_value_t = true, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
disable_dynamic_rate_limiter: bool,
/// Rate limit algorithm. Makes sense only if `disable_rate_limiter` is `false`.
#[clap(value_enum, long, default_value_t = proxy::rate_limiter::RateLimitAlgorithm::Aimd)]
@@ -116,12 +110,6 @@ struct ProxyCliArgs {
initial_limit: usize,
#[clap(flatten)]
aimd_config: proxy::rate_limiter::AimdConfig,
/// cache for `allowed_ips` (use `size=0` to disable)
#[clap(long, default_value = config::CacheOptions::DEFAULT_OPTIONS_NODE_INFO)]
allowed_ips_cache: String,
/// disable ip check for http requests. If it is too time consuming, it could be turned off.
#[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
disable_ip_check_for_http: bool,
}
#[tokio::main]
@@ -250,24 +238,11 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
let auth_backend = match &args.auth_backend {
AuthBackend::Console => {
let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?;
let allowed_ips_cache_config: CacheOptions = args.allowed_ips_cache.parse()?;
let config::CacheOptions { size, ttl } = args.wake_compute_cache.parse()?;
info!("Using NodeInfoCache (wake_compute) with options={wake_compute_cache_config:?}");
info!("Using AllowedIpsCache (wake_compute) with options={allowed_ips_cache_config:?}");
info!("Using NodeInfoCache (wake_compute) with size={size} ttl={ttl:?}");
let caches = Box::leak(Box::new(console::caches::ApiCaches {
node_info: NodeInfoCache::new(
"node_info_cache",
wake_compute_cache_config.size,
wake_compute_cache_config.ttl,
true,
),
allowed_ips: AllowedIpsCache::new(
"allowed_ips_cache",
allowed_ips_cache_config.size,
allowed_ips_cache_config.ttl,
false,
),
node_info: console::caches::NodeInfoCache::new("node_info_cache", size, ttl),
}));
let config::WakeComputeLockOptions {
@@ -300,8 +275,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
}
};
let http_config = HttpConfig {
timeout: args.sql_over_http_timeout,
pool_opt_in: args.sql_over_http_pool_opt_in,
sql_over_http_timeout: args.sql_over_http_timeout,
};
let authentication_config = AuthenticationConfig {
scram_protocol_timeout: args.scram_protocol_timeout,
@@ -314,7 +288,6 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
http_config,
authentication_config,
require_client_ip: args.require_client_ip,
disable_ip_check_for_http: args.disable_ip_check_for_http,
}));
Ok(config)

View File

@@ -55,7 +55,7 @@ pub mod timed_lru {
/// * Whenever a new entry is inserted, the least recently accessed one is evicted.
/// The cache also keeps track of entry's insertion time (`created_at`) and TTL (`expires_at`).
///
/// * If `update_ttl_on_retrieval` is `true`. When the entry is about to be retrieved, we check its expiration timestamp.
/// * When the entry is about to be retrieved, we check its expiration timestamp.
/// If the entry has expired, we remove it from the cache; Otherwise we bump the
/// expiration timestamp (e.g. +5mins) and change its place in LRU list to prolong
/// its existence.
@@ -79,8 +79,6 @@ pub mod timed_lru {
/// Default time-to-live of a single entry.
ttl: Duration,
update_ttl_on_retrieval: bool,
}
impl<K: Hash + Eq, V> Cache for TimedLru<K, V> {
@@ -101,17 +99,11 @@ pub mod timed_lru {
impl<K: Hash + Eq, V> TimedLru<K, V> {
/// Construct a new LRU cache with timed entries.
pub fn new(
name: &'static str,
capacity: usize,
ttl: Duration,
update_ttl_on_retrieval: bool,
) -> Self {
pub fn new(name: &'static str, capacity: usize, ttl: Duration) -> Self {
Self {
name,
cache: LruCache::new(capacity).into(),
ttl,
update_ttl_on_retrieval,
}
}
@@ -173,9 +165,7 @@ pub mod timed_lru {
let (created_at, expires_at) = (entry.created_at, entry.expires_at);
// Update the deadline and the entry's position in the LRU list.
if self.update_ttl_on_retrieval {
raw_entry.get_mut().expires_at = deadline;
}
raw_entry.get_mut().expires_at = deadline;
raw_entry.to_back();
drop(cache); // drop lock before logging

View File

@@ -1,6 +1,9 @@
use crate::{
auth::parse_endpoint_param, cancellation::CancelClosure, console::errors::WakeComputeError,
error::UserFacingError, proxy::is_neon_param,
auth::parse_endpoint_param,
cancellation::CancelClosure,
console::errors::WakeComputeError,
error::{io_error, UserFacingError},
proxy::is_neon_param,
};
use futures::{FutureExt, TryFutureExt};
use itertools::Itertools;
@@ -25,9 +28,12 @@ pub enum ConnectionError {
#[error("{COULD_NOT_CONNECT}: {0}")]
TlsError(#[from] native_tls::Error),
}
#[error("{COULD_NOT_CONNECT}: {0}")]
WakeComputeError(#[from] WakeComputeError),
impl From<WakeComputeError> for ConnectionError {
fn from(value: WakeComputeError) -> Self {
io_error(value).into()
}
}
impl UserFacingError for ConnectionError {
@@ -40,7 +46,6 @@ impl UserFacingError for ConnectionError {
Some(err) => err.message().to_owned(),
None => err.to_string(),
},
WakeComputeError(err) => err.to_string_client(),
_ => COULD_NOT_CONNECT.to_owned(),
}
}

View File

@@ -19,7 +19,6 @@ pub struct ProxyConfig {
pub http_config: HttpConfig,
pub authentication_config: AuthenticationConfig,
pub require_client_ip: bool,
pub disable_ip_check_for_http: bool,
}
#[derive(Debug)]
@@ -35,8 +34,7 @@ pub struct TlsConfig {
}
pub struct HttpConfig {
pub timeout: tokio::time::Duration,
pub pool_opt_in: bool,
pub sql_over_http_timeout: tokio::time::Duration,
}
pub struct AuthenticationConfig {
@@ -299,7 +297,6 @@ impl CertResolver {
}
/// Helper for cmdline cache options parsing.
#[derive(Debug)]
pub struct CacheOptions {
/// Max number of entries.
pub size: usize,

View File

@@ -6,7 +6,7 @@ pub mod messages;
/// Wrappers for console APIs and their mocks.
pub mod provider;
pub use provider::{errors, Api, AuthSecret, CachedNodeInfo, ConsoleReqExtra, NodeInfo};
pub use provider::{errors, Api, AuthInfo, CachedNodeInfo, ConsoleReqExtra, NodeInfo};
/// Various cache-related types.
pub mod caches {

View File

@@ -1,5 +1,4 @@
use serde::Deserialize;
use smol_str::SmolStr;
use std::fmt;
/// Generic error response with human-readable description.
@@ -89,11 +88,11 @@ impl fmt::Debug for DatabaseInfo {
/// Various labels for prometheus metrics.
/// Also known as `ProxyMetricsAuxInfo` in the console.
#[derive(Debug, Deserialize, Clone, Default)]
#[derive(Debug, Deserialize, Default)]
pub struct MetricsAuxInfo {
pub endpoint_id: SmolStr,
pub project_id: SmolStr,
pub branch_id: SmolStr,
pub endpoint_id: Box<str>,
pub project_id: Box<str>,
pub branch_id: Box<str>,
}
impl MetricsAuxInfo {

View File

@@ -204,7 +204,7 @@ pub struct ConsoleReqExtra<'a> {
}
/// Auth secret which is managed by the cloud.
pub enum AuthSecret {
pub enum AuthInfo {
/// Md5 hash of user's password.
Md5([u8; 16]),
@@ -212,13 +212,6 @@ pub enum AuthSecret {
Scram(scram::ServerSecret),
}
#[derive(Default)]
pub struct AuthInfo {
pub secret: Option<AuthSecret>,
/// List of IP addresses allowed for the autorization.
pub allowed_ips: Vec<String>,
}
/// Info for establishing a connection to a compute node.
/// This is what we get after auth succeeded, but not before!
#[derive(Clone)]
@@ -229,7 +222,7 @@ pub struct NodeInfo {
pub config: compute::ConnCfg,
/// Labels for proxy's metrics.
pub aux: MetricsAuxInfo,
pub aux: Arc<MetricsAuxInfo>,
/// Whether we should accept self-signed certificates (for testing)
pub allow_self_signed_compute: bool,
@@ -237,7 +230,6 @@ pub struct NodeInfo {
pub type NodeInfoCache = TimedLru<Arc<str>, NodeInfo>;
pub type CachedNodeInfo = timed_lru::Cached<&'static NodeInfoCache>;
pub type AllowedIpsCache = TimedLru<Arc<str>, Arc<Vec<String>>>;
/// This will allocate per each call, but the http requests alone
/// already require a few allocations, so it should be fine.
@@ -248,13 +240,7 @@ pub trait Api {
&self,
extra: &ConsoleReqExtra<'_>,
creds: &ClientCredentials,
) -> Result<AuthInfo, errors::GetAuthInfoError>;
async fn get_allowed_ips(
&self,
extra: &ConsoleReqExtra<'_>,
creds: &ClientCredentials,
) -> Result<Arc<Vec<String>>, errors::GetAuthInfoError>;
) -> Result<Option<AuthInfo>, errors::GetAuthInfoError>;
/// Wake up the compute node and return the corresponding connection info.
async fn wake_compute(
@@ -268,8 +254,6 @@ pub trait Api {
pub struct ApiCaches {
/// Cache for the `wake_compute` API method.
pub node_info: NodeInfoCache,
/// Cache for the `get_allowed_ips`. TODO(anna): use notifications listener instead.
pub allowed_ips: TimedLru<Arc<str>, Arc<Vec<String>>>,
}
/// Various caches for [`console`](super).

View File

@@ -1,16 +1,14 @@
//! Mock console backend which relies on a user-provided postgres instance.
use std::sync::Arc;
use super::{
errors::{ApiError, GetAuthInfoError, WakeComputeError},
AuthInfo, AuthSecret, CachedNodeInfo, ConsoleReqExtra, NodeInfo,
AuthInfo, CachedNodeInfo, ConsoleReqExtra, NodeInfo,
};
use crate::{auth::ClientCredentials, compute, error::io_error, scram, url::ApiUrl};
use async_trait::async_trait;
use futures::TryFutureExt;
use thiserror::Error;
use tokio_postgres::{config::SslMode, Client};
use tokio_postgres::config::SslMode;
use tracing::{error, info, info_span, warn, Instrument};
#[derive(Debug, Error)]
@@ -48,8 +46,8 @@ impl Api {
async fn do_get_auth_info(
&self,
creds: &ClientCredentials<'_>,
) -> Result<AuthInfo, GetAuthInfoError> {
let (secret, allowed_ips) = async {
) -> Result<Option<AuthInfo>, GetAuthInfoError> {
async {
// Perhaps we could persist this connection, but then we'd have to
// write more code for reopening it if it got closed, which doesn't
// seem worth it.
@@ -57,48 +55,32 @@ impl Api {
tokio_postgres::connect(self.endpoint.as_str(), tokio_postgres::NoTls).await?;
tokio::spawn(connection);
let secret = match get_execute_postgres_query(
&client,
"select rolpassword from pg_catalog.pg_authid where rolname = $1",
&[&creds.user],
"rolpassword",
)
.await?
{
Some(entry) => {
info!("got a secret: {entry}"); // safe since it's not a prod scenario
let secret = scram::ServerSecret::parse(&entry).map(AuthSecret::Scram);
secret.or_else(|| parse_md5(&entry).map(AuthSecret::Md5))
}
let query = "select rolpassword from pg_catalog.pg_authid where rolname = $1";
let rows = client.query(query, &[&creds.user]).await?;
// We can get at most one row, because `rolname` is unique.
let row = match rows.first() {
Some(row) => row,
// This means that the user doesn't exist, so there can be no secret.
// However, this is still a *valid* outcome which is very similar
// to getting `404 Not found` from the Neon console.
None => {
warn!("user '{}' does not exist", creds.user);
None
return Ok(None);
}
};
let allowed_ips = match get_execute_postgres_query(
&client,
"select allowed_ips from neon_control_plane.endpoints where endpoint_id = $1",
&[&creds.project.clone().unwrap_or_default().as_str()],
"allowed_ips",
)
.await?
{
Some(s) => {
info!("got allowed_ips: {s}");
s.split(',').map(String::from).collect()
}
None => vec![],
};
Ok((secret, allowed_ips))
let entry = row
.try_get("rolpassword")
.map_err(MockApiError::PasswordNotSet)?;
info!("got a secret: {entry}"); // safe since it's not a prod scenario
let secret = scram::ServerSecret::parse(entry).map(AuthInfo::Scram);
Ok(secret.or_else(|| parse_md5(entry).map(AuthInfo::Md5)))
}
.map_err(crate::error::log_error::<GetAuthInfoError>)
.map_err(crate::error::log_error)
.instrument(info_span!("postgres", url = self.endpoint.as_str()))
.await?;
Ok(AuthInfo {
secret,
allowed_ips,
})
.await
}
async fn do_wake_compute(&self) -> Result<NodeInfo, WakeComputeError> {
@@ -118,27 +100,6 @@ impl Api {
}
}
async fn get_execute_postgres_query(
client: &Client,
query: &str,
params: &[&(dyn tokio_postgres::types::ToSql + Sync)],
idx: &str,
) -> Result<Option<String>, GetAuthInfoError> {
let rows = client.query(query, params).await?;
// We can get at most one row, because `rolname` is unique.
let row = match rows.first() {
Some(row) => row,
// This means that the user doesn't exist, so there can be no secret.
// However, this is still a *valid* outcome which is very similar
// to getting `404 Not found` from the Neon console.
None => return Ok(None),
};
let entry = row.try_get(idx).map_err(MockApiError::PasswordNotSet)?;
Ok(Some(entry))
}
#[async_trait]
impl super::Api for Api {
#[tracing::instrument(skip_all)]
@@ -146,18 +107,10 @@ impl super::Api for Api {
&self,
_extra: &ConsoleReqExtra<'_>,
creds: &ClientCredentials,
) -> Result<AuthInfo, GetAuthInfoError> {
) -> Result<Option<AuthInfo>, GetAuthInfoError> {
self.do_get_auth_info(creds).await
}
async fn get_allowed_ips(
&self,
_extra: &ConsoleReqExtra<'_>,
creds: &ClientCredentials,
) -> Result<Arc<Vec<String>>, GetAuthInfoError> {
Ok(Arc::new(self.do_get_auth_info(creds).await?.allowed_ips))
}
#[tracing::instrument(skip_all)]
async fn wake_compute(
&self,

View File

@@ -3,17 +3,11 @@
use super::{
super::messages::{ConsoleError, GetRoleSecret, WakeCompute},
errors::{ApiError, GetAuthInfoError, WakeComputeError},
ApiCaches, ApiLocks, AuthInfo, AuthSecret, CachedNodeInfo, ConsoleReqExtra, NodeInfo,
};
use crate::{
auth::ClientCredentials,
compute, http,
proxy::{ALLOWED_IPS_BY_CACHE_OUTCOME, ALLOWED_IPS_NUMBER},
scram,
ApiCaches, ApiLocks, AuthInfo, CachedNodeInfo, ConsoleReqExtra, NodeInfo,
};
use crate::{auth::ClientCredentials, compute, http, scram};
use async_trait::async_trait;
use futures::TryFutureExt;
use itertools::Itertools;
use std::{net::SocketAddr, sync::Arc};
use tokio::time::Instant;
use tokio_postgres::config::SslMode;
@@ -54,7 +48,7 @@ impl Api {
&self,
extra: &ConsoleReqExtra<'_>,
creds: &ClientCredentials<'_>,
) -> Result<AuthInfo, GetAuthInfoError> {
) -> Result<Option<AuthInfo>, GetAuthInfoError> {
let request_id = uuid::Uuid::new_v4().to_string();
async {
let request = self
@@ -78,25 +72,16 @@ impl Api {
Ok(body) => body,
// Error 404 is special: it's ok not to have a secret.
Err(e) => match e.http_status_code() {
Some(http::StatusCode::NOT_FOUND) => return Ok(AuthInfo::default()),
Some(http::StatusCode::NOT_FOUND) => return Ok(None),
_otherwise => return Err(e.into()),
},
};
let secret = scram::ServerSecret::parse(&body.role_secret)
.map(AuthSecret::Scram)
.map(AuthInfo::Scram)
.ok_or(GetAuthInfoError::BadSecret)?;
let allowed_ips = body
.allowed_ips
.into_iter()
.flatten()
.map(String::from)
.collect_vec();
ALLOWED_IPS_NUMBER.observe(allowed_ips.len() as f64);
Ok(AuthInfo {
secret: Some(secret),
allowed_ips,
})
Ok(Some(secret))
}
.map_err(crate::error::log_error)
.instrument(info_span!("http", id = request_id))
@@ -144,7 +129,7 @@ impl Api {
let node = NodeInfo {
config,
aux: body.aux,
aux: body.aux.into(),
allow_self_signed_compute: false,
};
@@ -163,32 +148,10 @@ impl super::Api for Api {
&self,
extra: &ConsoleReqExtra<'_>,
creds: &ClientCredentials,
) -> Result<AuthInfo, GetAuthInfoError> {
) -> Result<Option<AuthInfo>, GetAuthInfoError> {
self.do_get_auth_info(extra, creds).await
}
async fn get_allowed_ips(
&self,
extra: &ConsoleReqExtra<'_>,
creds: &ClientCredentials,
) -> Result<Arc<Vec<String>>, GetAuthInfoError> {
let key: &str = creds.project().expect("impossible");
if let Some(allowed_ips) = self.caches.allowed_ips.get(key) {
ALLOWED_IPS_BY_CACHE_OUTCOME
.with_label_values(&["hit"])
.inc();
return Ok(Arc::new(allowed_ips.to_vec()));
}
ALLOWED_IPS_BY_CACHE_OUTCOME
.with_label_values(&["miss"])
.inc();
let allowed_ips = Arc::new(self.do_get_auth_info(extra, creds).await?.allowed_ips);
self.caches
.allowed_ips
.insert(key.into(), allowed_ips.clone());
Ok(allowed_ips)
}
#[tracing::instrument(skip_all)]
async fn wake_compute(
&self,

View File

@@ -13,7 +13,7 @@ pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware};
use tokio::time::Instant;
use tracing::trace;
use crate::{proxy::CONSOLE_REQUEST_LATENCY, rate_limiter, url::ApiUrl};
use crate::{rate_limiter, url::ApiUrl};
use reqwest_middleware::RequestBuilder;
/// This is the preferred way to create new http clients,
@@ -90,13 +90,7 @@ impl Endpoint {
/// Execute a [request](reqwest::Request).
pub async fn execute(&self, request: Request) -> Result<Response, Error> {
let path = request.url().path().to_string();
let start = Instant::now();
let res = self.client.execute(request).await;
CONSOLE_REQUEST_LATENCY
.with_label_values(&[&path])
.observe(start.elapsed().as_secs_f64());
res
self.client.execute(request).await
}
}

View File

@@ -24,7 +24,7 @@ use prometheus::{
IntGaugeVec,
};
use regex::Regex;
use std::{error::Error, io, net::SocketAddr, ops::ControlFlow, sync::Arc, time::Instant};
use std::{error::Error, io, ops::ControlFlow, sync::Arc, time::Instant};
use tokio::{
io::{AsyncRead, AsyncWrite, AsyncWriteExt},
time,
@@ -110,34 +110,12 @@ static COMPUTE_CONNECTION_LATENCY: Lazy<HistogramVec> = Lazy::new(|| {
.unwrap()
});
pub static CONSOLE_REQUEST_LATENCY: Lazy<HistogramVec> = Lazy::new(|| {
register_histogram_vec!(
"proxy_console_request_latency",
"Time it took for proxy to establish a connection to the compute endpoint",
// proxy_wake_compute/proxy_get_role_info
&["request"],
// largest bucket = 2^16 * 0.2ms = 13s
exponential_buckets(0.0002, 2.0, 16).unwrap(),
)
.unwrap()
});
pub static ALLOWED_IPS_BY_CACHE_OUTCOME: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"proxy_allowed_ips_cache_misses",
"Number of cache hits/misses for allowed ips",
// hit/miss
&["outcome"],
)
.unwrap()
});
pub static RATE_LIMITER_ACQUIRE_LATENCY: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"proxy_control_plane_token_acquire_seconds",
"semaphore_control_plane_token_acquire_seconds",
"Time it took for proxy to establish a connection to the compute endpoint",
// largest bucket = 3^16 * 0.05ms = 2.15s
exponential_buckets(0.00005, 3.0, 16).unwrap(),
// largest bucket = 2^16 * 0.5ms = 32s
exponential_buckets(0.0005, 2.0, 16).unwrap(),
)
.unwrap()
});
@@ -160,15 +138,6 @@ pub static NUM_CONNECTION_ACCEPTED_BY_SNI: Lazy<IntCounterVec> = Lazy::new(|| {
.unwrap()
});
pub static ALLOWED_IPS_NUMBER: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"proxy_allowed_ips_number",
"Number of allowed ips",
vec![0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 10.0, 20.0, 50.0, 100.0],
)
.unwrap()
});
pub struct LatencyTimer {
// time since the stopwatch was started
start: Option<Instant>,
@@ -296,7 +265,7 @@ pub async fn task_main(
loop {
tokio::select! {
accept_result = listener.accept() => {
let (socket, peer_addr) = accept_result?;
let (socket, _) = accept_result?;
let session_id = uuid::Uuid::new_v4();
let cancel_map = Arc::clone(&cancel_map);
@@ -305,9 +274,7 @@ pub async fn task_main(
info!("accepted postgres client connection");
let mut socket = WithClientIp::new(socket);
let mut peer_addr = peer_addr;
if let Some(ip) = socket.wait_for_addr().await? {
peer_addr = ip;
tracing::Span::current().record("peer_addr", &tracing::field::display(ip));
} else if config.require_client_ip {
bail!("missing required client IP");
@@ -318,7 +285,7 @@ pub async fn task_main(
.set_nodelay(true)
.context("failed to set socket option")?;
handle_client(config, &cancel_map, session_id, socket, ClientMode::Tcp, peer_addr).await
handle_client(config, &cancel_map, session_id, socket, ClientMode::Tcp).await
}
.instrument(info_span!("handle_client", ?session_id, peer_addr = tracing::field::Empty))
.unwrap_or_else(move |e| {
@@ -408,7 +375,6 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
session_id: uuid::Uuid,
stream: S,
mode: ClientMode,
peer_addr: SocketAddr,
) -> anyhow::Result<()> {
info!(
protocol = mode.protocol_label(),
@@ -442,7 +408,7 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
let result = config
.auth_backend
.as_ref()
.map(|_| auth::ClientCredentials::parse(&params, hostname, common_names, peer_addr))
.map(|_| auth::ClientCredentials::parse(&params, hostname, common_names))
.transpose();
match result {
@@ -877,11 +843,11 @@ async fn prepare_client_connection(
pub async fn proxy_pass(
client: impl AsyncRead + AsyncWrite + Unpin,
compute: impl AsyncRead + AsyncWrite + Unpin,
aux: MetricsAuxInfo,
aux: &MetricsAuxInfo,
) -> anyhow::Result<()> {
let usage = USAGE_METRICS.register(Ids {
endpoint_id: aux.endpoint_id.clone(),
branch_id: aux.branch_id.clone(),
endpoint_id: aux.endpoint_id.to_string(),
branch_id: aux.branch_id.to_string(),
});
let m_sent = NUM_BYTES_PROXIED_COUNTER.with_label_values(&["tx"]);
@@ -1032,7 +998,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
// immediately after opening the connection.
let (stream, read_buf) = stream.into_inner();
node.stream.write_all(&read_buf).await?;
proxy_pass(stream, node.stream, aux).await
proxy_pass(stream, node.stream, &aux).await
}
}

View File

@@ -466,10 +466,6 @@ impl TestBackend for TestConnectMechanism {
x => panic!("expecting action {:?}, wake_compute is called instead", x),
}
}
fn get_allowed_ips(&self) -> Result<Arc<Vec<String>>, console::errors::GetAuthInfoError> {
unimplemented!("not used in tests")
}
}
fn helper_create_cached_node_info() -> CachedNodeInfo {

View File

@@ -23,7 +23,6 @@ use hyper::{
Body, Method, Request, Response,
};
use std::net::SocketAddr;
use std::task::Poll;
use std::{future::ready, sync::Arc};
use tls_listener::TlsListener;
@@ -103,7 +102,7 @@ pub async fn task_main(
let session_id = uuid::Uuid::new_v4();
request_handler(
req, config, conn_pool, cancel_map, session_id, sni_name, peer_addr,
req, config, conn_pool, cancel_map, session_id, sni_name,
)
.instrument(info_span!(
"serverless",
@@ -171,7 +170,6 @@ async fn request_handler(
cancel_map: Arc<CancelMap>,
session_id: uuid::Uuid,
sni_hostname: Option<String>,
peer_addr: SocketAddr,
) -> Result<Response<Body>, ApiError> {
let host = request
.headers()
@@ -189,15 +187,9 @@ async fn request_handler(
tokio::spawn(
async move {
if let Err(e) = websocket::serve_websocket(
websocket,
config,
&cancel_map,
session_id,
host,
peer_addr,
)
.await
if let Err(e) =
websocket::serve_websocket(websocket, config, &cancel_map, session_id, host)
.await
{
error!(session_id = ?session_id, "error in websocket connection: {e:#}");
}
@@ -213,7 +205,6 @@ async fn request_handler(
sni_hostname,
conn_pool,
session_id,
peer_addr,
&config.http_config,
)
.await

View File

@@ -8,8 +8,7 @@ use pbkdf2::{
Params, Pbkdf2,
};
use pq_proto::StartupMessageParams;
use smol_str::SmolStr;
use std::{collections::HashMap, net::SocketAddr, sync::Arc};
use std::{collections::HashMap, sync::Arc};
use std::{
fmt,
task::{ready, Poll},
@@ -22,8 +21,7 @@ use tokio::time;
use tokio_postgres::{AsyncMessage, ReadyForQueryStatus};
use crate::{
auth::{self, check_peer_addr_is_in_list},
console,
auth, console,
proxy::{
neon_options, LatencyTimer, NUM_DB_CONNECTIONS_CLOSED_COUNTER,
NUM_DB_CONNECTIONS_OPENED_COUNTER,
@@ -42,16 +40,16 @@ const MAX_CONNS_PER_ENDPOINT: usize = 20;
#[derive(Debug, Clone)]
pub struct ConnInfo {
pub username: SmolStr,
pub dbname: SmolStr,
pub hostname: SmolStr,
pub password: SmolStr,
pub options: Option<SmolStr>,
pub username: String,
pub dbname: String,
pub hostname: String,
pub password: String,
pub options: Option<String>,
}
impl ConnInfo {
// hm, change to hasher to avoid cloning?
pub fn db_and_user(&self) -> (SmolStr, SmolStr) {
pub fn db_and_user(&self) -> (String, String) {
(self.dbname.clone(), self.username.clone())
}
}
@@ -71,7 +69,7 @@ struct ConnPoolEntry {
// Per-endpoint connection pool, (dbname, username) -> DbUserConnPool
// Number of open connections is limited by the `max_conns_per_endpoint`.
pub struct EndpointConnPool {
pools: HashMap<(SmolStr, SmolStr), DbUserConnPool>,
pools: HashMap<(String, String), DbUserConnPool>,
total_conns: usize,
}
@@ -96,7 +94,7 @@ pub struct GlobalConnPool {
//
// That should be a fairly conteded map, so return reference to the per-endpoint
// pool as early as possible and release the lock.
global_pool: DashMap<SmolStr, Arc<RwLock<EndpointConnPool>>>,
global_pool: DashMap<String, Arc<RwLock<EndpointConnPool>>>,
/// [`DashMap::len`] iterates over all inner pools and acquires a read lock on each.
/// That seems like far too much effort, so we're using a relaxed increment counter instead.
@@ -146,7 +144,6 @@ impl GlobalConnPool {
conn_info: &ConnInfo,
force_new: bool,
session_id: uuid::Uuid,
peer_addr: SocketAddr,
) -> anyhow::Result<Client> {
let mut client: Option<ClientInner> = None;
let mut latency_timer = LatencyTimer::new("http");
@@ -206,7 +203,6 @@ impl GlobalConnPool {
conn_id,
session_id,
latency_timer,
peer_addr,
)
.await
} else {
@@ -229,7 +225,6 @@ impl GlobalConnPool {
conn_id,
session_id,
latency_timer,
peer_addr,
)
.await
};
@@ -328,7 +323,7 @@ impl GlobalConnPool {
Ok(())
}
fn get_or_create_endpoint_pool(&self, endpoint: &SmolStr) -> Arc<RwLock<EndpointConnPool>> {
fn get_or_create_endpoint_pool(&self, endpoint: &String) -> Arc<RwLock<EndpointConnPool>> {
// fast path
if let Some(pool) = self.global_pool.get(endpoint) {
return pool.clone();
@@ -406,7 +401,6 @@ async fn connect_to_compute(
conn_id: uuid::Uuid,
session_id: uuid::Uuid,
latency_timer: LatencyTimer,
peer_addr: SocketAddr,
) -> anyhow::Result<ClientInner> {
let tls = config.tls_config.as_ref();
let common_names = tls.and_then(|tls| tls.common_names.clone());
@@ -417,13 +411,12 @@ async fn connect_to_compute(
("application_name", APP_NAME),
("options", conn_info.options.as_deref().unwrap_or("")),
]);
let creds = auth::ClientCredentials::parse(
&params,
Some(&conn_info.hostname),
common_names,
peer_addr,
)?;
let backend = config.auth_backend.as_ref().map(|_| creds);
let creds = config
.auth_backend
.as_ref()
.map(|_| auth::ClientCredentials::parse(&params, Some(&conn_info.hostname), common_names))
.transpose()?;
let console_options = neon_options(&params);
@@ -432,14 +425,8 @@ async fn connect_to_compute(
application_name: Some(APP_NAME),
options: console_options.as_deref(),
};
// TODO(anna): this is a bit hacky way, consider using console notification listener.
if !config.disable_ip_check_for_http {
let allowed_ips = backend.get_allowed_ips(&extra).await?;
if !check_peer_addr_is_in_list(&peer_addr.ip(), &allowed_ips) {
return Err(auth::AuthError::ip_address_not_allowed().into());
}
}
let node_info = backend
let node_info = creds
.wake_compute(&extra)
.await?
.context("missing cache entry from wake_compute")?;
@@ -452,7 +439,7 @@ async fn connect_to_compute(
},
node_info,
&extra,
&backend,
&creds,
latency_timer,
)
.await
@@ -469,7 +456,7 @@ async fn connect_to_compute_once(
let (client, mut connection) = config
.user(&conn_info.username)
.password(&*conn_info.password)
.password(&conn_info.password)
.dbname(&conn_info.dbname)
.connect_timeout(timeout)
.connect(tokio_postgres::NoTls)
@@ -483,8 +470,8 @@ async fn connect_to_compute_once(
info!(%conn_info, %session, "new connection");
});
let ids = Ids {
endpoint_id: node_info.aux.endpoint_id.clone(),
branch_id: node_info.aux.branch_id.clone(),
endpoint_id: node_info.aux.endpoint_id.to_string(),
branch_id: node_info.aux.branch_id.to_string(),
};
tokio::spawn(

View File

@@ -1,4 +1,3 @@
use std::net::SocketAddr;
use std::sync::Arc;
use anyhow::bail;
@@ -14,7 +13,6 @@ use hyper::{Body, HeaderMap, Request};
use serde_json::json;
use serde_json::Map;
use serde_json::Value;
use tokio_postgres::error::DbError;
use tokio_postgres::types::Kind;
use tokio_postgres::types::Type;
use tokio_postgres::GenericClient;
@@ -182,16 +180,16 @@ fn get_conn_info(
for (key, value) in pairs {
if key == "options" {
options = Some(value.into());
options = Some(value.to_string());
break;
}
}
Ok(ConnInfo {
username: username.into(),
dbname: dbname.into(),
hostname: hostname.into(),
password: password.into(),
username: username.to_owned(),
dbname: dbname.to_owned(),
hostname: hostname.to_owned(),
password: password.to_owned(),
options,
})
}
@@ -202,19 +200,11 @@ pub async fn handle(
sni_hostname: Option<String>,
conn_pool: Arc<GlobalConnPool>,
session_id: uuid::Uuid,
peer_addr: SocketAddr,
config: &'static HttpConfig,
) -> Result<Response<Body>, ApiError> {
let result = tokio::time::timeout(
config.timeout,
handle_inner(
config,
request,
sni_hostname,
conn_pool,
session_id,
peer_addr,
),
config.sql_over_http_timeout,
handle_inner(request, sni_hostname, conn_pool, session_id),
)
.await;
let mut response = match result {
@@ -222,33 +212,14 @@ pub async fn handle(
Ok(r) => r,
Err(e) => {
let message = format!("{:?}", e);
let db_error = e
.downcast_ref::<tokio_postgres::Error>()
.and_then(|e| e.as_db_error());
fn get<'a, T: serde::Serialize>(
db: Option<&'a DbError>,
x: impl FnOnce(&'a DbError) -> T,
) -> Value {
db.map(x)
.and_then(|t| serde_json::to_value(t).ok())
.unwrap_or_default()
}
// TODO(conrad): db_error.position()
let code = get(db_error, |db| db.code().code());
let severity = get(db_error, |db| db.severity());
let detail = get(db_error, |db| db.detail());
let hint = get(db_error, |db| db.hint());
let where_ = get(db_error, |db| db.where_());
let table = get(db_error, |db| db.table());
let column = get(db_error, |db| db.column());
let schema = get(db_error, |db| db.schema());
let datatype = get(db_error, |db| db.datatype());
let constraint = get(db_error, |db| db.constraint());
let file = get(db_error, |db| db.file());
let line = get(db_error, |db| db.line());
let routine = get(db_error, |db| db.routine());
let code = e.downcast_ref::<tokio_postgres::Error>().and_then(|e| {
e.code()
.map(|s| serde_json::to_value(s.code()).unwrap_or_default())
});
let code = match code {
Some(c) => c,
None => Value::Null,
};
error!(
?code,
"sql-over-http per-client task finished with an error: {e:#}"
@@ -256,29 +227,14 @@ pub async fn handle(
// TODO: this shouldn't always be bad request.
json_response(
StatusCode::BAD_REQUEST,
json!({
"message": message,
"code": code,
"detail": detail,
"hint": hint,
"severity": severity,
"where": where_,
"table": table,
"column": column,
"schema": schema,
"datatype": datatype,
"constraint": constraint,
"file": file,
"line": line,
"routine": routine,
}),
json!({ "message": message, "code": code }),
)?
}
},
Err(_) => {
let message = format!(
"HTTP-Connection timed out, execution time exeeded {} seconds",
config.timeout.as_secs()
config.sql_over_http_timeout.as_secs()
);
error!(message);
json_response(
@@ -296,12 +252,10 @@ pub async fn handle(
#[instrument(name = "sql-over-http", fields(pid = tracing::field::Empty), skip_all)]
async fn handle_inner(
config: &'static HttpConfig,
request: Request<Body>,
sni_hostname: Option<String>,
conn_pool: Arc<GlobalConnPool>,
session_id: uuid::Uuid,
peer_addr: SocketAddr,
) -> anyhow::Result<Response<Body>> {
NUM_CONNECTIONS_ACCEPTED_COUNTER
.with_label_values(&["http"])
@@ -322,8 +276,7 @@ async fn handle_inner(
let array_mode = headers.get(&ARRAY_MODE) == Some(&HEADER_VALUE_TRUE);
// Allow connection pooling only if explicitly requested
// or if we have decided that http pool is no longer opt-in
let allow_pool = !config.pool_opt_in || headers.get(&ALLOW_POOL) == Some(&HEADER_VALUE_TRUE);
let allow_pool = headers.get(&ALLOW_POOL) == Some(&HEADER_VALUE_TRUE);
// isolation level, read only and deferrable
@@ -361,9 +314,7 @@ async fn handle_inner(
let body = hyper::body::to_bytes(request.into_body()).await?;
let payload: Payload = serde_json::from_slice(&body)?;
let mut client = conn_pool
.get(&conn_info, !allow_pool, session_id, peer_addr)
.await?;
let mut client = conn_pool.get(&conn_info, !allow_pool, session_id).await?;
let mut response = Response::builder()
.status(StatusCode::OK)

View File

@@ -11,7 +11,6 @@ use hyper_tungstenite::{tungstenite::Message, HyperWebsocket, WebSocketStream};
use pin_project_lite::pin_project;
use std::{
net::SocketAddr,
pin::Pin,
task::{ready, Context, Poll},
};
@@ -133,7 +132,6 @@ pub async fn serve_websocket(
cancel_map: &CancelMap,
session_id: uuid::Uuid,
hostname: Option<String>,
peer_addr: SocketAddr,
) -> anyhow::Result<()> {
let websocket = websocket.await?;
handle_client(
@@ -142,7 +140,6 @@ pub async fn serve_websocket(
session_id,
WebSocketRw::new(websocket),
ClientMode::Websockets { hostname },
peer_addr,
)
.await?;
Ok(())

View File

@@ -6,7 +6,6 @@ use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_S
use dashmap::{mapref::entry::Entry, DashMap};
use once_cell::sync::Lazy;
use serde::{Deserialize, Serialize};
use smol_str::SmolStr;
use std::{
convert::Infallible,
sync::{
@@ -30,8 +29,8 @@ const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60);
/// because we enrich the event with project_id in the control-plane endpoint.
#[derive(Eq, Hash, PartialEq, Serialize, Deserialize, Debug, Clone)]
pub struct Ids {
pub endpoint_id: SmolStr,
pub branch_id: SmolStr,
pub endpoint_id: String,
pub branch_id: String,
}
#[derive(Debug)]
@@ -291,8 +290,8 @@ mod tests {
// register a new counter
let counter = metrics.register(Ids {
endpoint_id: "e1".into(),
branch_id: "b1".into(),
endpoint_id: "e1".to_string(),
branch_id: "b1".to_string(),
});
// the counter should be observed despite 0 egress

View File

@@ -34,7 +34,7 @@ types-psutil = "^5.9.5.12"
types-toml = "^0.10.8.6"
pytest-httpserver = "^1.0.8"
aiohttp = "3.9.0"
pytest-rerunfailures = "^13.0"
pytest-rerunfailures = "^11.1.2"
types-pytest-lazy-fixture = "^0.6.3.3"
pytest-split = "^0.8.1"
zstandard = "^0.21.0"

View File

@@ -6,6 +6,8 @@ license.workspace = true
[dependencies]
aws-sdk-s3.workspace = true
aws-smithy-http.workspace = true
aws-types.workspace = true
either.workspace = true
tokio-rustls.workspace = true
anyhow.workspace = true
@@ -28,7 +30,7 @@ itertools.workspace = true
tokio = { workspace = true, features = ["macros", "rt-multi-thread"] }
chrono = { workspace = true, default-features = false, features = ["clock", "serde"] }
reqwest = { workspace = true, default-features = false, features = ["rustls-tls", "json"] }
aws-config = { workspace = true, default-features = false, features = ["rustls", "sso"] }
aws-config = { workspace = true, default-features = false, features = ["rustls", "credentials-sso"] }
pageserver = { path = "../pageserver" }
remote_storage = { path = "../libs/remote_storage" }

View File

@@ -250,7 +250,10 @@ pub(crate) async fn list_timeline_blobs(
pin_mut!(stream);
while let Some(obj) = stream.next().await {
let obj = obj?;
let key = obj.key();
let key = match obj.key() {
Some(k) => k,
None => continue,
};
let blob_name = key.strip_prefix(&timeline_dir_target.prefix_in_bucket);
match blob_name {
@@ -283,7 +286,7 @@ pub(crate) async fn list_timeline_blobs(
let (index_part_object, index_part_generation) = match index_parts
.iter()
.filter_map(|k| {
let key = k.key();
let key = k.key().unwrap();
// Stripping the index key to the last part, because RemotePath doesn't
// like absolute paths, and depending on prefix_in_bucket it's possible
// for the keys we read back to start with a slash.
@@ -304,7 +307,8 @@ pub(crate) async fn list_timeline_blobs(
errors.push("S3 list response got no index_part.json file".to_string());
}
if let Some(index_part_object_key) = index_part_object.as_ref().map(|object| object.key()) {
if let Some(index_part_object_key) = index_part_object.as_ref().and_then(|object| object.key())
{
let index_part_bytes = download_object_with_retries(
s3_client,
&timeline_dir_target.bucket_name,

Some files were not shown because too many files have changed in this diff Show More