Merge remote-tracking branch 'origin/main' into arpad/lsn_by_ts

Arpad Müller
2023-11-07 05:40:46 +01:00
103 changed files with 5783 additions and 3587 deletions

23
Cargo.lock generated
View File

@@ -3550,7 +3550,7 @@ dependencies = [
[[package]]
name = "postgres"
version = "0.19.4"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=7434d9388965a17a6d113e5dfc0e65666a03b4c2#7434d9388965a17a6d113e5dfc0e65666a03b4c2"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=ce7260db5998fe27167da42503905a12e7ad9048#ce7260db5998fe27167da42503905a12e7ad9048"
dependencies = [
"bytes",
"fallible-iterator",
@@ -3563,7 +3563,7 @@ dependencies = [
[[package]]
name = "postgres-native-tls"
version = "0.5.0"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=7434d9388965a17a6d113e5dfc0e65666a03b4c2#7434d9388965a17a6d113e5dfc0e65666a03b4c2"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=ce7260db5998fe27167da42503905a12e7ad9048#ce7260db5998fe27167da42503905a12e7ad9048"
dependencies = [
"native-tls",
"tokio",
@@ -3574,7 +3574,7 @@ dependencies = [
[[package]]
name = "postgres-protocol"
version = "0.6.4"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=7434d9388965a17a6d113e5dfc0e65666a03b4c2#7434d9388965a17a6d113e5dfc0e65666a03b4c2"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=ce7260db5998fe27167da42503905a12e7ad9048#ce7260db5998fe27167da42503905a12e7ad9048"
dependencies = [
"base64 0.20.0",
"byteorder",
@@ -3592,7 +3592,7 @@ dependencies = [
[[package]]
name = "postgres-types"
version = "0.2.4"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=7434d9388965a17a6d113e5dfc0e65666a03b4c2#7434d9388965a17a6d113e5dfc0e65666a03b4c2"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=ce7260db5998fe27167da42503905a12e7ad9048#ce7260db5998fe27167da42503905a12e7ad9048"
dependencies = [
"bytes",
"fallible-iterator",
@@ -4419,6 +4419,7 @@ dependencies = [
"itertools",
"pageserver",
"rand 0.8.5",
"remote_storage",
"reqwest",
"serde",
"serde_json",
@@ -4477,6 +4478,7 @@ dependencies = [
"tokio",
"tokio-io-timeout",
"tokio-postgres",
"tokio-stream",
"toml_edit",
"tracing",
"url",
@@ -4679,6 +4681,16 @@ dependencies = [
"serde_derive",
]
[[package]]
name = "serde_assert"
version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "eda563240c1288b044209be1f0d38bb4d15044fb3e00dc354fbc922ab4733e80"
dependencies = [
"hashbrown 0.13.2",
"serde",
]
[[package]]
name = "serde_derive"
version = "1.0.183"
@@ -5396,7 +5408,7 @@ dependencies = [
[[package]]
name = "tokio-postgres"
version = "0.7.7"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=7434d9388965a17a6d113e5dfc0e65666a03b4c2#7434d9388965a17a6d113e5dfc0e65666a03b4c2"
source = "git+https://github.com/neondatabase/rust-postgres.git?rev=ce7260db5998fe27167da42503905a12e7ad9048#ce7260db5998fe27167da42503905a12e7ad9048"
dependencies = [
"async-trait",
"byteorder",
@@ -5965,6 +5977,7 @@ dependencies = [
"routerify",
"sentry",
"serde",
"serde_assert",
"serde_json",
"serde_with",
"signal-hook",

View File

@@ -124,6 +124,7 @@ sentry = { version = "0.31", default-features = false, features = ["backtrace",
serde = { version = "1.0", features = ["derive"] }
serde_json = "1"
serde_with = "2.0"
serde_assert = "0.5.0"
sha2 = "0.10.2"
signal-hook = "0.3"
smallvec = "1.11"
@@ -161,11 +162,11 @@ env_logger = "0.10"
log = "0.4"
## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
## Other git libraries
heapless = { default-features=false, features=[], git = "https://github.com/japaric/heapless.git", rev = "644653bf3b831c6bb4963be2de24804acf5e5001" } # upstream release pending
@@ -202,7 +203,7 @@ tonic-build = "0.9"
# This is only needed for proxy's tests.
# TODO: we should probably fork `tokio-postgres-rustls` instead.
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="7434d9388965a17a6d113e5dfc0e65666a03b4c2" }
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
################# Binary contents sections

View File

@@ -278,32 +278,26 @@ fn main() -> Result<()> {
if #[cfg(target_os = "linux")] {
use std::env;
use tokio_util::sync::CancellationToken;
use tracing::warn;
let vm_monitor_addr = matches.get_one::<String>("vm-monitor-addr");
let vm_monitor_addr = matches
.get_one::<String>("vm-monitor-addr")
.expect("--vm-monitor-addr should always be set because it has a default arg");
let file_cache_connstr = matches.get_one::<String>("filecache-connstr");
let cgroup = matches.get_one::<String>("cgroup");
let file_cache_on_disk = matches.get_flag("file-cache-on-disk");
// Only make a runtime if we need to.
// Note: it seems like you can make a runtime in an inner scope and
// if you start a task in it it won't be dropped. However, make it
// in the outermost scope just to be safe.
let rt = match (env::var_os("AUTOSCALING"), vm_monitor_addr) {
(None, None) => None,
(None, Some(_)) => {
warn!("--vm-monitor-addr option set but AUTOSCALING env var not present");
None
}
(Some(_), None) => {
panic!("AUTOSCALING env var present but --vm-monitor-addr option not set")
}
(Some(_), Some(_)) => Some(
let rt = if env::var_os("AUTOSCALING").is_some() {
Some(
tokio::runtime::Builder::new_multi_thread()
.worker_threads(4)
.enable_all()
.build()
.expect("failed to create tokio runtime for monitor"),
),
.expect("failed to create tokio runtime for monitor")
)
} else {
None
};
// This token is used internally by the monitor to clean up all threads
@@ -314,8 +308,7 @@ fn main() -> Result<()> {
Box::leak(Box::new(vm_monitor::Args {
cgroup: cgroup.cloned(),
pgconnstr: file_cache_connstr.cloned(),
addr: vm_monitor_addr.cloned().unwrap(),
file_cache_on_disk,
addr: vm_monitor_addr.clone(),
})),
token.clone(),
))
@@ -487,6 +480,8 @@ fn cli() -> clap::Command {
.value_name("FILECACHE_CONNSTR"),
)
.arg(
// DEPRECATED, NO LONGER DOES ANYTHING.
// See https://github.com/neondatabase/cloud/issues/7516
Arg::new("file-cache-on-disk")
.long("file-cache-on-disk")
.action(clap::ArgAction::SetTrue),

View File

@@ -24,7 +24,7 @@ fn do_control_plane_request(
) -> Result<ControlPlaneSpecResponse, (bool, String)> {
let resp = reqwest::blocking::Client::new()
.get(uri)
.header("Authorization", jwt)
.header("Authorization", format!("Bearer {}", jwt))
.send()
.map_err(|e| {
(
@@ -68,7 +68,7 @@ pub fn get_spec_from_control_plane(
base_uri: &str,
compute_id: &str,
) -> Result<Option<ComputeSpec>> {
let cp_uri = format!("{base_uri}/management/api/v2/computes/{compute_id}/spec");
let cp_uri = format!("{base_uri}/compute/api/v2/computes/{compute_id}/spec");
let jwt: String = match std::env::var("NEON_CONTROL_PLANE_TOKEN") {
Ok(v) => v,
Err(_) => "".to_string(),

View File

@@ -2,7 +2,6 @@ use crate::{background_process, local_env::LocalEnv};
use anyhow::anyhow;
use camino::Utf8PathBuf;
use serde::{Deserialize, Serialize};
use serde_with::{serde_as, DisplayFromStr};
use std::{path::PathBuf, process::Child};
use utils::id::{NodeId, TenantId};
@@ -14,10 +13,8 @@ pub struct AttachmentService {
const COMMAND: &str = "attachment_service";
#[serde_as]
#[derive(Serialize, Deserialize)]
pub struct AttachHookRequest {
#[serde_as(as = "DisplayFromStr")]
pub tenant_id: TenantId,
pub node_id: Option<NodeId>,
}

View File

@@ -12,6 +12,7 @@ use hyper::{Body, Request, Response};
use serde::{Deserialize, Serialize};
use std::path::{Path, PathBuf};
use std::{collections::HashMap, sync::Arc};
use utils::http::endpoint::request_span;
use utils::logging::{self, LogFormat};
use utils::signals::{ShutdownSignals, Signal};
@@ -221,8 +222,25 @@ async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, Ap
generation: 0,
});
if attach_req.node_id.is_some() {
if let Some(attaching_pageserver) = attach_req.node_id.as_ref() {
tenant_state.generation += 1;
tracing::info!(
tenant_id = %attach_req.tenant_id,
ps_id = %attaching_pageserver,
generation = %tenant_state.generation,
"issuing",
);
} else if let Some(ps_id) = tenant_state.pageserver {
tracing::info!(
tenant_id = %attach_req.tenant_id,
%ps_id,
generation = %tenant_state.generation,
"dropping",
);
} else {
tracing::info!(
tenant_id = %attach_req.tenant_id,
"no-op: tenant already has no pageserver");
}
tenant_state.pageserver = attach_req.node_id;
let generation = tenant_state.generation;
@@ -240,9 +258,9 @@ async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, Ap
fn make_router(persistent_state: PersistentState) -> RouterBuilder<hyper::Body, ApiError> {
endpoint::make_router()
.data(Arc::new(State::new(persistent_state)))
.post("/re-attach", handle_re_attach)
.post("/validate", handle_validate)
.post("/attach-hook", handle_attach_hook)
.post("/re-attach", |r| request_span(r, handle_re_attach))
.post("/validate", |r| request_span(r, handle_validate))
.post("/attach-hook", |r| request_span(r, handle_attach_hook))
}
#[tokio::main]

View File

@@ -46,7 +46,6 @@ use std::time::Duration;
use anyhow::{anyhow, bail, Context, Result};
use serde::{Deserialize, Serialize};
use serde_with::{serde_as, DisplayFromStr};
use utils::id::{NodeId, TenantId, TimelineId};
use crate::local_env::LocalEnv;
@@ -57,13 +56,10 @@ use compute_api::responses::{ComputeState, ComputeStatus};
use compute_api::spec::{Cluster, ComputeMode, ComputeSpec};
// contents of a endpoint.json file
#[serde_as]
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
pub struct EndpointConf {
endpoint_id: String,
#[serde_as(as = "DisplayFromStr")]
tenant_id: TenantId,
#[serde_as(as = "DisplayFromStr")]
timeline_id: TimelineId,
mode: ComputeMode,
pg_port: u16,

View File

@@ -8,7 +8,6 @@ use anyhow::{bail, ensure, Context};
use postgres_backend::AuthType;
use reqwest::Url;
use serde::{Deserialize, Serialize};
use serde_with::{serde_as, DisplayFromStr};
use std::collections::HashMap;
use std::env;
use std::fs;
@@ -33,7 +32,6 @@ pub const DEFAULT_PG_VERSION: u32 = 15;
// to 'neon_local init --config=<path>' option. See control_plane/simple.conf for
// an example.
//
#[serde_as]
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
pub struct LocalEnv {
// Base directory for all the nodes (the pageserver, safekeepers and
@@ -59,7 +57,6 @@ pub struct LocalEnv {
// Default tenant ID to use with the 'neon_local' command line utility, when
// --tenant_id is not explicitly specified.
#[serde(default)]
#[serde_as(as = "Option<DisplayFromStr>")]
pub default_tenant_id: Option<TenantId>,
// used to issue tokens during e.g pg start
@@ -84,7 +81,6 @@ pub struct LocalEnv {
// A `HashMap<String, HashMap<TenantId, TimelineId>>` would be more appropriate here,
// but deserialization into a generic toml object as `toml::Value::try_from` fails with an error.
// https://toml.io/en/v1.0.0 does not contain a concept of "a table inside another table".
#[serde_as(as = "HashMap<_, Vec<(DisplayFromStr, DisplayFromStr)>>")]
branch_name_mappings: HashMap<String, Vec<(TenantId, TimelineId)>>,
}

108
docs/updating-postgres.md Normal file
View File

@@ -0,0 +1,108 @@
# Updating Postgres
## Minor Versions
When upgrading to a new minor version of Postgres, please follow these steps:
_Example: you are upgrading from 15.3 to the new minor version, 15.4._
1. Clone the Neon Postgres repository if you have not done so already.
```shell
git clone git@github.com:neondatabase/postgres.git
```
1. Add the Postgres upstream remote.
```shell
git remote add upstream https://git.postgresql.org/git/postgresql.git
```
1. Create a new branch based on the stable branch you are updating.
```shell
git checkout -b my-branch REL_15_STABLE_neon
```
1. Tag the last commit on the stable branch you are updating.
```shell
git tag REL_15_3_neon
```
1. Push the new tag to the Neon Postgres repository.
```shell
git push origin REL_15_3_neon
```
1. Find the release tags you're looking for. They are of the form `REL_X_Y`.
1. Rebase the branch you created on the tag and resolve any conflicts.
```shell
git fetch upstream REL_15_4
git rebase REL_15_4
```
1. Run the Postgres test suite to make sure our commits have not affected
Postgres in a negative way.
```shell
make check
# OR
meson test -C builddir
```
1. Push your branch to the Neon Postgres repository.
```shell
git push origin my-branch
```
1. Clone the Neon repository if you have not done so already.
```shell
git clone git@github.com:neondatabase/neon.git
```
1. Create a new branch.
1. Change the `revisions.json` file to point at the HEAD of your Postgres
branch.
1. Update the Git submodule.
```shell
git submodule set-branch --branch my-branch vendor/postgres-v15
git submodule update --remote vendor/postgres-v15
```
1. Run the Neon test suite to make sure that Neon is still good to go on this
minor Postgres release.
```shell
./scripts/pytest -k pg15
```
1. Commit your changes.
1. Create a pull request, and wait for CI to go green.
1. Force push the rebased Postgres branches into the Neon Postgres repository.
```shell
git push --force origin my-branch:REL_15_STABLE_neon
```
It may require disabling various branch protections.
1. Update your Neon PR to point at the branches.
```shell
git submodule set-branch --branch REL_15_STABLE_neon vendor/postgres-v15
git commit --amend --no-edit
git push --force origin
```
1. Merge the pull request after getting approval(s) and CI completion.

View File

@@ -6,7 +6,6 @@
use std::collections::HashMap;
use serde::{Deserialize, Serialize};
use serde_with::{serde_as, DisplayFromStr};
use utils::id::{TenantId, TimelineId};
use utils::lsn::Lsn;
@@ -19,7 +18,6 @@ pub type PgIdent = String;
/// Cluster spec or configuration represented as an optional number of
/// delta operations + final cluster state description.
#[serde_as]
#[derive(Clone, Debug, Default, Deserialize, Serialize)]
pub struct ComputeSpec {
pub format_version: f32,
@@ -50,12 +48,12 @@ pub struct ComputeSpec {
// these, and instead set the "neon.tenant_id", "neon.timeline_id",
// etc. GUCs in cluster.settings. TODO: Once the control plane has been
// updated to fill these fields, we can make these non optional.
#[serde_as(as = "Option<DisplayFromStr>")]
pub tenant_id: Option<TenantId>,
#[serde_as(as = "Option<DisplayFromStr>")]
pub timeline_id: Option<TimelineId>,
#[serde_as(as = "Option<DisplayFromStr>")]
pub pageserver_connstring: Option<String>,
#[serde(default)]
pub safekeeper_connstrings: Vec<String>,
@@ -140,14 +138,13 @@ impl RemoteExtSpec {
}
}
#[serde_as]
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq, Deserialize, Serialize)]
pub enum ComputeMode {
/// A read-write node
#[default]
Primary,
/// A read-only node, pinned at a particular LSN
Static(#[serde_as(as = "DisplayFromStr")] Lsn),
Static(Lsn),
/// A read-only node that follows the tip of the branch in hot standby mode
///
/// Future versions may want to distinguish between replicas with hot standby

View File

@@ -4,7 +4,6 @@
//! See docs/rfcs/025-generation-numbers.md
use serde::{Deserialize, Serialize};
use serde_with::{serde_as, DisplayFromStr};
use utils::id::{NodeId, TenantId};
#[derive(Serialize, Deserialize)]
@@ -12,10 +11,8 @@ pub struct ReAttachRequest {
pub node_id: NodeId,
}
#[serde_as]
#[derive(Serialize, Deserialize)]
pub struct ReAttachResponseTenant {
#[serde_as(as = "DisplayFromStr")]
pub id: TenantId,
pub gen: u32,
}
@@ -25,10 +22,8 @@ pub struct ReAttachResponse {
pub tenants: Vec<ReAttachResponseTenant>,
}
#[serde_as]
#[derive(Serialize, Deserialize)]
pub struct ValidateRequestTenant {
#[serde_as(as = "DisplayFromStr")]
pub id: TenantId,
pub gen: u32,
}
@@ -43,10 +38,8 @@ pub struct ValidateResponse {
pub tenants: Vec<ValidateResponseTenant>,
}
#[serde_as]
#[derive(Serialize, Deserialize)]
pub struct ValidateResponseTenant {
#[serde_as(as = "DisplayFromStr")]
pub id: TenantId,
pub valid: bool,
}

View File

@@ -6,7 +6,7 @@ use std::{
use byteorder::{BigEndian, ReadBytesExt};
use serde::{Deserialize, Serialize};
use serde_with::{serde_as, DisplayFromStr};
use serde_with::serde_as;
use strum_macros;
use utils::{
completion,
@@ -174,25 +174,19 @@ pub enum TimelineState {
Broken { reason: String, backtrace: String },
}
#[serde_as]
#[derive(Serialize, Deserialize)]
pub struct TimelineCreateRequest {
#[serde_as(as = "DisplayFromStr")]
pub new_timeline_id: TimelineId,
#[serde(default)]
#[serde_as(as = "Option<DisplayFromStr>")]
pub ancestor_timeline_id: Option<TimelineId>,
#[serde(default)]
#[serde_as(as = "Option<DisplayFromStr>")]
pub ancestor_start_lsn: Option<Lsn>,
pub pg_version: Option<u32>,
}
#[serde_as]
#[derive(Serialize, Deserialize, Debug)]
#[serde(deny_unknown_fields)]
pub struct TenantCreateRequest {
#[serde_as(as = "DisplayFromStr")]
pub new_tenant_id: TenantId,
#[serde(default)]
#[serde(skip_serializing_if = "Option::is_none")]
@@ -201,7 +195,6 @@ pub struct TenantCreateRequest {
pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it
}
#[serde_as]
#[derive(Deserialize, Debug)]
#[serde(deny_unknown_fields)]
pub struct TenantLoadRequest {
@@ -278,31 +271,26 @@ pub struct LocationConfig {
pub tenant_conf: TenantConfig,
}
#[serde_as]
#[derive(Serialize, Deserialize)]
#[serde(transparent)]
pub struct TenantCreateResponse(#[serde_as(as = "DisplayFromStr")] pub TenantId);
pub struct TenantCreateResponse(pub TenantId);
#[derive(Serialize)]
pub struct StatusResponse {
pub id: NodeId,
}
#[serde_as]
#[derive(Serialize, Deserialize, Debug)]
#[serde(deny_unknown_fields)]
pub struct TenantLocationConfigRequest {
#[serde_as(as = "DisplayFromStr")]
pub tenant_id: TenantId,
#[serde(flatten)]
pub config: LocationConfig, // as we have a flattened field, we should reject all unknown fields in it
}
#[serde_as]
#[derive(Serialize, Deserialize, Debug)]
#[serde(deny_unknown_fields)]
pub struct TenantConfigRequest {
#[serde_as(as = "DisplayFromStr")]
pub tenant_id: TenantId,
#[serde(flatten)]
pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it
@@ -374,10 +362,8 @@ pub enum TenantAttachmentStatus {
Failed { reason: String },
}
#[serde_as]
#[derive(Serialize, Deserialize, Clone)]
pub struct TenantInfo {
#[serde_as(as = "DisplayFromStr")]
pub id: TenantId,
// NB: intentionally not part of OpenAPI, we don't want to commit to a specific set of TenantState's
pub state: TenantState,
@@ -388,33 +374,22 @@ pub struct TenantInfo {
}
/// This represents the output of the "timeline_detail" and "timeline_list" API calls.
#[serde_as]
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct TimelineInfo {
#[serde_as(as = "DisplayFromStr")]
pub tenant_id: TenantId,
#[serde_as(as = "DisplayFromStr")]
pub timeline_id: TimelineId,
#[serde_as(as = "Option<DisplayFromStr>")]
pub ancestor_timeline_id: Option<TimelineId>,
#[serde_as(as = "Option<DisplayFromStr>")]
pub ancestor_lsn: Option<Lsn>,
#[serde_as(as = "DisplayFromStr")]
pub last_record_lsn: Lsn,
#[serde_as(as = "Option<DisplayFromStr>")]
pub prev_record_lsn: Option<Lsn>,
#[serde_as(as = "DisplayFromStr")]
pub latest_gc_cutoff_lsn: Lsn,
#[serde_as(as = "DisplayFromStr")]
pub disk_consistent_lsn: Lsn,
/// The LSN that we have successfully uploaded to remote storage
#[serde_as(as = "DisplayFromStr")]
pub remote_consistent_lsn: Lsn,
/// The LSN that we are advertising to safekeepers
#[serde_as(as = "DisplayFromStr")]
pub remote_consistent_lsn_visible: Lsn,
pub current_logical_size: Option<u64>, // is None when timeline is Unloaded
@@ -426,7 +401,6 @@ pub struct TimelineInfo {
pub timeline_dir_layer_file_size_sum: Option<u64>,
pub wal_source_connstr: Option<String>,
#[serde_as(as = "Option<DisplayFromStr>")]
pub last_received_msg_lsn: Option<Lsn>,
/// the timestamp (in microseconds) of the last received message
pub last_received_msg_ts: Option<u128>,
@@ -523,23 +497,13 @@ pub struct LayerAccessStats {
pub residence_events_history: HistoryBufferWithDropCounter<LayerResidenceEvent, 16>,
}
#[serde_as]
#[derive(Debug, Clone, Serialize)]
#[serde(tag = "kind")]
pub enum InMemoryLayerInfo {
Open {
#[serde_as(as = "DisplayFromStr")]
lsn_start: Lsn,
},
Frozen {
#[serde_as(as = "DisplayFromStr")]
lsn_start: Lsn,
#[serde_as(as = "DisplayFromStr")]
lsn_end: Lsn,
},
Open { lsn_start: Lsn },
Frozen { lsn_start: Lsn, lsn_end: Lsn },
}
#[serde_as]
#[derive(Debug, Clone, Serialize)]
#[serde(tag = "kind")]
pub enum HistoricLayerInfo {
@@ -547,9 +511,7 @@ pub enum HistoricLayerInfo {
layer_file_name: String,
layer_file_size: u64,
#[serde_as(as = "DisplayFromStr")]
lsn_start: Lsn,
#[serde_as(as = "DisplayFromStr")]
lsn_end: Lsn,
remote: bool,
access_stats: LayerAccessStats,
@@ -558,7 +520,6 @@ pub enum HistoricLayerInfo {
layer_file_name: String,
layer_file_size: u64,
#[serde_as(as = "DisplayFromStr")]
lsn_start: Lsn,
remote: bool,
access_stats: LayerAccessStats,

View File

@@ -728,12 +728,17 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
trace!("got query {query_string:?}");
if let Err(e) = handler.process_query(self, query_string).await {
log_query_error(query_string, &e);
let short_error = short_error(&e);
self.write_message_noflush(&BeMessage::ErrorResponse(
&short_error,
Some(e.pg_error_code()),
))?;
match e {
QueryError::Shutdown => return Ok(ProcessMsgResult::Break),
e => {
log_query_error(query_string, &e);
let short_error = short_error(&e);
self.write_message_noflush(&BeMessage::ErrorResponse(
&short_error,
Some(e.pg_error_code()),
))?;
}
}
}
self.write_message_noflush(&BeMessage::ReadyForQuery)?;
}

View File

@@ -1,21 +1,18 @@
//! Azure Blob Storage wrapper
use std::collections::HashMap;
use std::env;
use std::num::NonZeroU32;
use std::sync::Arc;
use std::{borrow::Cow, collections::HashMap, io::Cursor};
use std::{borrow::Cow, io::Cursor};
use super::REMOTE_STORAGE_PREFIX_SEPARATOR;
use anyhow::Result;
use azure_core::request_options::{MaxResults, Metadata, Range};
use azure_core::Header;
use azure_identity::DefaultAzureCredential;
use azure_storage::StorageCredentials;
use azure_storage_blobs::prelude::ClientBuilder;
use azure_storage_blobs::{
blob::operations::GetBlobBuilder,
prelude::{BlobClient, ContainerClient},
};
use azure_storage_blobs::{blob::operations::GetBlobBuilder, prelude::ContainerClient};
use futures_util::StreamExt;
use http_types::StatusCode;
use tokio::io::AsyncRead;
@@ -112,16 +109,19 @@ impl AzureBlobStorage {
async fn download_for_builder(
&self,
metadata: StorageMetadata,
builder: GetBlobBuilder,
) -> Result<Download, DownloadError> {
let mut response = builder.into_stream();
let mut metadata = HashMap::new();
// TODO give proper streaming response instead of buffering into RAM
// https://github.com/neondatabase/neon/issues/5563
let mut buf = Vec::new();
while let Some(part) = response.next().await {
let part = part.map_err(to_download_error)?;
if let Some(blob_meta) = part.blob.metadata {
metadata.extend(blob_meta.iter().map(|(k, v)| (k.to_owned(), v.to_owned())));
}
let data = part
.data
.collect()
@@ -131,28 +131,9 @@ impl AzureBlobStorage {
}
Ok(Download {
download_stream: Box::pin(Cursor::new(buf)),
metadata: Some(metadata),
metadata: Some(StorageMetadata(metadata)),
})
}
// TODO get rid of this function once we have metadata included in the response
// https://github.com/Azure/azure-sdk-for-rust/issues/1439
async fn get_metadata(
&self,
blob_client: &BlobClient,
) -> Result<StorageMetadata, DownloadError> {
let builder = blob_client.get_metadata();
let response = builder.into_future().await.map_err(to_download_error)?;
let mut map = HashMap::new();
for md in response.metadata.iter() {
map.insert(
md.name().as_str().to_string(),
md.value().as_str().to_string(),
);
}
Ok(StorageMetadata(map))
}
async fn permit(&self, kind: RequestKind) -> tokio::sync::SemaphorePermit<'_> {
self.concurrency_limiter
@@ -269,11 +250,9 @@ impl RemoteStorage for AzureBlobStorage {
let _permit = self.permit(RequestKind::Get).await;
let blob_client = self.client.blob_client(self.relative_path_to_name(from));
let metadata = self.get_metadata(&blob_client).await?;
let builder = blob_client.get();
self.download_for_builder(metadata, builder).await
self.download_for_builder(builder).await
}
async fn download_byte_range(
@@ -285,8 +264,6 @@ impl RemoteStorage for AzureBlobStorage {
let _permit = self.permit(RequestKind::Get).await;
let blob_client = self.client.blob_client(self.relative_path_to_name(from));
let metadata = self.get_metadata(&blob_client).await?;
let mut builder = blob_client.get();
if let Some(end_exclusive) = end_exclusive {
@@ -301,7 +278,7 @@ impl RemoteStorage for AzureBlobStorage {
builder = builder.range(Range::new(start_inclusive, end_exclusive));
}
self.download_for_builder(metadata, builder).await
self.download_for_builder(builder).await
}
async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {

View File

@@ -1,23 +1,18 @@
use serde::{Deserialize, Serialize};
use serde_with::{serde_as, DisplayFromStr};
use utils::{
id::{NodeId, TenantId, TimelineId},
lsn::Lsn,
};
#[serde_as]
#[derive(Serialize, Deserialize)]
pub struct TimelineCreateRequest {
#[serde_as(as = "DisplayFromStr")]
pub tenant_id: TenantId,
#[serde_as(as = "DisplayFromStr")]
pub timeline_id: TimelineId,
pub peer_ids: Option<Vec<NodeId>>,
pub pg_version: u32,
pub system_id: Option<u64>,
pub wal_seg_size: Option<u32>,
#[serde_as(as = "DisplayFromStr")]
pub commit_lsn: Lsn,
// If not passed, it is assigned to the beginning of commit_lsn segment.
pub local_start_lsn: Option<Lsn>,
@@ -28,7 +23,6 @@ fn lsn_invalid() -> Lsn {
}
/// Data about safekeeper's timeline, mirrors broker.proto.
#[serde_as]
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct SkTimelineInfo {
/// Term.
@@ -36,25 +30,19 @@ pub struct SkTimelineInfo {
/// Term of the last entry.
pub last_log_term: Option<u64>,
/// LSN of the last record.
#[serde_as(as = "DisplayFromStr")]
#[serde(default = "lsn_invalid")]
pub flush_lsn: Lsn,
/// Up to which LSN safekeeper regards its WAL as committed.
#[serde_as(as = "DisplayFromStr")]
#[serde(default = "lsn_invalid")]
pub commit_lsn: Lsn,
/// LSN up to which safekeeper has backed WAL.
#[serde_as(as = "DisplayFromStr")]
#[serde(default = "lsn_invalid")]
pub backup_lsn: Lsn,
/// LSN of last checkpoint uploaded by pageserver.
#[serde_as(as = "DisplayFromStr")]
#[serde(default = "lsn_invalid")]
pub remote_consistent_lsn: Lsn,
#[serde_as(as = "DisplayFromStr")]
#[serde(default = "lsn_invalid")]
pub peer_horizon_lsn: Lsn,
#[serde_as(as = "DisplayFromStr")]
#[serde(default = "lsn_invalid")]
pub local_start_lsn: Lsn,
/// A connection string to use for WAL receiving.

View File

@@ -55,6 +55,7 @@ bytes.workspace = true
criterion.workspace = true
hex-literal.workspace = true
camino-tempfile.workspace = true
serde_assert.workspace = true
[[bench]]
name = "benchmarks"

View File

@@ -9,7 +9,6 @@ use jsonwebtoken::{
decode, encode, Algorithm, DecodingKey, EncodingKey, Header, TokenData, Validation,
};
use serde::{Deserialize, Serialize};
use serde_with::{serde_as, DisplayFromStr};
use crate::id::TenantId;
@@ -32,11 +31,9 @@ pub enum Scope {
}
/// JWT payload. See docs/authentication.md for the format
#[serde_as]
#[derive(Debug, Serialize, Deserialize, Clone, PartialEq)]
pub struct Claims {
#[serde(default)]
#[serde_as(as = "Option<DisplayFromStr>")]
pub tenant_id: Option<TenantId>,
pub scope: Scope,
}

View File

@@ -7,7 +7,7 @@ use serde::{Deserialize, Serialize};
///
/// See docs/rfcs/025-generation-numbers.md for detail on how generation
/// numbers are used.
#[derive(Copy, Clone, Eq, PartialEq, PartialOrd, Ord)]
#[derive(Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)]
pub enum Generation {
// Generations with this magic value will not add a suffix to S3 keys, and will not
// be included in persisted index_part.json. This value is only to be used

41
libs/utils/src/hex.rs Normal file
View File

@@ -0,0 +1,41 @@
/// Useful type for asserting that expected bytes match, reporting the bytes as more readable,
/// array-syntax compatible hex bytes.
///
/// # Usage
///
/// ```
/// use utils::Hex;
///
/// let actual = serialize_something();
/// let expected = [0x68, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x77, 0x6f, 0x72, 0x6c, 0x64];
///
/// // the type implements PartialEq and on mismatch, both sides are printed in 16-wide multiline
/// // output suffixed with an array-style length for easier comparisons.
/// assert_eq!(Hex(&actual), Hex(&expected));
///
/// // with `let expected = [0x68];` the error would have been:
/// // assertion `left == right` failed
/// // left: [0x68, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x77, 0x6f, 0x72, 0x6c, 0x64; 11]
/// // right: [0x68; 1]
/// # fn serialize_something() -> Vec<u8> { "hello world".as_bytes().to_vec() }
/// ```
#[derive(PartialEq)]
pub struct Hex<'a>(pub &'a [u8]);
impl std::fmt::Debug for Hex<'_> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "[")?;
for (i, c) in self.0.chunks(16).enumerate() {
if i > 0 && !c.is_empty() {
writeln!(f, ", ")?;
}
for (j, b) in c.iter().enumerate() {
if j > 0 {
write!(f, ", ")?;
}
write!(f, "0x{b:02x}")?;
}
}
write!(f, "; {}]", self.0.len())
}
}

View File

@@ -14,6 +14,11 @@ use tracing::{self, debug, info, info_span, warn, Instrument};
use std::future::Future;
use std::str::FromStr;
use bytes::{Bytes, BytesMut};
use std::io::Write as _;
use tokio::sync::mpsc;
use tokio_stream::wrappers::ReceiverStream;
static SERVE_METRICS_COUNT: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"libmetrics_metric_handler_requests_total",
@@ -146,94 +151,89 @@ impl Drop for RequestCancelled {
}
}
/// An [`std::io::Write`] implementation on top of a channel sending [`bytes::Bytes`] chunks.
pub struct ChannelWriter {
buffer: BytesMut,
pub tx: mpsc::Sender<std::io::Result<Bytes>>,
written: usize,
}
impl ChannelWriter {
pub fn new(buf_len: usize, tx: mpsc::Sender<std::io::Result<Bytes>>) -> Self {
assert_ne!(buf_len, 0);
ChannelWriter {
// split about half off the buffer from the start, because we flush depending on
// capacity. first flush will come sooner than without this, but now resizes will
// have better chance of picking up the "other" half. not guaranteed of course.
buffer: BytesMut::with_capacity(buf_len).split_off(buf_len / 2),
tx,
written: 0,
}
}
pub fn flush0(&mut self) -> std::io::Result<usize> {
let n = self.buffer.len();
if n == 0 {
return Ok(0);
}
tracing::trace!(n, "flushing");
let ready = self.buffer.split().freeze();
// not ideal to call from blocking code to block_on, but we are sure that this
// operation does not spawn_blocking other tasks
let res: Result<(), ()> = tokio::runtime::Handle::current().block_on(async {
self.tx.send(Ok(ready)).await.map_err(|_| ())?;
// throttle sending to allow reuse of our buffer in `write`.
self.tx.reserve().await.map_err(|_| ())?;
// now the response task has picked up the buffer and hopefully started
// sending it to the client.
Ok(())
});
if res.is_err() {
return Err(std::io::ErrorKind::BrokenPipe.into());
}
self.written += n;
Ok(n)
}
pub fn flushed_bytes(&self) -> usize {
self.written
}
}
impl std::io::Write for ChannelWriter {
fn write(&mut self, mut buf: &[u8]) -> std::io::Result<usize> {
let remaining = self.buffer.capacity() - self.buffer.len();
let out_of_space = remaining < buf.len();
let original_len = buf.len();
if out_of_space {
let can_still_fit = buf.len() - remaining;
self.buffer.extend_from_slice(&buf[..can_still_fit]);
buf = &buf[can_still_fit..];
self.flush0()?;
}
// assume that this will often under normal operation just move the pointer back to the
// beginning of allocation, because previous split off parts are already sent and
// dropped.
self.buffer.extend_from_slice(buf);
Ok(original_len)
}
fn flush(&mut self) -> std::io::Result<()> {
self.flush0().map(|_| ())
}
}
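Since `ChannelWriter` is now public, other HTTP handlers can reuse the same streaming pattern. Below is a minimal sketch (not part of this commit) of that pattern; the handler name and payload are made up, and it assumes the `bytes`, `mpsc`, `ReceiverStream`, and `std::io::Write` imports added at the top of this file.

```rust
// Hypothetical example of reusing the now-public ChannelWriter to stream a response body.
async fn stream_large_body_example() -> Response<Body> {
    let (tx, rx) = mpsc::channel(1);
    let body = Body::wrap_stream(ReceiverStream::new(rx));
    // flush0() calls block_on internally, so produce the data on a blocking thread.
    tokio::task::spawn_blocking(move || {
        let mut writer = ChannelWriter::new(64 * 1024, tx);
        // ... write the real payload here ...
        let _ = writer.write_all(b"example payload");
        let _ = writer.flush();
    });
    Response::new(body)
}
```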
async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
use bytes::{Bytes, BytesMut};
use std::io::Write as _;
use tokio::sync::mpsc;
use tokio_stream::wrappers::ReceiverStream;
SERVE_METRICS_COUNT.inc();
/// An [`std::io::Write`] implementation on top of a channel sending [`bytes::Bytes`] chunks.
struct ChannelWriter {
buffer: BytesMut,
tx: mpsc::Sender<std::io::Result<Bytes>>,
written: usize,
}
impl ChannelWriter {
fn new(buf_len: usize, tx: mpsc::Sender<std::io::Result<Bytes>>) -> Self {
assert_ne!(buf_len, 0);
ChannelWriter {
// split about half off the buffer from the start, because we flush depending on
// capacity. first flush will come sooner than without this, but now resizes will
// have better chance of picking up the "other" half. not guaranteed of course.
buffer: BytesMut::with_capacity(buf_len).split_off(buf_len / 2),
tx,
written: 0,
}
}
fn flush0(&mut self) -> std::io::Result<usize> {
let n = self.buffer.len();
if n == 0 {
return Ok(0);
}
tracing::trace!(n, "flushing");
let ready = self.buffer.split().freeze();
// not ideal to call from blocking code to block_on, but we are sure that this
// operation does not spawn_blocking other tasks
let res: Result<(), ()> = tokio::runtime::Handle::current().block_on(async {
self.tx.send(Ok(ready)).await.map_err(|_| ())?;
// throttle sending to allow reuse of our buffer in `write`.
self.tx.reserve().await.map_err(|_| ())?;
// now the response task has picked up the buffer and hopefully started
// sending it to the client.
Ok(())
});
if res.is_err() {
return Err(std::io::ErrorKind::BrokenPipe.into());
}
self.written += n;
Ok(n)
}
fn flushed_bytes(&self) -> usize {
self.written
}
}
impl std::io::Write for ChannelWriter {
fn write(&mut self, mut buf: &[u8]) -> std::io::Result<usize> {
let remaining = self.buffer.capacity() - self.buffer.len();
let out_of_space = remaining < buf.len();
let original_len = buf.len();
if out_of_space {
let can_still_fit = buf.len() - remaining;
self.buffer.extend_from_slice(&buf[..can_still_fit]);
buf = &buf[can_still_fit..];
self.flush0()?;
}
// assume that this will often under normal operation just move the pointer back to the
// beginning of allocation, because previous split off parts are already sent and
// dropped.
self.buffer.extend_from_slice(buf);
Ok(original_len)
}
fn flush(&mut self) -> std::io::Result<()> {
self.flush0().map(|_| ())
}
}
let started_at = std::time::Instant::now();
let (tx, rx) = mpsc::channel(1);

View File

@@ -3,6 +3,7 @@ use std::{fmt, str::FromStr};
use anyhow::Context;
use hex::FromHex;
use rand::Rng;
use serde::de::Visitor;
use serde::{Deserialize, Serialize};
use thiserror::Error;
@@ -17,12 +18,74 @@ pub enum IdError {
///
/// NOTE: It (de)serializes as an array of hex bytes, so the string representation would look
/// like `[173,80,132,115,129,226,72,254,170,201,135,108,199,26,228,24]`.
///
/// Use `#[serde_as(as = "DisplayFromStr")]` to (de)serialize it as hex string instead: `ad50847381e248feaac9876cc71ae418`.
/// Check the `serde_with::serde_as` documentation for options for more complex types.
#[derive(Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, PartialOrd, Ord)]
#[derive(Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)]
struct Id([u8; 16]);
impl Serialize for Id {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
if serializer.is_human_readable() {
serializer.collect_str(self)
} else {
self.0.serialize(serializer)
}
}
}
impl<'de> Deserialize<'de> for Id {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: serde::Deserializer<'de>,
{
struct IdVisitor {
is_human_readable_deserializer: bool,
}
impl<'de> Visitor<'de> for IdVisitor {
type Value = Id;
fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
if self.is_human_readable_deserializer {
formatter.write_str("value in form of hex string")
} else {
formatter.write_str("value in form of integer array([u8; 16])")
}
}
fn visit_seq<A>(self, seq: A) -> Result<Self::Value, A::Error>
where
A: serde::de::SeqAccess<'de>,
{
let s = serde::de::value::SeqAccessDeserializer::new(seq);
let id: [u8; 16] = Deserialize::deserialize(s)?;
Ok(Id::from(id))
}
fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
where
E: serde::de::Error,
{
Id::from_str(v).map_err(E::custom)
}
}
if deserializer.is_human_readable() {
deserializer.deserialize_str(IdVisitor {
is_human_readable_deserializer: true,
})
} else {
deserializer.deserialize_tuple(
16,
IdVisitor {
is_human_readable_deserializer: false,
},
)
}
}
}
impl Id {
pub fn get_from_buf(buf: &mut impl bytes::Buf) -> Id {
let mut arr = [0u8; 16];
@@ -308,3 +371,112 @@ impl fmt::Display for NodeId {
write!(f, "{}", self.0)
}
}
#[cfg(test)]
mod tests {
use serde_assert::{Deserializer, Serializer, Token, Tokens};
use crate::bin_ser::BeSer;
use super::*;
#[test]
fn test_id_serde_non_human_readable() {
let original_id = Id([
173, 80, 132, 115, 129, 226, 72, 254, 170, 201, 135, 108, 199, 26, 228, 24,
]);
let expected_tokens = Tokens(vec![
Token::Tuple { len: 16 },
Token::U8(173),
Token::U8(80),
Token::U8(132),
Token::U8(115),
Token::U8(129),
Token::U8(226),
Token::U8(72),
Token::U8(254),
Token::U8(170),
Token::U8(201),
Token::U8(135),
Token::U8(108),
Token::U8(199),
Token::U8(26),
Token::U8(228),
Token::U8(24),
Token::TupleEnd,
]);
let serializer = Serializer::builder().is_human_readable(false).build();
let serialized_tokens = original_id.serialize(&serializer).unwrap();
assert_eq!(serialized_tokens, expected_tokens);
let mut deserializer = Deserializer::builder()
.is_human_readable(false)
.tokens(serialized_tokens)
.build();
let deserialized_id = Id::deserialize(&mut deserializer).unwrap();
assert_eq!(deserialized_id, original_id);
}
#[test]
fn test_id_serde_human_readable() {
let original_id = Id([
173, 80, 132, 115, 129, 226, 72, 254, 170, 201, 135, 108, 199, 26, 228, 24,
]);
let expected_tokens = Tokens(vec![Token::Str(String::from(
"ad50847381e248feaac9876cc71ae418",
))]);
let serializer = Serializer::builder().is_human_readable(true).build();
let serialized_tokens = original_id.serialize(&serializer).unwrap();
assert_eq!(serialized_tokens, expected_tokens);
let mut deserializer = Deserializer::builder()
.is_human_readable(true)
.tokens(Tokens(vec![Token::Str(String::from(
"ad50847381e248feaac9876cc71ae418",
))]))
.build();
assert_eq!(Id::deserialize(&mut deserializer).unwrap(), original_id);
}
macro_rules! roundtrip_type {
($type:ty, $expected_bytes:expr) => {{
let expected_bytes: [u8; 16] = $expected_bytes;
let original_id = <$type>::from(expected_bytes);
let ser_bytes = original_id.ser().unwrap();
assert_eq!(ser_bytes, expected_bytes);
let des_id = <$type>::des(&ser_bytes).unwrap();
assert_eq!(des_id, original_id);
}};
}
#[test]
fn test_id_bincode_serde() {
let expected_bytes = [
173, 80, 132, 115, 129, 226, 72, 254, 170, 201, 135, 108, 199, 26, 228, 24,
];
roundtrip_type!(Id, expected_bytes);
}
#[test]
fn test_tenant_id_bincode_serde() {
let expected_bytes = [
173, 80, 132, 115, 129, 226, 72, 254, 170, 201, 135, 108, 199, 26, 228, 24,
];
roundtrip_type!(TenantId, expected_bytes);
}
#[test]
fn test_timeline_id_bincode_serde() {
let expected_bytes = [
173, 80, 132, 115, 129, 226, 72, 254, 170, 201, 135, 108, 199, 26, 228, 24,
];
roundtrip_type!(TimelineId, expected_bytes);
}
}
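As a quick illustration of what the human-readable split means in practice, here is a sketch (not part of this commit), assuming `serde_json` (human readable) and `bincode` (non human readable) are available as dev-dependencies:

```rust
// Illustrative only: the same id as a hex string in JSON and as 16 raw bytes in bincode.
use std::str::FromStr;
use utils::id::TenantId;

fn id_format_demo() {
    let id = TenantId::from_str("ad50847381e248feaac9876cc71ae418").unwrap();
    // Human-readable serializers go through Display, i.e. the hex string form.
    assert_eq!(
        serde_json::to_string(&id).unwrap(),
        r#""ad50847381e248feaac9876cc71ae418""#
    );
    // Non-human-readable serializers store the underlying [u8; 16] directly.
    assert_eq!(bincode::serialize(&id).unwrap().len(), 16);
}
```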

View File

@@ -24,6 +24,10 @@ pub mod auth;
// utility functions and helper traits for unified unique id generation/serialization etc.
pub mod id;
mod hex;
pub use hex::Hex;
// http endpoint utils
pub mod http;
@@ -74,6 +78,8 @@ pub mod completion;
pub mod error;
pub mod exp_counter;
/// async timeout helper
pub mod timeout;
pub mod sync;

View File

@@ -1,7 +1,7 @@
#![warn(missing_docs)]
use camino::Utf8Path;
use serde::{Deserialize, Serialize};
use serde::{de::Visitor, Deserialize, Serialize};
use std::fmt;
use std::ops::{Add, AddAssign};
use std::str::FromStr;
@@ -13,10 +13,114 @@ use crate::seqwait::MonotonicCounter;
pub const XLOG_BLCKSZ: u32 = 8192;
/// A Postgres LSN (Log Sequence Number), also known as an XLogRecPtr
#[derive(Clone, Copy, Eq, Ord, PartialEq, PartialOrd, Hash, Serialize, Deserialize)]
#[serde(transparent)]
#[derive(Clone, Copy, Eq, Ord, PartialEq, PartialOrd, Hash)]
pub struct Lsn(pub u64);
impl Serialize for Lsn {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
if serializer.is_human_readable() {
serializer.collect_str(self)
} else {
self.0.serialize(serializer)
}
}
}
impl<'de> Deserialize<'de> for Lsn {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: serde::Deserializer<'de>,
{
struct LsnVisitor {
is_human_readable_deserializer: bool,
}
impl<'de> Visitor<'de> for LsnVisitor {
type Value = Lsn;
fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
if self.is_human_readable_deserializer {
formatter.write_str(
"value in form of hex string({upper_u32_hex}/{lower_u32_hex}) representing u64 integer",
)
} else {
formatter.write_str("value in form of integer(u64)")
}
}
fn visit_u64<E>(self, v: u64) -> Result<Self::Value, E>
where
E: serde::de::Error,
{
Ok(Lsn(v))
}
fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
where
E: serde::de::Error,
{
Lsn::from_str(v).map_err(|e| E::custom(e))
}
}
if deserializer.is_human_readable() {
deserializer.deserialize_str(LsnVisitor {
is_human_readable_deserializer: true,
})
} else {
deserializer.deserialize_u64(LsnVisitor {
is_human_readable_deserializer: false,
})
}
}
}
/// Allows (de)serialization of an `Lsn` always as `u64`.
///
/// ### Example
///
/// ```rust
/// # use serde::{Serialize, Deserialize};
/// use utils::lsn::Lsn;
///
/// #[derive(PartialEq, Serialize, Deserialize, Debug)]
/// struct Foo {
/// #[serde(with = "utils::lsn::serde_as_u64")]
/// always_u64: Lsn,
/// }
///
/// let orig = Foo { always_u64: Lsn(1234) };
///
/// let res = serde_json::to_string(&orig).unwrap();
/// assert_eq!(res, r#"{"always_u64":1234}"#);
///
/// let foo = serde_json::from_str::<Foo>(&res).unwrap();
/// assert_eq!(foo, orig);
/// ```
///
pub mod serde_as_u64 {
use super::Lsn;
/// Serializes the Lsn as u64 disregarding the human readability of the format.
///
/// Meant to be used via `#[serde(with = "...")]` or `#[serde(serialize_with = "...")]`.
pub fn serialize<S: serde::Serializer>(lsn: &Lsn, serializer: S) -> Result<S::Ok, S::Error> {
use serde::Serialize;
lsn.0.serialize(serializer)
}
/// Deserializes the Lsn as u64 disregarding the human readability of the format.
///
/// Meant to be used via `#[serde(with = "...")]` or `#[serde(deserialize_with = "...")]`.
pub fn deserialize<'de, D: serde::Deserializer<'de>>(deserializer: D) -> Result<Lsn, D::Error> {
use serde::Deserialize;
u64::deserialize(deserializer).map(Lsn)
}
}
/// We tried to parse an LSN from a string, but failed
#[derive(Debug, PartialEq, Eq, thiserror::Error)]
#[error("LsnParseError")]
@@ -264,8 +368,13 @@ impl MonotonicCounter<Lsn> for RecordLsn {
#[cfg(test)]
mod tests {
use crate::bin_ser::BeSer;
use super::*;
use serde::ser::Serialize;
use serde_assert::{Deserializer, Serializer, Token, Tokens};
#[test]
fn test_lsn_strings() {
assert_eq!("12345678/AAAA5555".parse(), Ok(Lsn(0x12345678AAAA5555)));
@@ -341,4 +450,95 @@ mod tests {
assert_eq!(lsn.fetch_max(Lsn(6000)), Lsn(5678));
assert_eq!(lsn.fetch_max(Lsn(5000)), Lsn(6000));
}
#[test]
fn test_lsn_serde() {
let original_lsn = Lsn(0x0123456789abcdef);
let expected_non_readable_tokens = Tokens(vec![Token::U64(0x0123456789abcdef)]);
let expected_readable_tokens =
Tokens(vec![Token::Str(String::from("1234567/89ABCDEF"))]);
// Testing NON human_readable ser/de
let serializer = Serializer::builder().is_human_readable(false).build();
let non_readable_ser_tokens = original_lsn.serialize(&serializer).unwrap();
assert_eq!(non_readable_ser_tokens, expected_non_readable_tokens);
let mut deserializer = Deserializer::builder()
.is_human_readable(false)
.tokens(non_readable_ser_tokens)
.build();
let des_lsn = Lsn::deserialize(&mut deserializer).unwrap();
assert_eq!(des_lsn, original_lsn);
// Testing human_readable ser/de
let serializer = Serializer::builder().is_human_readable(true).build();
let readable_ser_tokens = original_lsn.serialize(&serializer).unwrap();
assert_eq!(readable_ser_tokens, expected_readable_tokens);
let mut deserializer = Deserializer::builder()
.is_human_readable(true)
.tokens(readable_ser_tokens)
.build();
let des_lsn = Lsn::deserialize(&mut deserializer).unwrap();
assert_eq!(des_lsn, original_lsn);
// Testing mismatching ser/de
let serializer = Serializer::builder().is_human_readable(false).build();
let non_readable_ser_tokens = original_lsn.serialize(&serializer).unwrap();
let mut deserializer = Deserializer::builder()
.is_human_readable(true)
.tokens(non_readable_ser_tokens)
.build();
Lsn::deserialize(&mut deserializer).unwrap_err();
let serializer = Serializer::builder().is_human_readable(true).build();
let readable_ser_tokens = original_lsn.serialize(&serializer).unwrap();
let mut deserializer = Deserializer::builder()
.is_human_readable(false)
.tokens(readable_ser_tokens)
.build();
Lsn::deserialize(&mut deserializer).unwrap_err();
}
#[test]
fn test_lsn_ensure_roundtrip() {
let original_lsn = Lsn(0xaaaabbbb);
let serializer = Serializer::builder().is_human_readable(false).build();
let ser_tokens = original_lsn.serialize(&serializer).unwrap();
let mut deserializer = Deserializer::builder()
.is_human_readable(false)
.tokens(ser_tokens)
.build();
let des_lsn = Lsn::deserialize(&mut deserializer).unwrap();
assert_eq!(des_lsn, original_lsn);
}
#[test]
fn test_lsn_bincode_serde() {
let lsn = Lsn(0x0123456789abcdef);
let expected_bytes = [0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef];
let ser_bytes = lsn.ser().unwrap();
assert_eq!(ser_bytes, expected_bytes);
let des_lsn = Lsn::des(&ser_bytes).unwrap();
assert_eq!(des_lsn, lsn);
}
#[test]
fn test_lsn_bincode_ensure_roundtrip() {
let original_lsn = Lsn(0x01_02_03_04_05_06_07_08);
let expected_bytes = vec![0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08];
let ser_bytes = original_lsn.ser().unwrap();
assert_eq!(ser_bytes, expected_bytes);
let des_lsn = Lsn::des(&ser_bytes).unwrap();
assert_eq!(des_lsn, original_lsn);
}
}

View File

@@ -3,7 +3,6 @@ use std::time::{Duration, SystemTime};
use bytes::{Buf, BufMut, Bytes, BytesMut};
use pq_proto::{read_cstr, PG_EPOCH};
use serde::{Deserialize, Serialize};
use serde_with::{serde_as, DisplayFromStr};
use tracing::{trace, warn};
use crate::lsn::Lsn;
@@ -15,21 +14,17 @@ use crate::lsn::Lsn;
///
/// serde Serialize is used only for human readable dump to json (e.g. in
/// safekeepers debug_dump).
#[serde_as]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub struct PageserverFeedback {
/// Last known size of the timeline. Used to enforce timeline size limit.
pub current_timeline_size: u64,
/// LSN last received and ingested by the pageserver. Controls backpressure.
#[serde_as(as = "DisplayFromStr")]
pub last_received_lsn: Lsn,
/// LSN up to which data is persisted by the pageserver to its local disc.
/// Controls backpressure.
#[serde_as(as = "DisplayFromStr")]
pub disk_consistent_lsn: Lsn,
/// LSN up to which data is persisted by the pageserver on s3; safekeepers
/// may remove WAL before it.
#[serde_as(as = "DisplayFromStr")]
pub remote_consistent_lsn: Lsn,
// Serialize with RFC3339 format.
#[serde(with = "serde_systemtime")]

View File

@@ -1 +1,3 @@
pub mod heavier_once_cell;
pub mod gate;

151
libs/utils/src/sync/gate.rs Normal file
View File

@@ -0,0 +1,151 @@
use std::{sync::Arc, time::Duration};
/// Gates are a concurrency helper, primarily used for implementing safe shutdown.
///
/// Users of a resource call `enter()` to acquire a GateGuard, and the owner of
/// the resource calls `close()` when they want to ensure that all holders of guards
/// have released them, and that no future guards will be issued.
pub struct Gate {
/// Each caller of enter() takes one unit from the semaphore. In close(), we
/// take all the units to ensure all GateGuards are destroyed.
sem: Arc<tokio::sync::Semaphore>,
/// For observability only: a name that will be used to log warnings if a particular
/// gate is holding up shutdown
name: String,
}
/// RAII guard for a [`Gate`]: as long as this exists, calls to [`Gate::close`] will
/// not complete.
#[derive(Debug)]
pub struct GateGuard(tokio::sync::OwnedSemaphorePermit);
/// Observability helper: every `warn_period`, emit a log warning that we're still waiting on this gate
async fn warn_if_stuck<Fut: std::future::Future>(
fut: Fut,
name: &str,
warn_period: std::time::Duration,
) -> <Fut as std::future::Future>::Output {
let started = std::time::Instant::now();
let mut fut = std::pin::pin!(fut);
loop {
match tokio::time::timeout(warn_period, &mut fut).await {
Ok(ret) => return ret,
Err(_) => {
tracing::warn!(
gate = name,
elapsed_ms = started.elapsed().as_millis(),
"still waiting, taking longer than expected..."
);
}
}
}
}
#[derive(Debug)]
pub enum GateError {
GateClosed,
}
impl Gate {
const MAX_UNITS: u32 = u32::MAX;
pub fn new(name: String) -> Self {
Self {
sem: Arc::new(tokio::sync::Semaphore::new(Self::MAX_UNITS as usize)),
name,
}
}
/// Acquire a guard that will prevent close() calls from completing. If close()
/// was already called, this will return an error which should be interpreted
/// as "shutting down".
///
/// This function would typically be used from e.g. request handlers. While holding
/// the guard returned from this function, it is important to respect a CancellationToken
/// to avoid blocking close() indefinitely: typically types that contain a Gate will
/// also contain a CancellationToken.
pub fn enter(&self) -> Result<GateGuard, GateError> {
self.sem
.clone()
.try_acquire_owned()
.map(GateGuard)
.map_err(|_| GateError::GateClosed)
}
/// Types with a shutdown() method and a gate should call this method at the
/// end of shutdown, to ensure that all GateGuard holders are done.
///
/// This will wait for all guards to be destroyed. For this to complete promptly, it is
/// important that the holders of such guards are respecting a CancellationToken which has
/// been cancelled before entering this function.
pub async fn close(&self) {
warn_if_stuck(self.do_close(), &self.name, Duration::from_millis(1000)).await
}
async fn do_close(&self) {
tracing::debug!(gate = self.name, "Closing Gate...");
match self.sem.acquire_many(Self::MAX_UNITS).await {
Ok(_units) => {
// While holding all units, close the semaphore. All subsequent calls to enter() will fail.
self.sem.close();
}
Err(_) => {
// Semaphore closed: we are the only function that can do this, so it indicates a double-call.
// This is legal. Timeline::shutdown for example is not protected from being called more than
// once.
tracing::debug!(gate = self.name, "Double close")
}
}
tracing::debug!(gate = self.name, "Closed Gate.")
}
}
#[cfg(test)]
mod tests {
use futures::FutureExt;
use super::*;
#[tokio::test]
async fn test_idle_gate() {
// Having taken no gates, we should not be blocked in close
let gate = Gate::new("test".to_string());
gate.close().await;
// If a guard is dropped before entering, close should not be blocked
let gate = Gate::new("test".to_string());
let guard = gate.enter().unwrap();
drop(guard);
gate.close().await;
// Entering a closed guard fails
gate.enter().expect_err("enter should fail after close");
}
#[tokio::test]
async fn test_busy_gate() {
let gate = Gate::new("test".to_string());
let guard = gate.enter().unwrap();
let mut close_fut = std::pin::pin!(gate.close());
// Close should be blocked
assert!(close_fut.as_mut().now_or_never().is_none());
// Attempting to enter() should fail, even though close isn't done yet.
gate.enter()
.expect_err("enter should fail after entering close");
drop(guard);
// Guard is gone, close should finish
assert!(close_fut.as_mut().now_or_never().is_some());
// Attempting to enter() is still forbidden
gate.enter().expect_err("enter should fail finishing close");
}
}
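For context, a minimal sketch (not part of this commit) of the intended usage pattern from the doc comments above, where the gate owner also holds a `CancellationToken`; the `Service` type and its methods are hypothetical.

```rust
// Hypothetical owner type combining a Gate with a CancellationToken.
use tokio_util::sync::CancellationToken;

struct Service {
    gate: Gate,
    cancel: CancellationToken,
}

impl Service {
    async fn handle_request(&self) -> Result<(), GateError> {
        // Refuse new work once close() has been called.
        let _guard = self.gate.enter()?;
        tokio::select! {
            // Stand-in for the actual request work done while holding the guard.
            _ = tokio::time::sleep(std::time::Duration::from_millis(10)) => {}
            // Respecting the token is what keeps close() from waiting forever.
            _ = self.cancel.cancelled() => {}
        }
        Ok(())
    }

    async fn shutdown(&self) {
        // Cancel first so guard holders bail out, then wait for all guards to be dropped.
        self.cancel.cancel();
        self.gate.close().await;
    }
}
```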

View File

@@ -1,4 +1,7 @@
use std::sync::{Arc, Mutex, MutexGuard};
use std::sync::{
atomic::{AtomicUsize, Ordering},
Arc, Mutex, MutexGuard,
};
use tokio::sync::Semaphore;
/// Custom design like [`tokio::sync::OnceCell`] but using [`OwnedSemaphorePermit`] instead of
@@ -10,6 +13,7 @@ use tokio::sync::Semaphore;
/// [`OwnedSemaphorePermit`]: tokio::sync::OwnedSemaphorePermit
pub struct OnceCell<T> {
inner: Mutex<Inner<T>>,
initializers: AtomicUsize,
}
impl<T> Default for OnceCell<T> {
@@ -17,6 +21,7 @@ impl<T> Default for OnceCell<T> {
fn default() -> Self {
Self {
inner: Default::default(),
initializers: AtomicUsize::new(0),
}
}
}
@@ -49,6 +54,7 @@ impl<T> OnceCell<T> {
init_semaphore: Arc::new(sem),
value: Some(value),
}),
initializers: AtomicUsize::new(0),
}
}
@@ -60,8 +66,8 @@ impl<T> OnceCell<T> {
/// Initialization is panic-safe and cancellation-safe.
pub async fn get_or_init<F, Fut, E>(&self, factory: F) -> Result<Guard<'_, T>, E>
where
F: FnOnce() -> Fut,
Fut: std::future::Future<Output = Result<T, E>>,
F: FnOnce(InitPermit) -> Fut,
Fut: std::future::Future<Output = Result<(T, InitPermit), E>>,
{
let sem = {
let guard = self.inner.lock().unwrap();
@@ -71,29 +77,61 @@ impl<T> OnceCell<T> {
guard.init_semaphore.clone()
};
let permit = sem.acquire_owned().await;
if permit.is_err() {
let guard = self.inner.lock().unwrap();
assert!(
guard.value.is_some(),
"semaphore got closed, must be initialized"
);
return Ok(Guard(guard));
} else {
// now we try
let value = factory().await?;
let permit = {
// increment the count for the duration of queued
let _guard = CountWaitingInitializers::start(self);
sem.acquire_owned().await
};
let mut guard = self.inner.lock().unwrap();
assert!(
guard.value.is_none(),
"we won permit, must not be initialized"
);
guard.value = Some(value);
guard.init_semaphore.close();
Ok(Guard(guard))
match permit {
Ok(permit) => {
let permit = InitPermit(permit);
let (value, _permit) = factory(permit).await?;
let guard = self.inner.lock().unwrap();
Ok(Self::set0(value, guard))
}
Err(_closed) => {
let guard = self.inner.lock().unwrap();
assert!(
guard.value.is_some(),
"semaphore got closed, must be initialized"
);
return Ok(Guard(guard));
}
}
}
/// Assuming a permit is held after previous call to [`Guard::take_and_deinit`], it can be used
/// to complete initializing the inner value.
///
/// # Panics
///
/// If the inner has already been initialized.
pub fn set(&self, value: T, _permit: InitPermit) -> Guard<'_, T> {
let guard = self.inner.lock().unwrap();
// cannot assert that this permit is for self.inner.semaphore, but we can assert it cannot
// give more permits right now.
if guard.init_semaphore.try_acquire().is_ok() {
drop(guard);
panic!("permit is of wrong origin");
}
Self::set0(value, guard)
}
fn set0(value: T, mut guard: std::sync::MutexGuard<'_, Inner<T>>) -> Guard<'_, T> {
if guard.value.is_some() {
drop(guard);
unreachable!("we won permit, must not be initialized");
}
guard.value = Some(value);
guard.init_semaphore.close();
Guard(guard)
}
/// Returns a guard to an existing initialized value, if any.
pub fn get(&self) -> Option<Guard<'_, T>> {
let guard = self.inner.lock().unwrap();
@@ -103,6 +141,28 @@ impl<T> OnceCell<T> {
None
}
}
/// Return the number of [`Self::get_or_init`] calls waiting for initialization to complete.
pub fn initializer_count(&self) -> usize {
self.initializers.load(Ordering::Relaxed)
}
}
/// DropGuard counter for queued tasks waiting to initialize; mainly useful to the
/// initializing task, for example at the end of initialization.
struct CountWaitingInitializers<'a, T>(&'a OnceCell<T>);
impl<'a, T> CountWaitingInitializers<'a, T> {
fn start(target: &'a OnceCell<T>) -> Self {
target.initializers.fetch_add(1, Ordering::Relaxed);
CountWaitingInitializers(target)
}
}
impl<'a, T> Drop for CountWaitingInitializers<'a, T> {
fn drop(&mut self) {
self.0.initializers.fetch_sub(1, Ordering::Relaxed);
}
}
/// Uninteresting guard object to allow short-lived access to inspect or clone the held,
@@ -135,7 +195,7 @@ impl<'a, T> Guard<'a, T> {
///
/// The permit will be on a semaphore part of the new internal value, and any following
/// [`OnceCell::get_or_init`] will wait on it to complete.
pub fn take_and_deinit(&mut self) -> (T, tokio::sync::OwnedSemaphorePermit) {
pub fn take_and_deinit(&mut self) -> (T, InitPermit) {
let mut swapped = Inner::default();
let permit = swapped
.init_semaphore
@@ -145,11 +205,14 @@ impl<'a, T> Guard<'a, T> {
std::mem::swap(&mut *self.0, &mut swapped);
swapped
.value
.map(|v| (v, permit))
.map(|v| (v, InitPermit(permit)))
.expect("guard is not created unless value has been initialized")
}
}
/// Type held by OnceCell (de)initializing task.
pub struct InitPermit(tokio::sync::OwnedSemaphorePermit);
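A minimal usage sketch of the changed API (illustrative, not part of the diff): the factory closure now receives the `InitPermit` and must hand it back together with the value. `init_example` and the value `42` are hypothetical; only `OnceCell`, `get_or_init` and `InitPermit` come from the code above.
async fn init_example(cell: &OnceCell<u32>) -> u32 {
    let guard = cell
        .get_or_init(|permit| async move {
            // expensive initialization would happen here; return the permit with the value
            Ok::<_, std::convert::Infallible>((42, permit))
        })
        .await
        .expect("factory is infallible");
    *guard
}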
#[cfg(test)]
mod tests {
use super::*;
@@ -185,11 +248,11 @@ mod tests {
barrier.wait().await;
let won = {
let g = cell
.get_or_init(|| {
.get_or_init(|permit| {
counters.factory_got_to_run.fetch_add(1, Ordering::Relaxed);
async {
counters.future_polled.fetch_add(1, Ordering::Relaxed);
Ok::<_, Infallible>(i)
Ok::<_, Infallible>((i, permit))
}
})
.await
@@ -243,7 +306,7 @@ mod tests {
deinitialization_started.wait().await;
let started_at = tokio::time::Instant::now();
cell.get_or_init(|| async { Ok::<_, Infallible>(reinit) })
cell.get_or_init(|permit| async { Ok::<_, Infallible>((reinit, permit)) })
.await
.unwrap();
@@ -258,18 +321,32 @@ mod tests {
assert_eq!(*cell.get().unwrap(), reinit);
}
#[test]
fn reinit_with_deinit_permit() {
let cell = Arc::new(OnceCell::new(42));
let (mol, permit) = cell.get().unwrap().take_and_deinit();
cell.set(5, permit);
assert_eq!(*cell.get().unwrap(), 5);
let (five, permit) = cell.get().unwrap().take_and_deinit();
assert_eq!(5, five);
cell.set(mol, permit);
assert_eq!(*cell.get().unwrap(), 42);
}
#[tokio::test]
async fn initialization_attemptable_until_ok() {
let cell = OnceCell::default();
for _ in 0..10 {
cell.get_or_init(|| async { Err("whatever error") })
cell.get_or_init(|_permit| async { Err("whatever error") })
.await
.unwrap_err();
}
let g = cell
.get_or_init(|| async { Ok::<_, Infallible>("finally success") })
.get_or_init(|permit| async { Ok::<_, Infallible>(("finally success", permit)) })
.await
.unwrap();
assert_eq!(*g, "finally success");
@@ -281,11 +358,11 @@ mod tests {
let barrier = tokio::sync::Barrier::new(2);
let initializer = cell.get_or_init(|| async {
let initializer = cell.get_or_init(|permit| async {
barrier.wait().await;
futures::future::pending::<()>().await;
Ok::<_, Infallible>("never reached")
Ok::<_, Infallible>(("never reached", permit))
});
tokio::select! {
@@ -298,7 +375,7 @@ mod tests {
assert!(cell.get().is_none());
let g = cell
.get_or_init(|| async { Ok::<_, Infallible>("now initialized") })
.get_or_init(|permit| async { Ok::<_, Infallible>(("now initialized", permit)) })
.await
.unwrap();
assert_eq!(*g, "now initialized");

libs/utils/src/timeout.rs (new file, 37 lines)

@@ -0,0 +1,37 @@
use std::time::Duration;
use tokio_util::sync::CancellationToken;
pub enum TimeoutCancellableError {
Timeout,
Cancelled,
}
/// Wrap [`tokio::time::timeout`] with a CancellationToken.
///
/// This wrapper is appropriate for any long running operation in a task
/// that ought to respect a CancellationToken (which means most tasks).
///
/// The only time you should use a bare tokio::timeout is when the future `F`
/// itself respects a CancellationToken: otherwise, always use this wrapper
/// with your CancellationToken to ensure that your task does not hold up
/// graceful shutdown.
pub async fn timeout_cancellable<F>(
duration: Duration,
cancel: &CancellationToken,
future: F,
) -> Result<F::Output, TimeoutCancellableError>
where
F: std::future::Future,
{
tokio::select!(
r = tokio::time::timeout(duration, future) => {
r.map_err(|_| TimeoutCancellableError::Timeout)
},
_ = cancel.cancelled() => {
Err(TimeoutCancellableError::Cancelled)
}
)
}
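A minimal usage sketch (illustrative, not part of the diff): wrapping a long-running future so it stops on either a deadline or cancellation of the token. `copy_with_deadline`, the inner future and the 10-second duration are hypothetical; the helper and error type are the ones defined above.
async fn copy_with_deadline(cancel: &CancellationToken) -> Result<u64, TimeoutCancellableError> {
    let work = async {
        // stand-in for a long-running operation that does not itself watch the token
        42u64
    };
    timeout_cancellable(Duration::from_secs(10), cancel, work).await
}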


@@ -21,11 +21,6 @@ pub struct FileCacheState {
#[derive(Debug)]
pub struct FileCacheConfig {
/// Whether the file cache is *actually* stored in memory (e.g. by writing to
/// a tmpfs or shmem file). If true, the size of the file cache will be counted against the
/// memory available for the cgroup.
pub(crate) in_memory: bool,
/// The size of the file cache, in terms of the size of the resource it consumes
/// (currently: only memory)
///
@@ -59,22 +54,9 @@ pub struct FileCacheConfig {
spread_factor: f64,
}
impl FileCacheConfig {
pub fn default_in_memory() -> Self {
impl Default for FileCacheConfig {
fn default() -> Self {
Self {
in_memory: true,
// 75 %
resource_multiplier: 0.75,
// 640 MiB; (512 + 128)
min_remaining_after_cache: NonZeroU64::new(640 * MiB).unwrap(),
// ensure any increase in file cache size is split 90-10 with 10% to other memory
spread_factor: 0.1,
}
}
pub fn default_on_disk() -> Self {
Self {
in_memory: false,
resource_multiplier: 0.75,
// 256 MiB - lower than when in memory because overcommitting is safe; if we don't have
// memory, the kernel will just evict from its page cache, rather than e.g. killing
@@ -83,7 +65,9 @@ impl FileCacheConfig {
spread_factor: 0.1,
}
}
}
impl FileCacheConfig {
/// Make sure fields of the config are consistent.
pub fn validate(&self) -> anyhow::Result<()> {
// Single field validity


@@ -39,16 +39,6 @@ pub struct Args {
#[arg(short, long)]
pub pgconnstr: Option<String>,
/// Flag to signal that the Postgres file cache is on disk (i.e. not in memory aside from the
/// kernel's page cache), and therefore should not count against available memory.
//
// NB: Ideally this flag would directly refer to whether the file cache is in memory (rather
// than a roundabout way, via whether it's on disk), but in order to be backwards compatible
// during the switch away from an in-memory file cache, we had to default to the previous
// behavior.
#[arg(long)]
pub file_cache_on_disk: bool,
/// The address we should listen on for connection requests. For the
/// agent, this is 0.0.0.0:10301. For the informant, this is 127.0.0.1:10369.
#[arg(short, long)]


@@ -156,10 +156,7 @@ impl Runner {
// memory limits.
if let Some(connstr) = &args.pgconnstr {
info!("initializing file cache");
let config = match args.file_cache_on_disk {
true => FileCacheConfig::default_on_disk(),
false => FileCacheConfig::default_in_memory(),
};
let config = FileCacheConfig::default();
let mut file_cache = FileCacheState::new(connstr, config, token.clone())
.await
@@ -187,10 +184,7 @@ impl Runner {
info!("file cache size actually got set to {actual_size}")
}
if args.file_cache_on_disk {
file_cache_disk_size = actual_size;
}
file_cache_disk_size = actual_size;
state.filecache = Some(file_cache);
}
@@ -239,17 +233,11 @@ impl Runner {
let requested_mem = target.mem;
let usable_system_memory = requested_mem.saturating_sub(self.config.sys_buffer_bytes);
let (expected_file_cache_size, expected_file_cache_disk_size) = self
let expected_file_cache_size = self
.filecache
.as_ref()
.map(|file_cache| {
let size = file_cache.config.calculate_cache_size(usable_system_memory);
match file_cache.config.in_memory {
true => (size, 0),
false => (size, size),
}
})
.unwrap_or((0, 0));
.map(|file_cache| file_cache.config.calculate_cache_size(usable_system_memory))
.unwrap_or(0);
if let Some(cgroup) = &self.cgroup {
let (last_time, last_history) = *cgroup.watcher.borrow();
@@ -273,7 +261,7 @@ impl Runner {
let new_threshold = self
.config
.cgroup_threshold(usable_system_memory, expected_file_cache_disk_size);
.cgroup_threshold(usable_system_memory, expected_file_cache_size);
let current = last_history.avg_non_reclaimable;
@@ -300,13 +288,10 @@ impl Runner {
.set_file_cache_size(expected_file_cache_size)
.await
.context("failed to set file cache size")?;
if !file_cache.config.in_memory {
file_cache_disk_size = actual_usage;
}
file_cache_disk_size = actual_usage;
let message = format!(
"set file cache size to {} MiB (in memory = {})",
"set file cache size to {} MiB",
bytes_to_mebibytes(actual_usage),
file_cache.config.in_memory,
);
info!("downscale: {message}");
status.push(message);
@@ -357,9 +342,7 @@ impl Runner {
.set_file_cache_size(expected_usage)
.await
.context("failed to set file cache size")?;
if !file_cache.config.in_memory {
file_cache_disk_size = actual_usage;
}
file_cache_disk_size = actual_usage;
if actual_usage != expected_usage {
warn!(


@@ -266,7 +266,7 @@ async fn calculate_synthetic_size_worker(
continue;
}
if let Ok(tenant) = mgr::get_tenant(tenant_id, true).await {
if let Ok(tenant) = mgr::get_tenant(tenant_id, true) {
// TODO should we use concurrent_background_tasks_rate_limit() here, like the other background tasks?
// We can put in some prioritization for consumption metrics.
// Same for the loop that fetches computed metrics.


@@ -3,7 +3,6 @@ use anyhow::Context;
use chrono::{DateTime, Utc};
use consumption_metrics::EventType;
use futures::stream::StreamExt;
use serde_with::serde_as;
use std::{sync::Arc, time::SystemTime};
use utils::{
id::{TenantId, TimelineId},
@@ -42,13 +41,10 @@ pub(super) enum Name {
///
/// This is a denormalization done at the MetricsKey const methods; these should not be constructed
/// elsewhere.
#[serde_with::serde_as]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize)]
pub(crate) struct MetricsKey {
#[serde_as(as = "serde_with::DisplayFromStr")]
pub(super) tenant_id: TenantId,
#[serde_as(as = "Option<serde_with::DisplayFromStr>")]
#[serde(skip_serializing_if = "Option::is_none")]
pub(super) timeline_id: Option<TimelineId>,
@@ -206,7 +202,6 @@ pub(super) async fn collect_all_metrics(
None
} else {
crate::tenant::mgr::get_tenant(id, true)
.await
.ok()
.map(|tenant| (id, tenant))
}


@@ -1,5 +1,4 @@
use consumption_metrics::{Event, EventChunk, IdempotencyKey, CHUNK_SIZE};
use serde_with::serde_as;
use tokio_util::sync::CancellationToken;
use tracing::Instrument;
@@ -7,12 +6,9 @@ use super::{metrics::Name, Cache, MetricsKey, RawMetric};
use utils::id::{TenantId, TimelineId};
/// How the metrics from pageserver are identified.
#[serde_with::serde_as]
#[derive(serde::Serialize, serde::Deserialize, Debug, Clone, Copy, PartialEq)]
struct Ids {
#[serde_as(as = "serde_with::DisplayFromStr")]
pub(super) tenant_id: TenantId,
#[serde_as(as = "Option<serde_with::DisplayFromStr>")]
#[serde(skip_serializing_if = "Option::is_none")]
pub(super) timeline_id: Option<TimelineId>,
}


@@ -57,7 +57,10 @@ impl ControlPlaneClient {
if let Some(jwt) = &conf.control_plane_api_token {
let mut headers = hyper::HeaderMap::new();
headers.insert("Authorization", jwt.get_contents().parse().unwrap());
headers.insert(
"Authorization",
format!("Bearer {}", jwt.get_contents()).parse().unwrap(),
);
client = client.default_headers(headers);
}


@@ -10,6 +10,7 @@ use crate::control_plane_client::ControlPlaneGenerationsApi;
use crate::metrics;
use crate::tenant::remote_timeline_client::remote_layer_path;
use crate::tenant::remote_timeline_client::remote_timeline_path;
use crate::virtual_file::MaybeFatalIo;
use crate::virtual_file::VirtualFile;
use anyhow::Context;
use camino::Utf8PathBuf;
@@ -17,7 +18,6 @@ use hex::FromHex;
use remote_storage::{GenericRemoteStorage, RemotePath};
use serde::Deserialize;
use serde::Serialize;
use serde_with::serde_as;
use thiserror::Error;
use tokio;
use tokio_util::sync::CancellationToken;
@@ -214,7 +214,6 @@ where
/// during recovery at startup.
const TEMP_SUFFIX: &str = "tmp";
#[serde_as]
#[derive(Debug, Serialize, Deserialize)]
struct DeletionList {
/// Serialization version, for future use
@@ -243,7 +242,6 @@ struct DeletionList {
validated: bool,
}
#[serde_as]
#[derive(Debug, Serialize, Deserialize)]
struct DeletionHeader {
/// Serialization version, for future use
@@ -271,7 +269,9 @@ impl DeletionHeader {
let temp_path = path_with_suffix_extension(&header_path, TEMP_SUFFIX);
VirtualFile::crashsafe_overwrite(&header_path, &temp_path, &header_bytes)
.await
.map_err(Into::into)
.maybe_fatal_err("save deletion header")?;
Ok(())
}
}
@@ -360,6 +360,7 @@ impl DeletionList {
let bytes = serde_json::to_vec(self).expect("Failed to serialize deletion list");
VirtualFile::crashsafe_overwrite(&path, &temp_path, &bytes)
.await
.maybe_fatal_err("save deletion list")
.map_err(Into::into)
}
}


@@ -34,6 +34,8 @@ use crate::deletion_queue::TEMP_SUFFIX;
use crate::metrics;
use crate::tenant::remote_timeline_client::remote_layer_path;
use crate::tenant::storage_layer::LayerFileName;
use crate::virtual_file::on_fatal_io_error;
use crate::virtual_file::MaybeFatalIo;
// The number of keys in a DeletionList before we will proactively persist it
// (without reaching a flush deadline). This aims to deliver objects of the order
@@ -195,7 +197,7 @@ impl ListWriter {
debug!("Deletion header {header_path} not found, first start?");
Ok(None)
} else {
Err(anyhow::anyhow!(e))
on_fatal_io_error(&e, "reading deletion header");
}
}
}
@@ -216,16 +218,9 @@ impl ListWriter {
self.pending.sequence = validated_sequence + 1;
let deletion_directory = self.conf.deletion_prefix();
let mut dir = match tokio::fs::read_dir(&deletion_directory).await {
Ok(d) => d,
Err(e) => {
warn!("Failed to open deletion list directory {deletion_directory}: {e:#}");
// Give up: if we can't read the deletion list directory, we probably can't
// write lists into it later, so the queue won't work.
return Err(e.into());
}
};
let mut dir = tokio::fs::read_dir(&deletion_directory)
.await
.fatal_err("read deletion directory");
let list_name_pattern =
Regex::new("(?<sequence>[a-zA-Z0-9]{16})-(?<version>[a-zA-Z0-9]{2}).list").unwrap();
@@ -233,7 +228,7 @@ impl ListWriter {
let temp_extension = format!(".{TEMP_SUFFIX}");
let header_path = self.conf.deletion_header_path();
let mut seqs: Vec<u64> = Vec::new();
while let Some(dentry) = dir.next_entry().await? {
while let Some(dentry) = dir.next_entry().await.fatal_err("read deletion dentry") {
let file_name = dentry.file_name();
let dentry_str = file_name.to_string_lossy();
@@ -246,11 +241,9 @@ impl ListWriter {
info!("Cleaning up temporary file {dentry_str}");
let absolute_path =
deletion_directory.join(dentry.file_name().to_str().expect("non-Unicode path"));
if let Err(e) = tokio::fs::remove_file(&absolute_path).await {
// Non-fatal error: we will just leave the file behind but not
// try and load it.
warn!("Failed to clean up temporary file {absolute_path}: {e:#}");
}
tokio::fs::remove_file(&absolute_path)
.await
.fatal_err("delete temp file");
continue;
}
@@ -290,7 +283,9 @@ impl ListWriter {
for s in seqs {
let list_path = self.conf.deletion_list_path(s);
let list_bytes = tokio::fs::read(&list_path).await?;
let list_bytes = tokio::fs::read(&list_path)
.await
.fatal_err("read deletion list");
let mut deletion_list = match serde_json::from_slice::<DeletionList>(&list_bytes) {
Ok(l) => l,


@@ -28,6 +28,7 @@ use crate::config::PageServerConf;
use crate::control_plane_client::ControlPlaneGenerationsApi;
use crate::control_plane_client::RetryForeverError;
use crate::metrics;
use crate::virtual_file::MaybeFatalIo;
use super::deleter::DeleterMessage;
use super::DeletionHeader;
@@ -287,16 +288,9 @@ where
async fn cleanup_lists(&mut self, list_paths: Vec<Utf8PathBuf>) {
for list_path in list_paths {
debug!("Removing deletion list {list_path}");
if let Err(e) = tokio::fs::remove_file(&list_path).await {
// Unexpected: we should have permissions and nothing else should
// be touching these files. We will leave the file behind. Subsequent
// pageservers will try and load it again: hopefully whatever storage
// issue (probably permissions) has been fixed by then.
tracing::error!("Failed to delete {list_path}: {e:#}");
metrics::DELETION_QUEUE.unexpected_errors.inc();
break;
}
tokio::fs::remove_file(&list_path)
.await
.fatal_err("remove deletion list");
}
}


@@ -403,7 +403,7 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
return (evicted_bytes, evictions_failed);
};
let results = timeline.evict_layers(&batch, &cancel).await;
let results = timeline.evict_layers(&batch).await;
match results {
Ok(results) => {
@@ -545,7 +545,7 @@ async fn collect_eviction_candidates(
if cancel.is_cancelled() {
return Ok(EvictionCandidates::Cancelled);
}
let tenant = match tenant::mgr::get_tenant(*tenant_id, true).await {
let tenant = match tenant::mgr::get_tenant(*tenant_id, true) {
Ok(tenant) => tenant,
Err(e) => {
// this can happen if the tenant has a lifecycle transition after we fetched it
@@ -554,6 +554,11 @@ async fn collect_eviction_candidates(
}
};
if tenant.cancel.is_cancelled() {
info!(%tenant_id, "Skipping tenant for eviction, it is shutting down");
continue;
}
// collect layers from all timelines in this tenant
//
// If one of the timelines becomes `!is_active()` during the iteration,


@@ -569,7 +569,17 @@ paths:
schema:
$ref: "#/components/schemas/NotFoundError"
"409":
description: Tenant download is already in progress
description: |
The tenant is already known to Pageserver in some way,
and hence this `/attach` call has been rejected.
Some examples of how this can happen:
- tenant was created on this pageserver
- tenant attachment was started by an earlier call to `/attach`.
Callers should poll the tenant status's `attachment_status` field,
like for status 202. See the longer description for `POST /attach`
for details.
content:
application/json:
schema:


@@ -17,7 +17,6 @@ use pageserver_api::models::{
TenantLoadRequest, TenantLocationConfigRequest,
};
use remote_storage::GenericRemoteStorage;
use serde_with::{serde_as, DisplayFromStr};
use tenant_size_model::{SizeResult, StorageModel};
use tokio_util::sync::CancellationToken;
use tracing::*;
@@ -36,7 +35,8 @@ use crate::pgdatadir_mapping::LsnForTimestamp;
use crate::task_mgr::TaskKind;
use crate::tenant::config::{LocationConf, TenantConfOpt};
use crate::tenant::mgr::{
GetTenantError, SetNewTenantConfigError, TenantMapInsertError, TenantStateError,
GetTenantError, SetNewTenantConfigError, TenantMapError, TenantMapInsertError, TenantSlotError,
TenantSlotUpsertError, TenantStateError,
};
use crate::tenant::size::ModelInputs;
use crate::tenant::storage_layer::LayerAccessStatsReset;
@@ -147,28 +147,59 @@ impl From<PageReconstructError> for ApiError {
impl From<TenantMapInsertError> for ApiError {
fn from(tmie: TenantMapInsertError) -> ApiError {
match tmie {
TenantMapInsertError::StillInitializing | TenantMapInsertError::ShuttingDown => {
ApiError::ResourceUnavailable(format!("{tmie}").into())
}
TenantMapInsertError::TenantAlreadyExists(id, state) => {
ApiError::Conflict(format!("tenant {id} already exists, state: {state:?}"))
}
TenantMapInsertError::TenantExistsSecondary(id) => {
ApiError::Conflict(format!("tenant {id} already exists as secondary"))
}
TenantMapInsertError::SlotError(e) => e.into(),
TenantMapInsertError::SlotUpsertError(e) => e.into(),
TenantMapInsertError::Other(e) => ApiError::InternalServerError(e),
}
}
}
impl From<TenantSlotError> for ApiError {
fn from(e: TenantSlotError) -> ApiError {
use TenantSlotError::*;
match e {
NotFound(tenant_id) => {
ApiError::NotFound(anyhow::anyhow!("NotFound: tenant {tenant_id}").into())
}
e @ (AlreadyExists(_, _) | Conflict(_)) => ApiError::Conflict(format!("{e}")),
InProgress => {
ApiError::ResourceUnavailable("Tenant is being modified concurrently".into())
}
MapState(e) => e.into(),
}
}
}
impl From<TenantSlotUpsertError> for ApiError {
fn from(e: TenantSlotUpsertError) -> ApiError {
use TenantSlotUpsertError::*;
match e {
InternalError(e) => ApiError::InternalServerError(anyhow::anyhow!("{e}")),
MapState(e) => e.into(),
}
}
}
impl From<TenantMapError> for ApiError {
fn from(e: TenantMapError) -> ApiError {
use TenantMapError::*;
match e {
StillInitializing | ShuttingDown => {
ApiError::ResourceUnavailable(format!("{e}").into())
}
}
}
}
impl From<TenantStateError> for ApiError {
fn from(tse: TenantStateError) -> ApiError {
match tse {
TenantStateError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid).into()),
TenantStateError::IsStopping(_) => {
ApiError::ResourceUnavailable("Tenant is stopping".into())
}
_ => ApiError::InternalServerError(anyhow::Error::new(tse)),
TenantStateError::SlotError(e) => e.into(),
TenantStateError::SlotUpsertError(e) => e.into(),
TenantStateError::Other(e) => ApiError::InternalServerError(anyhow!(e)),
}
}
}
@@ -189,6 +220,7 @@ impl From<GetTenantError> for ApiError {
// (We can produce this variant only in `mgr::get_tenant(..., active=true)` calls).
ApiError::ResourceUnavailable("Tenant not yet active".into())
}
GetTenantError::MapState(e) => ApiError::ResourceUnavailable(format!("{e}").into()),
}
}
}
@@ -243,6 +275,9 @@ impl From<crate::tenant::delete::DeleteTenantError> for ApiError {
Get(g) => ApiError::from(g),
e @ AlreadyInProgress => ApiError::Conflict(e.to_string()),
Timeline(t) => ApiError::from(t),
NotAttached => ApiError::NotFound(anyhow::anyhow!("Tenant is not attached").into()),
SlotError(e) => e.into(),
SlotUpsertError(e) => e.into(),
Other(o) => ApiError::InternalServerError(o),
e @ InvalidState(_) => ApiError::PreconditionFailed(e.to_string().into_boxed_str()),
}
@@ -369,7 +404,7 @@ async fn timeline_create_handler(
let state = get_state(&request);
async {
let tenant = mgr::get_tenant(tenant_id, true).await?;
let tenant = mgr::get_tenant(tenant_id, true)?;
match tenant.create_timeline(
new_timeline_id,
request_data.ancestor_timeline_id.map(TimelineId::from),
@@ -397,6 +432,9 @@ async fn timeline_create_handler(
Err(e @ tenant::CreateTimelineError::AncestorNotActive) => {
json_response(StatusCode::SERVICE_UNAVAILABLE, HttpErrorBody::from_msg(e.to_string()))
}
Err(tenant::CreateTimelineError::ShuttingDown) => {
json_response(StatusCode::SERVICE_UNAVAILABLE,HttpErrorBody::from_msg("tenant shutting down".to_string()))
}
Err(tenant::CreateTimelineError::Other(err)) => Err(ApiError::InternalServerError(err)),
}
}
@@ -416,7 +454,7 @@ async fn timeline_list_handler(
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let response_data = async {
let tenant = mgr::get_tenant(tenant_id, true).await?;
let tenant = mgr::get_tenant(tenant_id, true)?;
let timelines = tenant.list_timelines();
let mut response_data = Vec::with_capacity(timelines.len());
@@ -455,7 +493,7 @@ async fn timeline_detail_handler(
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let timeline_info = async {
let tenant = mgr::get_tenant(tenant_id, true).await?;
let tenant = mgr::get_tenant(tenant_id, true)?;
let timeline = tenant
.get_timeline(timeline_id, false)
@@ -499,10 +537,8 @@ async fn get_lsn_by_timestamp_handler(
let result = timeline.find_lsn_for_timestamp(timestamp_pg, &ctx).await?;
if version.unwrap_or(0) > 1 {
#[serde_as]
#[derive(serde::Serialize)]
struct Result {
#[serde_as(as = "DisplayFromStr")]
lsn: Lsn,
kind: &'static str,
}
@@ -713,7 +749,7 @@ async fn tenant_status(
check_permission(&request, Some(tenant_id))?;
let tenant_info = async {
let tenant = mgr::get_tenant(tenant_id, false).await?;
let tenant = mgr::get_tenant(tenant_id, false)?;
// Calculate total physical size of all timelines
let mut current_physical_size = 0;
@@ -776,7 +812,7 @@ async fn tenant_size_handler(
let headers = request.headers();
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
let tenant = mgr::get_tenant(tenant_id, true).await?;
let tenant = mgr::get_tenant(tenant_id, true)?;
// this can be long operation
let inputs = tenant
@@ -811,10 +847,8 @@ async fn tenant_size_handler(
}
/// The type resides in the pageserver not to expose `ModelInputs`.
#[serde_with::serde_as]
#[derive(serde::Serialize)]
struct TenantHistorySize {
#[serde_as(as = "serde_with::DisplayFromStr")]
id: TenantId,
/// Size is a mixture of WAL and logical size, so the unit is bytes.
///
@@ -1035,7 +1069,7 @@ async fn get_tenant_config_handler(
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
check_permission(&request, Some(tenant_id))?;
let tenant = mgr::get_tenant(tenant_id, false).await?;
let tenant = mgr::get_tenant(tenant_id, false)?;
let response = HashMap::from([
(
@@ -1094,7 +1128,7 @@ async fn put_tenant_location_config_handler(
.await
{
match e {
TenantStateError::NotFound(_) => {
TenantStateError::SlotError(TenantSlotError::NotFound(_)) => {
// This API is idempotent: a NotFound on a detach is fine.
}
_ => return Err(e.into()),
@@ -1132,7 +1166,6 @@ async fn handle_tenant_break(
let tenant_id: TenantId = parse_request_param(&r, "tenant_id")?;
let tenant = crate::tenant::mgr::get_tenant(tenant_id, true)
.await
.map_err(|_| ApiError::Conflict(String::from("no active tenant found")))?;
tenant.set_broken("broken from test".to_owned()).await;
@@ -1437,7 +1470,7 @@ async fn active_timeline_of_active_tenant(
tenant_id: TenantId,
timeline_id: TimelineId,
) -> Result<Arc<Timeline>, ApiError> {
let tenant = mgr::get_tenant(tenant_id, true).await?;
let tenant = mgr::get_tenant(tenant_id, true)?;
tenant
.get_timeline(timeline_id, true)
.map_err(|e| ApiError::NotFound(e.into()))


@@ -61,14 +61,6 @@ pub async fn shutdown_pageserver(deletion_queue: Option<DeletionQueue>, exit_cod
)
.await;
// Shut down any page service tasks.
timed(
task_mgr::shutdown_tasks(Some(TaskKind::PageRequestHandler), None, None),
"shutdown PageRequestHandlers",
Duration::from_secs(1),
)
.await;
// Shut down all the tenants. This flushes everything to disk and kills
// the checkpoint and GC tasks.
timed(
@@ -78,6 +70,15 @@ pub async fn shutdown_pageserver(deletion_queue: Option<DeletionQueue>, exit_cod
)
.await;
// Shut down any page service tasks: any in-progress work for particular timelines or tenants
// should already have been cancelled via mgr::shutdown_all_tenants
timed(
task_mgr::shutdown_tasks(Some(TaskKind::PageRequestHandler), None, None),
"shutdown PageRequestHandlers",
Duration::from_secs(1),
)
.await;
// Best effort to persist any outstanding deletions, to avoid leaking objects
if let Some(mut deletion_queue) = deletion_queue {
deletion_queue.shutdown(Duration::from_secs(5)).await;


@@ -962,6 +962,32 @@ static REMOTE_TIMELINE_CLIENT_BYTES_FINISHED_COUNTER: Lazy<IntCounterVec> = Lazy
.expect("failed to define a metric")
});
pub(crate) struct TenantManagerMetrics {
pub(crate) tenant_slots: UIntGauge,
pub(crate) tenant_slot_writes: IntCounter,
pub(crate) unexpected_errors: IntCounter,
}
pub(crate) static TENANT_MANAGER: Lazy<TenantManagerMetrics> = Lazy::new(|| {
TenantManagerMetrics {
tenant_slots: register_uint_gauge!(
"pageserver_tenant_manager_slots",
"How many slots currently exist, including all attached, secondary and in-progress operations",
)
.expect("failed to define a metric"),
tenant_slot_writes: register_int_counter!(
"pageserver_tenant_manager_slot_writes",
"Writes to a tenant slot, including all of create/attach/detach/delete"
)
.expect("failed to define a metric"),
unexpected_errors: register_int_counter!(
"pageserver_tenant_manager_unexpected_errors_total",
"Number of unexpected conditions encountered: nonzero value indicates a non-fatal bug."
)
.expect("failed to define a metric"),
}
});
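A hypothetical call-site sketch (not part of the diff) of how the new tenant-manager metrics would be touched when the tenant map changes; `record_slot_update` is an illustrative helper name, only the metric fields come from the registration above.
pub(crate) fn record_slot_update(total_slots: usize) {
    // count every mutation of a tenant slot and refresh the slot gauge
    TENANT_MANAGER.tenant_slot_writes.inc();
    TENANT_MANAGER.tenant_slots.set(total_slots as u64);
}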
pub(crate) struct DeletionQueueMetrics {
pub(crate) keys_submitted: IntCounter,
pub(crate) keys_dropped: IntCounter,
@@ -1884,6 +1910,9 @@ pub fn preinitialize_metrics() {
// Deletion queue stats
Lazy::force(&DELETION_QUEUE);
// Tenant manager stats
Lazy::force(&TENANT_MANAGER);
// countervecs
[&BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT]
.into_iter()


@@ -14,7 +14,6 @@ use async_compression::tokio::write::GzipEncoder;
use bytes::Buf;
use bytes::Bytes;
use futures::Stream;
use pageserver_api::models::TenantState;
use pageserver_api::models::{
PagestreamBeMessage, PagestreamDbSizeRequest, PagestreamDbSizeResponse,
PagestreamErrorResponse, PagestreamExistsRequest, PagestreamExistsResponse,
@@ -55,16 +54,20 @@ use crate::metrics;
use crate::metrics::LIVE_CONNECTIONS_COUNT;
use crate::task_mgr;
use crate::task_mgr::TaskKind;
use crate::tenant;
use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
use crate::tenant::mgr;
use crate::tenant::mgr::GetTenantError;
use crate::tenant::{Tenant, Timeline};
use crate::tenant::mgr::get_active_tenant_with_timeout;
use crate::tenant::mgr::GetActiveTenantError;
use crate::tenant::Timeline;
use crate::trace::Tracer;
use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
use postgres_ffi::BLCKSZ;
// How long we may block waiting for a [`TenantSlot::InProgress`] and/or a [`Tenant`] which
// is not yet in state [`TenantState::Active`].
const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(5000);
/// Read the end of a tar archive.
///
/// A tar archive normally ends with two consecutive blocks of zeros, 512 bytes each.
@@ -223,13 +226,7 @@ async fn page_service_conn_main(
// and create a child per-query context when it invokes process_query.
// But it's in a shared crate, so, we store connection_ctx inside PageServerHandler
// and create the per-query context in process_query ourselves.
let mut conn_handler = PageServerHandler::new(
conf,
broker_client,
auth,
connection_ctx,
task_mgr::shutdown_token(),
);
let mut conn_handler = PageServerHandler::new(conf, broker_client, auth, connection_ctx);
let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?;
match pgbackend
@@ -263,10 +260,6 @@ struct PageServerHandler {
/// For each query received over the connection,
/// `process_query` creates a child context from this one.
connection_ctx: RequestContext,
/// A token that should fire when the tenant transitions from
/// attached state, or when the pageserver is shutting down.
cancel: CancellationToken,
}
impl PageServerHandler {
@@ -275,7 +268,6 @@ impl PageServerHandler {
broker_client: storage_broker::BrokerClientChannel,
auth: Option<Arc<JwtAuth>>,
connection_ctx: RequestContext,
cancel: CancellationToken,
) -> Self {
PageServerHandler {
_conf: conf,
@@ -283,7 +275,6 @@ impl PageServerHandler {
auth,
claims: None,
connection_ctx,
cancel,
}
}
@@ -291,7 +282,11 @@ impl PageServerHandler {
/// this rather than naked flush() in order to shut down promptly. Without this, we would
/// block shutdown of a tenant if a postgres client was failing to consume bytes we send
/// in the flush.
async fn flush_cancellable<IO>(&self, pgb: &mut PostgresBackend<IO>) -> Result<(), QueryError>
async fn flush_cancellable<IO>(
&self,
pgb: &mut PostgresBackend<IO>,
cancel: &CancellationToken,
) -> Result<(), QueryError>
where
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
{
@@ -299,7 +294,7 @@ impl PageServerHandler {
flush_r = pgb.flush() => {
Ok(flush_r?)
},
_ = self.cancel.cancelled() => {
_ = cancel.cancelled() => {
Err(QueryError::Shutdown)
}
)
@@ -308,6 +303,7 @@ impl PageServerHandler {
fn copyin_stream<'a, IO>(
&'a self,
pgb: &'a mut PostgresBackend<IO>,
cancel: &'a CancellationToken,
) -> impl Stream<Item = io::Result<Bytes>> + 'a
where
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
@@ -317,7 +313,7 @@ impl PageServerHandler {
let msg = tokio::select! {
biased;
_ = self.cancel.cancelled() => {
_ = cancel.cancelled() => {
// We were requested to shut down.
let msg = "pageserver is shutting down";
let _ = pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, None));
@@ -357,7 +353,7 @@ impl PageServerHandler {
let query_error = QueryError::Disconnected(ConnectionError::Io(io::Error::new(io::ErrorKind::ConnectionReset, msg)));
// error can't happen here, ErrorResponse serialization should be always ok
pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, Some(query_error.pg_error_code()))).map_err(|e| e.into_io_error())?;
self.flush_cancellable(pgb).await.map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
self.flush_cancellable(pgb, cancel).await.map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?;
}
Err(QueryError::Disconnected(ConnectionError::Io(io_error))) => {
@@ -384,12 +380,13 @@ impl PageServerHandler {
{
debug_assert_current_span_has_tenant_and_timeline_id();
// NOTE: pagerequests handler exits when connection is closed,
// so there is no need to reset the association
task_mgr::associate_with(Some(tenant_id), Some(timeline_id));
// Make request tracer if needed
let tenant = get_active_tenant_with_timeout(tenant_id, &ctx).await?;
let tenant = mgr::get_active_tenant_with_timeout(
tenant_id,
ACTIVE_TENANT_TIMEOUT,
&task_mgr::shutdown_token(),
)
.await?;
let mut tracer = if tenant.get_trace_read_requests() {
let connection_id = ConnectionId::generate();
let path = tenant
@@ -405,9 +402,14 @@ impl PageServerHandler {
.get_timeline(timeline_id, true)
.map_err(|e| anyhow::anyhow!(e))?;
// Avoid starting new requests if the timeline has already started shutting down,
// and block timeline shutdown until this request is complete, or drops out due
// to cancellation.
let _timeline_guard = timeline.gate.enter().map_err(|_| QueryError::Shutdown)?;
// switch client to COPYBOTH
pgb.write_message_noflush(&BeMessage::CopyBothResponse)?;
self.flush_cancellable(pgb).await?;
self.flush_cancellable(pgb, &timeline.cancel).await?;
let metrics = metrics::SmgrQueryTimePerTimeline::new(&tenant_id, &timeline_id);
@@ -415,7 +417,7 @@ impl PageServerHandler {
let msg = tokio::select! {
biased;
_ = self.cancel.cancelled() => {
_ = timeline.cancel.cancelled() => {
// We were requested to shut down.
info!("shutdown request received in page handler");
return Err(QueryError::Shutdown)
@@ -490,9 +492,20 @@ impl PageServerHandler {
}
};
if let Err(e) = &response {
if timeline.cancel.is_cancelled() {
// If we fail to fulfil a request during shutdown, which may be _because_ of
// shutdown, then do not send the error to the client. Instead just drop the
// connection.
span.in_scope(|| info!("dropped response during shutdown: {e:#}"));
return Err(QueryError::Shutdown);
}
}
let response = response.unwrap_or_else(|e| {
// print all the details to the log with {:#}, but for the client the
// error message is enough
// error message is enough. Do not log if shutting down, as the anyhow::Error
// here includes cancellation which is not an error.
span.in_scope(|| error!("error reading relation or page version: {:#}", e));
PagestreamBeMessage::Error(PagestreamErrorResponse {
message: e.to_string(),
@@ -500,7 +513,7 @@ impl PageServerHandler {
});
pgb.write_message_noflush(&BeMessage::CopyData(&response.serialize()))?;
self.flush_cancellable(pgb).await?;
self.flush_cancellable(pgb, &timeline.cancel).await?;
}
Ok(())
}
@@ -522,10 +535,14 @@ impl PageServerHandler {
{
debug_assert_current_span_has_tenant_and_timeline_id();
task_mgr::associate_with(Some(tenant_id), Some(timeline_id));
// Create empty timeline
info!("creating new timeline");
let tenant = get_active_tenant_with_timeout(tenant_id, &ctx).await?;
let tenant = get_active_tenant_with_timeout(
tenant_id,
ACTIVE_TENANT_TIMEOUT,
&task_mgr::shutdown_token(),
)
.await?;
let timeline = tenant
.create_empty_timeline(timeline_id, base_lsn, pg_version, &ctx)
.await?;
@@ -543,9 +560,9 @@ impl PageServerHandler {
// Import basebackup provided via CopyData
info!("importing basebackup");
pgb.write_message_noflush(&BeMessage::CopyInResponse)?;
self.flush_cancellable(pgb).await?;
self.flush_cancellable(pgb, &tenant.cancel).await?;
let mut copyin_reader = pin!(StreamReader::new(self.copyin_stream(pgb)));
let mut copyin_reader = pin!(StreamReader::new(self.copyin_stream(pgb, &tenant.cancel)));
timeline
.import_basebackup_from_tar(
&mut copyin_reader,
@@ -582,9 +599,10 @@ impl PageServerHandler {
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
{
debug_assert_current_span_has_tenant_and_timeline_id();
task_mgr::associate_with(Some(tenant_id), Some(timeline_id));
let timeline = get_active_tenant_timeline(tenant_id, timeline_id, &ctx).await?;
let timeline = self
.get_active_tenant_timeline(tenant_id, timeline_id)
.await?;
let last_record_lsn = timeline.get_last_record_lsn();
if last_record_lsn != start_lsn {
return Err(QueryError::Other(
@@ -598,8 +616,8 @@ impl PageServerHandler {
// Import wal provided via CopyData
info!("importing wal");
pgb.write_message_noflush(&BeMessage::CopyInResponse)?;
self.flush_cancellable(pgb).await?;
let mut copyin_reader = pin!(StreamReader::new(self.copyin_stream(pgb)));
self.flush_cancellable(pgb, &timeline.cancel).await?;
let mut copyin_reader = pin!(StreamReader::new(self.copyin_stream(pgb, &timeline.cancel)));
import_wal_from_tar(&timeline, &mut copyin_reader, start_lsn, end_lsn, &ctx).await?;
info!("wal import complete");
@@ -792,7 +810,9 @@ impl PageServerHandler {
let started = std::time::Instant::now();
// check that the timeline exists
let timeline = get_active_tenant_timeline(tenant_id, timeline_id, &ctx).await?;
let timeline = self
.get_active_tenant_timeline(tenant_id, timeline_id)
.await?;
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
if let Some(lsn) = lsn {
// Backup was requested at a particular LSN. Wait for it to arrive.
@@ -807,7 +827,7 @@ impl PageServerHandler {
// switch client to COPYOUT
pgb.write_message_noflush(&BeMessage::CopyOutResponse)?;
self.flush_cancellable(pgb).await?;
self.flush_cancellable(pgb, &timeline.cancel).await?;
// Send a tarball of the latest layer on the timeline. Compress if not
// fullbackup. TODO Compress in that case too (tests need to be updated)
@@ -859,7 +879,7 @@ impl PageServerHandler {
}
pgb.write_message_noflush(&BeMessage::CopyDone)?;
self.flush_cancellable(pgb).await?;
self.flush_cancellable(pgb, &timeline.cancel).await?;
let basebackup_after = started
.elapsed()
@@ -891,6 +911,25 @@ impl PageServerHandler {
.expect("claims presence already checked");
check_permission(claims, tenant_id)
}
/// Shorthand for getting a reference to a Timeline of an Active tenant.
async fn get_active_tenant_timeline(
&self,
tenant_id: TenantId,
timeline_id: TimelineId,
) -> Result<Arc<Timeline>, GetActiveTimelineError> {
let tenant = get_active_tenant_with_timeout(
tenant_id,
ACTIVE_TENANT_TIMEOUT,
&task_mgr::shutdown_token(),
)
.await
.map_err(GetActiveTimelineError::Tenant)?;
let timeline = tenant
.get_timeline(timeline_id, true)
.map_err(|e| GetActiveTimelineError::Timeline(anyhow::anyhow!(e)))?;
Ok(timeline)
}
}
#[async_trait::async_trait]
@@ -1048,7 +1087,9 @@ where
.record("timeline_id", field::display(timeline_id));
self.check_permission(Some(tenant_id))?;
let timeline = get_active_tenant_timeline(tenant_id, timeline_id, &ctx).await?;
let timeline = self
.get_active_tenant_timeline(tenant_id, timeline_id)
.await?;
let end_of_timeline = timeline.get_last_record_rlsn();
@@ -1232,7 +1273,12 @@ where
self.check_permission(Some(tenant_id))?;
let tenant = get_active_tenant_with_timeout(tenant_id, &ctx).await?;
let tenant = get_active_tenant_with_timeout(
tenant_id,
ACTIVE_TENANT_TIMEOUT,
&task_mgr::shutdown_token(),
)
.await?;
pgb.write_message_noflush(&BeMessage::RowDescription(&[
RowDescriptor::int8_col(b"checkpoint_distance"),
RowDescriptor::int8_col(b"checkpoint_timeout"),
@@ -1278,67 +1324,13 @@ where
}
}
#[derive(thiserror::Error, Debug)]
enum GetActiveTenantError {
#[error(
"Timed out waiting {wait_time:?} for tenant active state. Latest state: {latest_state:?}"
)]
WaitForActiveTimeout {
latest_state: TenantState,
wait_time: Duration,
},
#[error(transparent)]
NotFound(GetTenantError),
#[error(transparent)]
WaitTenantActive(tenant::WaitToBecomeActiveError),
}
impl From<GetActiveTenantError> for QueryError {
fn from(e: GetActiveTenantError) -> Self {
match e {
GetActiveTenantError::WaitForActiveTimeout { .. } => QueryError::Disconnected(
ConnectionError::Io(io::Error::new(io::ErrorKind::TimedOut, e.to_string())),
),
GetActiveTenantError::WaitTenantActive(e) => QueryError::Other(anyhow::Error::new(e)),
GetActiveTenantError::NotFound(e) => QueryError::Other(anyhow::Error::new(e)),
}
}
}
/// Get active tenant.
///
/// If the tenant is Loading, waits for it to become Active, for up to 30 s. That
/// ensures that queries don't fail immediately after pageserver startup, because
/// all tenants are still loading.
async fn get_active_tenant_with_timeout(
tenant_id: TenantId,
_ctx: &RequestContext, /* require get a context to support cancellation in the future */
) -> Result<Arc<Tenant>, GetActiveTenantError> {
let tenant = match mgr::get_tenant(tenant_id, false).await {
Ok(tenant) => tenant,
Err(e @ GetTenantError::NotFound(_)) => return Err(GetActiveTenantError::NotFound(e)),
Err(GetTenantError::NotActive(_)) => {
unreachable!("we're calling get_tenant with active_only=false")
}
Err(GetTenantError::Broken(_)) => {
unreachable!("we're calling get_tenant with active_only=false")
}
};
let wait_time = Duration::from_secs(30);
match tokio::time::timeout(wait_time, tenant.wait_to_become_active()).await {
Ok(Ok(())) => Ok(tenant),
// no .context(), the error message is good enough and some tests depend on it
Ok(Err(e)) => Err(GetActiveTenantError::WaitTenantActive(e)),
Err(_) => {
let latest_state = tenant.current_state();
if latest_state == TenantState::Active {
Ok(tenant)
} else {
Err(GetActiveTenantError::WaitForActiveTimeout {
latest_state,
wait_time,
})
}
e => QueryError::Other(anyhow::anyhow!(e)),
}
}
}
@@ -1359,18 +1351,3 @@ impl From<GetActiveTimelineError> for QueryError {
}
}
}
/// Shorthand for getting a reference to a Timeline of an Active tenant.
async fn get_active_tenant_timeline(
tenant_id: TenantId,
timeline_id: TimelineId,
ctx: &RequestContext,
) -> Result<Arc<Timeline>, GetActiveTimelineError> {
let tenant = get_active_tenant_with_timeout(tenant_id, ctx)
.await
.map_err(GetActiveTimelineError::Tenant)?;
let timeline = tenant
.get_timeline(timeline_id, true)
.map_err(|e| GetActiveTimelineError::Timeline(anyhow::anyhow!(e)))?;
Ok(timeline)
}


@@ -57,6 +57,17 @@ pub enum CalculateLogicalSizeError {
Other(#[from] anyhow::Error),
}
impl From<PageReconstructError> for CalculateLogicalSizeError {
fn from(pre: PageReconstructError) -> Self {
match pre {
PageReconstructError::AncestorStopping(_) | PageReconstructError::Cancelled => {
Self::Cancelled
}
_ => Self::Other(pre.into()),
}
}
}
#[derive(Debug, thiserror::Error)]
pub enum RelationError {
#[error("Relation Already Exists")]
@@ -594,7 +605,7 @@ impl Timeline {
crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id();
// Fetch list of database dirs and iterate them
let buf = self.get(DBDIR_KEY, lsn, ctx).await.context("read dbdir")?;
let buf = self.get(DBDIR_KEY, lsn, ctx).await?;
let dbdir = DbDirectory::des(&buf).context("deserialize db directory")?;
let mut total_size: u64 = 0;
@@ -608,10 +619,7 @@ impl Timeline {
return Err(CalculateLogicalSizeError::Cancelled);
}
let relsize_key = rel_size_to_key(rel);
let mut buf = self
.get(relsize_key, lsn, ctx)
.await
.with_context(|| format!("read relation size of {rel:?}"))?;
let mut buf = self.get(relsize_key, lsn, ctx).await?;
let relsize = buf.get_u32_le();
total_size += relsize as u64;


@@ -299,10 +299,6 @@ pub enum TaskKind {
#[derive(Default)]
struct MutableTaskState {
/// Tenant and timeline that this task is associated with.
tenant_id: Option<TenantId>,
timeline_id: Option<TimelineId>,
/// Handle for waiting for the task to exit. It can be None, if the
/// the task has already exited.
join_handle: Option<JoinHandle<()>>,
@@ -319,6 +315,11 @@ struct PageServerTask {
// To request task shutdown, just cancel this token.
cancel: CancellationToken,
/// Tasks may optionally be launched for a particular tenant/timeline, enabling
/// later cancelling tasks for that tenant/timeline in [`shutdown_tasks`]
tenant_id: Option<TenantId>,
timeline_id: Option<TimelineId>,
mutable: Mutex<MutableTaskState>,
}
@@ -344,11 +345,9 @@ where
kind,
name: name.to_string(),
cancel: cancel.clone(),
mutable: Mutex::new(MutableTaskState {
tenant_id,
timeline_id,
join_handle: None,
}),
tenant_id,
timeline_id,
mutable: Mutex::new(MutableTaskState { join_handle: None }),
});
TASKS.lock().unwrap().insert(task_id, Arc::clone(&task));
@@ -418,8 +417,6 @@ async fn task_finish(
let mut shutdown_process = false;
{
let task_mut = task.mutable.lock().unwrap();
match result {
Ok(Ok(())) => {
debug!("Task '{}' exited normally", task_name);
@@ -428,13 +425,13 @@ async fn task_finish(
if shutdown_process_on_error {
error!(
"Shutting down: task '{}' tenant_id: {:?}, timeline_id: {:?} exited with error: {:?}",
task_name, task_mut.tenant_id, task_mut.timeline_id, err
task_name, task.tenant_id, task.timeline_id, err
);
shutdown_process = true;
} else {
error!(
"Task '{}' tenant_id: {:?}, timeline_id: {:?} exited with error: {:?}",
task_name, task_mut.tenant_id, task_mut.timeline_id, err
task_name, task.tenant_id, task.timeline_id, err
);
}
}
@@ -442,13 +439,13 @@ async fn task_finish(
if shutdown_process_on_error {
error!(
"Shutting down: task '{}' tenant_id: {:?}, timeline_id: {:?} panicked: {:?}",
task_name, task_mut.tenant_id, task_mut.timeline_id, err
task_name, task.tenant_id, task.timeline_id, err
);
shutdown_process = true;
} else {
error!(
"Task '{}' tenant_id: {:?}, timeline_id: {:?} panicked: {:?}",
task_name, task_mut.tenant_id, task_mut.timeline_id, err
task_name, task.tenant_id, task.timeline_id, err
);
}
}
@@ -460,17 +457,6 @@ async fn task_finish(
}
}
// expected to be called from the task of the given id.
pub fn associate_with(tenant_id: Option<TenantId>, timeline_id: Option<TimelineId>) {
CURRENT_TASK.with(|ct| {
let mut task_mut = ct.mutable.lock().unwrap();
task_mut.tenant_id = tenant_id;
task_mut.timeline_id = timeline_id;
});
}
/// Is there a task running that matches the criteria
/// Signal and wait for tasks to shut down.
///
///
@@ -493,17 +479,16 @@ pub async fn shutdown_tasks(
{
let tasks = TASKS.lock().unwrap();
for task in tasks.values() {
let task_mut = task.mutable.lock().unwrap();
if (kind.is_none() || Some(task.kind) == kind)
&& (tenant_id.is_none() || task_mut.tenant_id == tenant_id)
&& (timeline_id.is_none() || task_mut.timeline_id == timeline_id)
&& (tenant_id.is_none() || task.tenant_id == tenant_id)
&& (timeline_id.is_none() || task.timeline_id == timeline_id)
{
task.cancel.cancel();
victim_tasks.push((
Arc::clone(task),
task.kind,
task_mut.tenant_id,
task_mut.timeline_id,
task.tenant_id,
task.timeline_id,
));
}
}


@@ -26,6 +26,7 @@ use tracing::*;
use utils::completion;
use utils::crashsafe::path_with_suffix_extension;
use utils::fs_ext;
use utils::sync::gate::Gate;
use std::cmp::min;
use std::collections::hash_map::Entry;
@@ -54,6 +55,8 @@ use self::config::TenantConf;
use self::delete::DeleteTenantFlow;
use self::metadata::LoadMetadataError;
use self::metadata::TimelineMetadata;
use self::mgr::GetActiveTenantError;
use self::mgr::GetTenantError;
use self::mgr::TenantsMap;
use self::remote_timeline_client::RemoteTimelineClient;
use self::timeline::uninit::TimelineUninitMark;
@@ -252,6 +255,20 @@ pub struct Tenant {
eviction_task_tenant_state: tokio::sync::Mutex<EvictionTaskTenantState>,
pub(crate) delete_progress: Arc<tokio::sync::Mutex<DeleteTenantFlow>>,
// Cancellation token fires when we have entered shutdown(). This is a parent of
// Timelines' cancellation token.
pub(crate) cancel: CancellationToken,
// Users of the Tenant such as the page service must take this Gate to avoid
// trying to use a Tenant which is shutting down.
pub(crate) gate: Gate,
}
impl std::fmt::Debug for Tenant {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{} ({})", self.tenant_id, self.current_state())
}
}
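An illustrative sketch (not part of the diff) of the pattern the new `cancel` and `gate` fields enable, based on the `enter()`/`close()` calls elsewhere in this change; `serve_request` is a hypothetical caller.
async fn serve_request(tenant: &Tenant) -> Result<(), CreateTimelineError> {
    // Refuse new work once shutdown() has started closing the gate.
    let _guard = tenant
        .gate
        .enter()
        .map_err(|_| CreateTimelineError::ShuttingDown)?;
    // ... per-request work; Tenant::shutdown() blocks in gate.close() until
    // this guard is dropped ...
    Ok(())
}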
pub(crate) enum WalRedoManager {
@@ -359,34 +376,6 @@ impl Debug for SetStoppingError {
}
}
#[derive(Debug, thiserror::Error)]
pub(crate) enum WaitToBecomeActiveError {
WillNotBecomeActive {
tenant_id: TenantId,
state: TenantState,
},
TenantDropped {
tenant_id: TenantId,
},
}
impl std::fmt::Display for WaitToBecomeActiveError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
WaitToBecomeActiveError::WillNotBecomeActive { tenant_id, state } => {
write!(
f,
"Tenant {} will not become active. Current state: {:?}",
tenant_id, state
)
}
WaitToBecomeActiveError::TenantDropped { tenant_id } => {
write!(f, "Tenant {tenant_id} will not become active (dropped)")
}
}
}
}
#[derive(thiserror::Error, Debug)]
pub enum CreateTimelineError {
#[error("a timeline with the given ID already exists")]
@@ -395,6 +384,8 @@ pub enum CreateTimelineError {
AncestorLsn(anyhow::Error),
#[error("ancestor timeline is not active")]
AncestorNotActive,
#[error("tenant shutting down")]
ShuttingDown,
#[error(transparent)]
Other(#[from] anyhow::Error),
}
@@ -526,7 +517,7 @@ impl Tenant {
resources: TenantSharedResources,
attached_conf: AttachedTenantConf,
init_order: Option<InitializationOrder>,
tenants: &'static tokio::sync::RwLock<TenantsMap>,
tenants: &'static std::sync::RwLock<TenantsMap>,
mode: SpawnMode,
ctx: &RequestContext,
) -> anyhow::Result<Arc<Tenant>> {
@@ -1524,6 +1515,11 @@ impl Tenant {
)));
}
let _gate = self
.gate
.enter()
.map_err(|_| CreateTimelineError::ShuttingDown)?;
if let Ok(existing) = self.get_timeline(new_timeline_id, false) {
debug!("timeline {new_timeline_id} already exists");
@@ -1808,6 +1804,7 @@ impl Tenant {
freeze_and_flush: bool,
) -> Result<(), completion::Barrier> {
span::debug_assert_current_span_has_tenant_id();
// Set tenant (and its timelines) to Stopping state.
//
// Since we can only transition into Stopping state after activation is complete,
@@ -1833,6 +1830,7 @@ impl Tenant {
}
Err(SetStoppingError::AlreadyStopping(other)) => {
// give the caller the option to wait for this shutdown
info!("Tenant::shutdown: AlreadyStopping");
return Err(other);
}
};
@@ -1846,6 +1844,7 @@ impl Tenant {
js.spawn(async move { timeline.shutdown(freeze_and_flush).instrument(span).await });
})
};
tracing::info!("Waiting for timelines...");
while let Some(res) = js.join_next().await {
match res {
Ok(()) => {}
@@ -1855,12 +1854,21 @@ impl Tenant {
}
}
// We cancel the Tenant's cancellation token _after_ the timelines have all shut down. This permits
// them to continue to do work during their shutdown methods, e.g. flushing data.
tracing::debug!("Cancelling CancellationToken");
self.cancel.cancel();
// shutdown all tenant and timeline tasks: gc, compaction, page service
// No new tasks will be started for this tenant because it's in `Stopping` state.
//
// this will additionally shutdown and await all timeline tasks.
tracing::debug!("Waiting for tasks...");
task_mgr::shutdown_tasks(None, Some(self.tenant_id), None).await;
// Wait for any in-flight operations to complete
self.gate.close().await;
Ok(())
}
@@ -2021,7 +2029,7 @@ impl Tenant {
self.state.subscribe()
}
pub(crate) async fn wait_to_become_active(&self) -> Result<(), WaitToBecomeActiveError> {
pub(crate) async fn wait_to_become_active(&self) -> Result<(), GetActiveTenantError> {
let mut receiver = self.state.subscribe();
loop {
let current_state = receiver.borrow_and_update().clone();
@@ -2029,11 +2037,9 @@ impl Tenant {
TenantState::Loading | TenantState::Attaching | TenantState::Activating(_) => {
// in these states, there's a chance that we can reach ::Active
receiver.changed().await.map_err(
|_e: tokio::sync::watch::error::RecvError| {
WaitToBecomeActiveError::TenantDropped {
tenant_id: self.tenant_id,
}
},
|_e: tokio::sync::watch::error::RecvError|
// Tenant existed but was dropped: report it as non-existent
GetActiveTenantError::NotFound(GetTenantError::NotFound(self.tenant_id))
)?;
}
TenantState::Active { .. } => {
@@ -2041,10 +2047,7 @@ impl Tenant {
}
TenantState::Broken { .. } | TenantState::Stopping { .. } => {
// There's no chance the tenant can transition back into ::Active
return Err(WaitToBecomeActiveError::WillNotBecomeActive {
tenant_id: self.tenant_id,
state: current_state,
});
return Err(GetActiveTenantError::WillNotBecomeActive(current_state));
}
}
}
@@ -2110,6 +2113,9 @@ where
}
impl Tenant {
pub fn get_tenant_id(&self) -> TenantId {
self.tenant_id
}
pub fn tenant_specific_overrides(&self) -> TenantConfOpt {
self.tenant_conf.read().unwrap().tenant_conf
}
@@ -2267,6 +2273,7 @@ impl Tenant {
initial_logical_size_can_start.cloned(),
initial_logical_size_attempt.cloned().flatten(),
state,
self.cancel.child_token(),
);
Ok(timeline)
@@ -2356,6 +2363,8 @@ impl Tenant {
cached_synthetic_tenant_size: Arc::new(AtomicU64::new(0)),
eviction_task_tenant_state: tokio::sync::Mutex::new(EvictionTaskTenantState::default()),
delete_progress: Arc::new(tokio::sync::Mutex::new(DeleteTenantFlow::default())),
cancel: CancellationToken::default(),
gate: Gate::new(format!("Tenant<{tenant_id}>")),
}
}
@@ -3692,7 +3701,7 @@ mod tests {
use tokio_util::sync::CancellationToken;
static TEST_KEY: Lazy<Key> =
Lazy::new(|| Key::from_slice(&hex!("112222222233333333444444445500000001")));
Lazy::new(|| Key::from_slice(&hex!("010000000033333333444444445500000001")));
#[tokio::test]
async fn test_basic() -> anyhow::Result<()> {
@@ -3788,9 +3797,9 @@ mod tests {
let writer = tline.writer().await;
#[allow(non_snake_case)]
let TEST_KEY_A: Key = Key::from_hex("112222222233333333444444445500000001").unwrap();
let TEST_KEY_A: Key = Key::from_hex("110000000033333333444444445500000001").unwrap();
#[allow(non_snake_case)]
let TEST_KEY_B: Key = Key::from_hex("112222222233333333444444445500000002").unwrap();
let TEST_KEY_B: Key = Key::from_hex("110000000033333333444444445500000002").unwrap();
// Insert a value on the timeline
writer
@@ -4236,11 +4245,7 @@ mod tests {
metadata_bytes[8] ^= 1;
std::fs::write(metadata_path, metadata_bytes)?;
let err = harness
.try_load_local(&ctx)
.await
.err()
.expect("should fail");
let err = harness.try_load_local(&ctx).await.expect_err("should fail");
// get all the stack with all .context, not only the last one
let message = format!("{err:#}");
let expected = "failed to load metadata";
@@ -4374,7 +4379,7 @@ mod tests {
let mut keyspace = KeySpaceAccum::new();
let mut test_key = Key::from_hex("012222222233333333444444445500000000").unwrap();
let mut test_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
let mut blknum = 0;
for _ in 0..50 {
for _ in 0..10000 {
@@ -4420,7 +4425,7 @@ mod tests {
const NUM_KEYS: usize = 1000;
let mut test_key = Key::from_hex("012222222233333333444444445500000000").unwrap();
let mut test_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
let mut keyspace = KeySpaceAccum::new();
@@ -4501,7 +4506,7 @@ mod tests {
const NUM_KEYS: usize = 1000;
let mut test_key = Key::from_hex("012222222233333333444444445500000000").unwrap();
let mut test_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
let mut keyspace = KeySpaceAccum::new();
@@ -4592,7 +4597,7 @@ mod tests {
const NUM_KEYS: usize = 100;
const NUM_TLINES: usize = 50;
let mut test_key = Key::from_hex("012222222233333333444444445500000000").unwrap();
let mut test_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
// Track page mutation lsns across different timelines.
let mut updated = [[Lsn(0); NUM_KEYS]; NUM_TLINES];


@@ -21,7 +21,7 @@ use crate::{
};
use super::{
mgr::{GetTenantError, TenantsMap},
mgr::{GetTenantError, TenantSlotError, TenantSlotUpsertError, TenantsMap},
remote_timeline_client::{FAILED_REMOTE_OP_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD},
span,
timeline::delete::DeleteTimelineFlow,
@@ -33,12 +33,21 @@ pub(crate) enum DeleteTenantError {
#[error("GetTenant {0}")]
Get(#[from] GetTenantError),
#[error("Tenant not attached")]
NotAttached,
#[error("Invalid state {0}. Expected Active or Broken")]
InvalidState(TenantState),
#[error("Tenant deletion is already in progress")]
AlreadyInProgress,
#[error("Tenant map slot error {0}")]
SlotError(#[from] TenantSlotError),
#[error("Tenant map slot upsert error {0}")]
SlotUpsertError(#[from] TenantSlotUpsertError),
#[error("Timeline {0}")]
Timeline(#[from] DeleteTimelineError),
@@ -273,12 +282,12 @@ impl DeleteTenantFlow {
pub(crate) async fn run(
conf: &'static PageServerConf,
remote_storage: Option<GenericRemoteStorage>,
tenants: &'static tokio::sync::RwLock<TenantsMap>,
tenant_id: TenantId,
tenants: &'static std::sync::RwLock<TenantsMap>,
tenant: Arc<Tenant>,
) -> Result<(), DeleteTenantError> {
span::debug_assert_current_span_has_tenant_id();
let (tenant, mut guard) = Self::prepare(tenants, tenant_id).await?;
let mut guard = Self::prepare(&tenant).await?;
if let Err(e) = Self::run_inner(&mut guard, conf, remote_storage.as_ref(), &tenant).await {
tenant.set_broken(format!("{e:#}")).await;
@@ -378,7 +387,7 @@ impl DeleteTenantFlow {
guard: DeletionGuard,
tenant: &Arc<Tenant>,
preload: Option<TenantPreload>,
tenants: &'static tokio::sync::RwLock<TenantsMap>,
tenants: &'static std::sync::RwLock<TenantsMap>,
init_order: Option<InitializationOrder>,
ctx: &RequestContext,
) -> Result<(), DeleteTenantError> {
@@ -405,15 +414,8 @@ impl DeleteTenantFlow {
}
async fn prepare(
tenants: &tokio::sync::RwLock<TenantsMap>,
tenant_id: TenantId,
) -> Result<(Arc<Tenant>, tokio::sync::OwnedMutexGuard<Self>), DeleteTenantError> {
let m = tenants.read().await;
let tenant = m
.get(&tenant_id)
.ok_or(GetTenantError::NotFound(tenant_id))?;
tenant: &Arc<Tenant>,
) -> Result<tokio::sync::OwnedMutexGuard<Self>, DeleteTenantError> {
// FIXME: unsure about active only. Our init jobs may not be cancellable properly,
// so at least for now allow deletions only for active tenants. TODO: recheck.
// Broken and Stopping are needed for retries.
@@ -447,14 +449,14 @@ impl DeleteTenantFlow {
)));
}
Ok((Arc::clone(tenant), guard))
Ok(guard)
}
fn schedule_background(
guard: OwnedMutexGuard<Self>,
conf: &'static PageServerConf,
remote_storage: Option<GenericRemoteStorage>,
tenants: &'static tokio::sync::RwLock<TenantsMap>,
tenants: &'static std::sync::RwLock<TenantsMap>,
tenant: Arc<Tenant>,
) {
let tenant_id = tenant.tenant_id;
@@ -487,7 +489,7 @@ impl DeleteTenantFlow {
mut guard: OwnedMutexGuard<Self>,
conf: &PageServerConf,
remote_storage: Option<GenericRemoteStorage>,
tenants: &'static tokio::sync::RwLock<TenantsMap>,
tenants: &'static std::sync::RwLock<TenantsMap>,
tenant: &Arc<Tenant>,
) -> Result<(), DeleteTenantError> {
// Tree sort timelines, schedule delete for them. Mention retries from the console side.
@@ -535,10 +537,18 @@ impl DeleteTenantFlow {
.await
.context("cleanup_remaining_fs_traces")?;
let mut locked = tenants.write().await;
if locked.remove(&tenant.tenant_id).is_none() {
warn!("Tenant got removed from tenants map during deletion");
};
{
let mut locked = tenants.write().unwrap();
if locked.remove(&tenant.tenant_id).is_none() {
warn!("Tenant got removed from tenants map during deletion");
};
// FIXME: we should not be modifying this from outside of mgr.rs.
// This will go away when we simplify deletion (https://github.com/neondatabase/neon/issues/5080)
crate::metrics::TENANT_MANAGER
.tenant_slots
.set(locked.len() as u64);
}
*guard = Self::Finished;

File diff suppressed because it is too large

View File

@@ -406,4 +406,123 @@ mod tests {
METADATA_OLD_FORMAT_VERSION, METADATA_FORMAT_VERSION
);
}
#[test]
fn test_metadata_bincode_serde() {
let original_metadata = TimelineMetadata::new(
Lsn(0x200),
Some(Lsn(0x100)),
Some(TIMELINE_ID),
Lsn(0),
Lsn(0),
Lsn(0),
// Any version will do here, so use the default
crate::DEFAULT_PG_VERSION,
);
let metadata_bytes = original_metadata
.to_bytes()
.expect("Cannot create bytes array from metadata");
let metadata_bincode_be_bytes = original_metadata
.ser()
.expect("Cannot serialize the metadata");
// 8 bytes for the length of the vector
assert_eq!(metadata_bincode_be_bytes.len(), 8 + metadata_bytes.len());
let expected_bincode_bytes = {
let mut temp = vec![];
let len_bytes = metadata_bytes.len().to_be_bytes();
temp.extend_from_slice(&len_bytes);
temp.extend_from_slice(&metadata_bytes);
temp
};
assert_eq!(metadata_bincode_be_bytes, expected_bincode_bytes);
let deserialized_metadata = TimelineMetadata::des(&metadata_bincode_be_bytes).unwrap();
// Deserialized metadata has the metadata header, which is different from the serialized one.
// Reference: TimelineMetadata::to_bytes()
let expected_metadata = {
let mut temp_metadata = original_metadata;
let body_bytes = temp_metadata
.body
.ser()
.expect("Cannot serialize the metadata body");
let metadata_size = METADATA_HDR_SIZE + body_bytes.len();
let hdr = TimelineMetadataHeader {
size: metadata_size as u16,
format_version: METADATA_FORMAT_VERSION,
checksum: crc32c::crc32c(&body_bytes),
};
temp_metadata.hdr = hdr;
temp_metadata
};
assert_eq!(deserialized_metadata, expected_metadata);
}
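The framing asserted above can be reproduced in isolation. A minimal sketch, assuming only that the serializer emits a fixed 8-byte big-endian length followed by the raw payload (the helper name is hypothetical and not part of this patch):

    fn frame_be(payload: &[u8]) -> Vec<u8> {
        // 8-byte big-endian length prefix, matching `expected_bincode_bytes` above.
        let mut out = Vec::with_capacity(8 + payload.len());
        out.extend_from_slice(&(payload.len() as u64).to_be_bytes());
        out.extend_from_slice(payload);
        out
    }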
#[test]
fn test_metadata_bincode_serde_ensure_roundtrip() {
let original_metadata = TimelineMetadata::new(
Lsn(0x200),
Some(Lsn(0x100)),
Some(TIMELINE_ID),
Lsn(0),
Lsn(0),
Lsn(0),
// Any version will do here, so use the default
crate::DEFAULT_PG_VERSION,
);
let expected_bytes = vec![
/* bincode length encoding bytes */
0, 0, 0, 0, 0, 0, 2, 0, // 8 bytes for the length of the serialized vector
/* TimelineMetadataHeader */
4, 37, 101, 34, 0, 70, 0, 4, // checksum, size, format_version (4 + 2 + 2)
/* TimelineMetadataBodyV2 */
0, 0, 0, 0, 0, 0, 2, 0, // disk_consistent_lsn (8 bytes)
1, 0, 0, 0, 0, 0, 0, 1, 0, // prev_record_lsn (9 bytes)
1, 17, 34, 51, 68, 85, 102, 119, 136, 17, 34, 51, 68, 85, 102, 119,
136, // ancestor_timeline (17 bytes)
0, 0, 0, 0, 0, 0, 0, 0, // ancestor_lsn (8 bytes)
0, 0, 0, 0, 0, 0, 0, 0, // latest_gc_cutoff_lsn (8 bytes)
0, 0, 0, 0, 0, 0, 0, 0, // initdb_lsn (8 bytes)
0, 0, 0, 15, // pg_version (4 bytes)
/* padding bytes */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
];
let metadata_ser_bytes = original_metadata.ser().unwrap();
assert_eq!(metadata_ser_bytes, expected_bytes);
let expected_metadata = {
let mut temp_metadata = original_metadata;
let body_bytes = temp_metadata
.body
.ser()
.expect("Cannot serialize the metadata body");
let metadata_size = METADATA_HDR_SIZE + body_bytes.len();
let hdr = TimelineMetadataHeader {
size: metadata_size as u16,
format_version: METADATA_FORMAT_VERSION,
checksum: crc32c::crc32c(&body_bytes),
};
temp_metadata.hdr = hdr;
temp_metadata
};
let des_metadata = TimelineMetadata::des(&metadata_ser_bytes).unwrap();
assert_eq!(des_metadata, expected_metadata);
}
}

File diff suppressed because it is too large

View File

@@ -1542,7 +1542,7 @@ pub fn remote_index_path(
}
/// Given the key of an index, parse out the generation part of the name
pub(crate) fn parse_remote_index_path(path: RemotePath) -> Option<Generation> {
pub fn parse_remote_index_path(path: RemotePath) -> Option<Generation> {
let file_name = match path.get_path().file_name() {
Some(f) => f,
None => {

View File

@@ -6,7 +6,6 @@ use std::collections::HashMap;
use chrono::NaiveDateTime;
use serde::{Deserialize, Serialize};
use serde_with::{serde_as, DisplayFromStr};
use utils::bin_ser::SerializeError;
use crate::tenant::metadata::TimelineMetadata;
@@ -58,7 +57,6 @@ impl LayerFileMetadata {
///
/// This type needs to be backwards and forwards compatible. When changing the fields,
/// remember to add a test case for the changed version.
#[serde_as]
#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)]
pub struct IndexPart {
/// Debugging aid describing the version of this type.
@@ -78,7 +76,6 @@ pub struct IndexPart {
// 'disk_consistent_lsn' is a copy of the 'disk_consistent_lsn' in the metadata.
// It's duplicated for convenience when reading the serialized structure, but is
// private because internally we would read from metadata instead.
#[serde_as(as = "DisplayFromStr")]
disk_consistent_lsn: Lsn,
#[serde(rename = "metadata_bytes")]
@@ -155,7 +152,7 @@ pub struct IndexLayerMetadata {
#[serde(default = "Generation::none")]
#[serde(skip_serializing_if = "Generation::is_none")]
pub(super) generation: Generation,
pub generation: Generation,
}
impl From<LayerFileMetadata> for IndexLayerMetadata {

View File

@@ -29,7 +29,6 @@ use tenant_size_model::{Segment, StorageModel};
/// needs. We will convert this into a StorageModel when it's time to perform
/// the calculation.
///
#[serde_with::serde_as]
#[derive(Debug, serde::Serialize, serde::Deserialize)]
pub struct ModelInputs {
pub segments: Vec<SegmentMeta>,
@@ -37,11 +36,9 @@ pub struct ModelInputs {
}
/// A [`Segment`], with some extra information for display purposes
#[serde_with::serde_as]
#[derive(Debug, serde::Serialize, serde::Deserialize)]
pub struct SegmentMeta {
pub segment: Segment,
#[serde_as(as = "serde_with::DisplayFromStr")]
pub timeline_id: TimelineId,
pub kind: LsnKind,
}
@@ -77,32 +74,22 @@ pub enum LsnKind {
/// Collect all relevant LSNs to the inputs. These will only be helpful in the serialized form as
/// part of [`ModelInputs`] from the HTTP api, explaining the inputs.
#[serde_with::serde_as]
#[derive(Debug, serde::Serialize, serde::Deserialize)]
pub struct TimelineInputs {
#[serde_as(as = "serde_with::DisplayFromStr")]
pub timeline_id: TimelineId,
#[serde_as(as = "Option<serde_with::DisplayFromStr>")]
pub ancestor_id: Option<TimelineId>,
#[serde_as(as = "serde_with::DisplayFromStr")]
ancestor_lsn: Lsn,
#[serde_as(as = "serde_with::DisplayFromStr")]
last_record: Lsn,
#[serde_as(as = "serde_with::DisplayFromStr")]
latest_gc_cutoff: Lsn,
#[serde_as(as = "serde_with::DisplayFromStr")]
horizon_cutoff: Lsn,
#[serde_as(as = "serde_with::DisplayFromStr")]
pitr_cutoff: Lsn,
/// Cutoff point based on GC settings
#[serde_as(as = "serde_with::DisplayFromStr")]
next_gc_cutoff: Lsn,
/// Cutoff point calculated from the user-supplied 'max_retention_period'
#[serde_as(as = "Option<serde_with::DisplayFromStr>")]
retention_param_cutoff: Option<Lsn>,
}
@@ -419,10 +406,12 @@ async fn fill_logical_sizes(
have_any_error = true;
}
Ok(Ok(TimelineAtLsnSizeResult(timeline, lsn, Err(error)))) => {
warn!(
timeline_id=%timeline.timeline_id,
"failed to calculate logical size at {lsn}: {error:#}"
);
if !matches!(error, CalculateLogicalSizeError::Cancelled) {
warn!(
timeline_id=%timeline.timeline_id,
"failed to calculate logical size at {lsn}: {error:#}"
);
}
have_any_error = true;
}
Ok(Ok(TimelineAtLsnSizeResult(timeline, lsn, Ok(size)))) => {

View File

@@ -345,14 +345,19 @@ impl InMemoryLayer {
let cursor = inner.file.block_cursor();
let mut keys: Vec<(&Key, &VecMap<Lsn, u64>)> = inner.index.iter().collect();
keys.sort_by_key(|k| k.0);
// Sort the keys because delta layer writer expects them sorted.
//
// NOTE: this sort can take up significant time if the layer has millions of
// keys. To speed up all the comparisons we convert the key to i128 and
// keep the value as a reference.
let mut keys: Vec<_> = inner.index.iter().map(|(k, m)| (k.to_i128(), m)).collect();
keys.sort_unstable_by_key(|k| k.0);
let ctx = RequestContextBuilder::extend(ctx)
.page_content_kind(PageContentKind::InMemoryLayer)
.build();
for (key, vec_map) in keys.iter() {
let key = **key;
let key = Key::from_i128(*key);
// Write all page versions
for (lsn, pos) in vec_map.as_slice() {
cursor.read_blob_into_buf(*pos, &mut buf, &ctx).await?;

View File

@@ -125,6 +125,7 @@ impl Layer {
let inner = Arc::new(DownloadedLayer {
owner: owner.clone(),
kind: tokio::sync::OnceCell::default(),
version: 0,
});
resident = Some(inner.clone());
@@ -163,6 +164,7 @@ impl Layer {
let inner = Arc::new(DownloadedLayer {
owner: owner.clone(),
kind: tokio::sync::OnceCell::default(),
version: 0,
});
resident = Some(inner.clone());
let access_stats = LayerAccessStats::empty_will_record_residence_event_later();
@@ -328,42 +330,46 @@ impl Layer {
/// read with [`Layer::get_value_reconstruct_data`].
///
/// [`LayerMap::search`]: crate::tenant::layer_map::LayerMap::search
#[derive(Debug)]
enum ResidentOrWantedEvicted {
Resident(Arc<DownloadedLayer>),
WantedEvicted(Weak<DownloadedLayer>),
WantedEvicted(Weak<DownloadedLayer>, usize),
}
impl ResidentOrWantedEvicted {
fn get(&self) -> Option<Arc<DownloadedLayer>> {
fn get_and_upgrade(&mut self) -> Option<(Arc<DownloadedLayer>, bool)> {
match self {
ResidentOrWantedEvicted::Resident(strong) => Some(strong.clone()),
ResidentOrWantedEvicted::WantedEvicted(weak) => match weak.upgrade() {
ResidentOrWantedEvicted::Resident(strong) => Some((strong.clone(), false)),
ResidentOrWantedEvicted::WantedEvicted(weak, _) => match weak.upgrade() {
Some(strong) => {
LAYER_IMPL_METRICS.inc_raced_wanted_evicted_accesses();
Some(strong)
*self = ResidentOrWantedEvicted::Resident(strong.clone());
Some((strong, true))
}
None => None,
},
}
}
/// When eviction is first requested, drop down to holding a [`Weak`].
///
/// Returns `true` if this was the first time eviction was requested.
fn downgrade(&mut self) -> &Weak<DownloadedLayer> {
let _was_first = match self {
/// Returns `Some` if this was the first time eviction was requested. Care should be taken to
/// drop the possibly last strong reference outside of the mutex of
/// heavier_once_cell::OnceCell.
fn downgrade(&mut self) -> Option<Arc<DownloadedLayer>> {
match self {
ResidentOrWantedEvicted::Resident(strong) => {
let weak = Arc::downgrade(strong);
*self = ResidentOrWantedEvicted::WantedEvicted(weak);
// returning the weak is not useful, because the drop could have already run with
// the replacement above, and that will take care of cleaning the Option we are in
true
let mut temp = ResidentOrWantedEvicted::WantedEvicted(weak, strong.version);
std::mem::swap(self, &mut temp);
match temp {
ResidentOrWantedEvicted::Resident(strong) => Some(strong),
ResidentOrWantedEvicted::WantedEvicted(..) => unreachable!("just swapped"),
}
}
ResidentOrWantedEvicted::WantedEvicted(_) => false,
};
match self {
ResidentOrWantedEvicted::WantedEvicted(ref weak) => weak,
_ => unreachable!("just wrote wanted evicted"),
ResidentOrWantedEvicted::WantedEvicted(..) => None,
}
}
}
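A minimal, independent sketch of the strong/weak pattern this enum implements, generic over `T` and without the version counter, purely to illustrate the upgrade path:

    use std::sync::{Arc, Weak};

    enum Slot<T> {
        Resident(Arc<T>),
        WantedEvicted(Weak<T>),
    }

    impl<T> Slot<T> {
        /// Return a strong handle if the value is still alive, re-promoting the
        /// slot to `Resident` when an access races with a pending eviction.
        fn get_and_upgrade(&mut self) -> Option<Arc<T>> {
            let strong = match self {
                Slot::Resident(strong) => return Some(strong.clone()),
                Slot::WantedEvicted(weak) => weak.upgrade()?,
            };
            *self = Slot::Resident(strong.clone());
            Some(strong)
        }
    }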
@@ -398,11 +404,17 @@ struct LayerInner {
/// [`LayerInner::on_downloaded_layer_drop`].
wanted_evicted: AtomicBool,
/// Version is to make sure we will in fact only evict a file if no new download has been
/// started.
/// Version is to make sure we will only evict a specific download of a file.
///
/// Incremented for each download, stored in `DownloadedLayer::version` or
/// `ResidentOrWantedEvicted::WantedEvicted`.
version: AtomicUsize,
/// Allow subscribing to when the layer actually gets evicted.
///
/// If in future we need to implement "wait until layer instances are gone and done", carrying
/// this over to the gc spawn_blocking from LayerInner::drop will do the trick, and adding a
/// method for "wait_gc" which will wait to this being closed.
status: tokio::sync::broadcast::Sender<Status>,
/// Counter for exponential backoff with the download
@@ -515,6 +527,14 @@ impl LayerInner {
.timeline_path(&timeline.tenant_id, &timeline.timeline_id)
.join(desc.filename().to_string());
let (inner, version) = if let Some(inner) = downloaded {
let version = inner.version;
let resident = ResidentOrWantedEvicted::Resident(inner);
(heavier_once_cell::OnceCell::new(resident), version)
} else {
(heavier_once_cell::OnceCell::default(), 0)
};
LayerInner {
conf,
path,
@@ -524,12 +544,8 @@ impl LayerInner {
access_stats,
wanted_garbage_collected: AtomicBool::new(false),
wanted_evicted: AtomicBool::new(false),
inner: if let Some(inner) = downloaded {
heavier_once_cell::OnceCell::new(ResidentOrWantedEvicted::Resident(inner))
} else {
heavier_once_cell::OnceCell::default()
},
version: AtomicUsize::new(0),
inner,
version: AtomicUsize::new(version),
status: tokio::sync::broadcast::channel(1).0,
consecutive_failures: AtomicUsize::new(0),
generation,
@@ -549,6 +565,8 @@ impl LayerInner {
}
}
/// Cancellation safe; however, dropping the future and calling this method again might result
/// in a new attempt to evict OR join the previously started attempt.
pub(crate) async fn evict_and_wait(
&self,
_: &RemoteTimelineClient,
@@ -559,20 +577,22 @@ impl LayerInner {
let mut rx = self.status.subscribe();
let res =
self.wanted_evicted
.compare_exchange(false, true, Ordering::Release, Ordering::Relaxed);
let strong = {
match self.inner.get() {
Some(mut either) => {
self.wanted_evicted.store(true, Ordering::Relaxed);
either.downgrade()
}
None => return Err(EvictionError::NotFound),
}
};
if res.is_ok() {
if strong.is_some() {
// drop the DownloadedLayer outside of the holding the guard
drop(strong);
LAYER_IMPL_METRICS.inc_started_evictions();
}
if self.get().is_none() {
// it was not evictable in the first place
// our store to the wanted_evicted does not matter; it will be reset by next download
return Err(EvictionError::NotFound);
}
match rx.recv().await {
Ok(Status::Evicted) => Ok(()),
Ok(Status::Downloaded) => Err(EvictionError::Downloaded),
@@ -586,7 +606,8 @@ impl LayerInner {
//
// use however late (compared to the initial expressing of wanted) as the
// "outcome" now
match self.get() {
LAYER_IMPL_METRICS.inc_broadcast_lagged();
match self.inner.get() {
Some(_) => Err(EvictionError::Downloaded),
None => Ok(()),
}
@@ -594,17 +615,19 @@ impl LayerInner {
}
}
/// Should be cancellation safe, but cancellation is troublesome together with the spawned
/// download.
/// Cancellation safe.
#[tracing::instrument(skip_all, fields(layer=%self))]
async fn get_or_maybe_download(
self: &Arc<Self>,
allow_download: bool,
ctx: Option<&RequestContext>,
) -> Result<Arc<DownloadedLayer>, DownloadError> {
let mut init_permit = None;
loop {
let download = move || async move {
let download = move |permit| async move {
// disable any scheduled but not yet running eviction deletions for this
self.version.fetch_add(1, Ordering::Relaxed);
let next_version = 1 + self.version.fetch_add(1, Ordering::Relaxed);
// no need to make the evict_and_wait wait for the actual download to complete
drop(self.status.send(Status::Downloaded));
@@ -623,7 +646,11 @@ impl LayerInner {
.await
.map_err(DownloadError::PreStatFailed)?;
if let Some(reason) = needs_download {
let permit = if let Some(reason) = needs_download {
if let NeedsDownload::NotFile(ft) = reason {
return Err(DownloadError::NotFile(ft));
}
// only reset this after we've decided we really need to download. otherwise it'd
// be impossible to mark cancelled downloads for eviction, like one could imagine
// we would like to do for prefetching which was not needed.
@@ -633,8 +660,6 @@ impl LayerInner {
return Err(DownloadError::NoRemoteStorage);
}
tracing::debug!(%reason, "downloading layer");
if let Some(ctx) = ctx {
self.check_expected_download(ctx)?;
}
@@ -645,16 +670,21 @@ impl LayerInner {
return Err(DownloadError::DownloadRequired);
}
self.spawn_download_and_wait(timeline).await?;
tracing::info!(%reason, "downloading on-demand");
self.spawn_download_and_wait(timeline, permit).await?
} else {
// the file is present locally, probably by a previous but cancelled call to
// get_or_maybe_download. alternatively we might be running without remote storage.
LAYER_IMPL_METRICS.inc_init_needed_no_download();
}
permit
};
let res = Arc::new(DownloadedLayer {
owner: Arc::downgrade(self),
kind: tokio::sync::OnceCell::default(),
version: next_version,
});
self.access_stats.record_residence_event(
@@ -662,19 +692,60 @@ impl LayerInner {
LayerResidenceEventReason::ResidenceChange,
);
Ok(ResidentOrWantedEvicted::Resident(res))
let waiters = self.inner.initializer_count();
if waiters > 0 {
tracing::info!(waiters, "completing the on-demand download for other tasks");
}
Ok((ResidentOrWantedEvicted::Resident(res), permit))
};
let locked = self.inner.get_or_init(download).await?;
if let Some(strong) = Self::get_or_apply_evictedness(Some(locked), &self.wanted_evicted)
{
if let Some(init_permit) = init_permit.take() {
// use the already held initialization permit because it is impossible to hit the
// below paths anymore, essentially limiting the max loop iterations to 2.
let (value, init_permit) = download(init_permit).await?;
let mut guard = self.inner.set(value, init_permit);
let (strong, _upgraded) = guard
.get_and_upgrade()
.expect("init creates strong reference, we held the init permit");
return Ok(strong);
}
// the situation in which we might need to retry is that our init was ready
// immediately, but the DownloadedLayer had been dropped but failed to complete
// Self::evict_blocking
let (weak, permit) = {
let mut locked = self.inner.get_or_init(download).await?;
if let Some((strong, upgraded)) = locked.get_and_upgrade() {
if upgraded {
// when upgraded back, the Arc<DownloadedLayer> is still available, but
// previously a `evict_and_wait` was received.
self.wanted_evicted.store(false, Ordering::Relaxed);
// error out any `evict_and_wait`
drop(self.status.send(Status::Downloaded));
LAYER_IMPL_METRICS
.inc_eviction_cancelled(EvictionCancelled::UpgradedBackOnAccess);
}
return Ok(strong);
} else {
// path to here: the evict_blocking is stuck on spawn_blocking queue.
//
// reset the contents, deactivating the eviction and causing a
// EvictionCancelled::LostToDownload or EvictionCancelled::VersionCheckFailed.
locked.take_and_deinit()
}
};
// unlock first, then drop the weak, but because upgrade failed, we
// know it cannot be a problem.
assert!(
matches!(weak, ResidentOrWantedEvicted::WantedEvicted(..)),
"unexpected {weak:?}, ResidentOrWantedEvicted::get_and_upgrade has a bug"
);
init_permit = Some(permit);
LAYER_IMPL_METRICS.inc_retried_get_or_maybe_download();
}
}
@@ -686,8 +757,8 @@ impl LayerInner {
match b {
Download => Ok(()),
Warn | Error => {
tracing::warn!(
"unexpectedly on-demand downloading remote layer {self} for task kind {:?}",
tracing::info!(
"unexpectedly on-demand downloading for task kind {:?}",
ctx.task_kind()
);
crate::metrics::UNEXPECTED_ONDEMAND_DOWNLOADS.inc();
@@ -709,14 +780,17 @@ impl LayerInner {
async fn spawn_download_and_wait(
self: &Arc<Self>,
timeline: Arc<Timeline>,
) -> Result<(), DownloadError> {
permit: heavier_once_cell::InitPermit,
) -> Result<heavier_once_cell::InitPermit, DownloadError> {
let task_name = format!("download layer {}", self);
let (tx, rx) = tokio::sync::oneshot::channel();
// this is sadly needed because of task_mgr::shutdown_tasks, otherwise we cannot
// block tenant::mgr::remove_tenant_from_memory.
let this: Arc<Self> = self.clone();
crate::task_mgr::spawn(
&tokio::runtime::Handle::current(),
crate::task_mgr::TaskKind::RemoteDownloadTask,
@@ -725,6 +799,7 @@ impl LayerInner {
&task_name,
false,
async move {
let client = timeline
.remote_client
.as_ref()
@@ -746,9 +821,9 @@ impl LayerInner {
}
};
if let Err(res) = tx.send(result) {
if let Err(res) = tx.send((result, permit)) {
match res {
Ok(()) => {
(Ok(()), _) => {
// our caller is cancellation safe so this is fine; if someone
// else requests the layer, they'll find it already downloaded
// or redownload.
@@ -759,7 +834,7 @@ impl LayerInner {
tracing::info!("layer file download completed after requester had cancelled");
LAYER_IMPL_METRICS.inc_download_completed_without_requester();
},
Err(e) => {
(Err(e), _) => {
// our caller is cancellation safe, but we might be racing with
// another attempt to initialize. before we have cancellation
// token support: these attempts should converge regardless of
@@ -775,7 +850,7 @@ impl LayerInner {
.in_current_span(),
);
match rx.await {
Ok(Ok(())) => {
Ok((Ok(()), permit)) => {
if let Some(reason) = self
.needs_download()
.await
@@ -786,10 +861,12 @@ impl LayerInner {
}
self.consecutive_failures.store(0, Ordering::Relaxed);
tracing::info!("on-demand download successful");
Ok(())
Ok(permit)
}
Ok(Err(e)) => {
Ok((Err(e), _permit)) => {
// FIXME: this should be with the spawned task and be cancellation sensitive
let consecutive_failures =
self.consecutive_failures.fetch_add(1, Ordering::Relaxed);
tracing::error!(consecutive_failures, "layer file download failed: {e:#}");
@@ -807,33 +884,6 @@ impl LayerInner {
}
}
/// Access the current state without waiting for the file to be downloaded.
///
/// Requires that we've initialized to state which is respective to the
/// actual residency state.
fn get(&self) -> Option<Arc<DownloadedLayer>> {
let locked = self.inner.get();
Self::get_or_apply_evictedness(locked, &self.wanted_evicted)
}
fn get_or_apply_evictedness(
guard: Option<heavier_once_cell::Guard<'_, ResidentOrWantedEvicted>>,
wanted_evicted: &AtomicBool,
) -> Option<Arc<DownloadedLayer>> {
if let Some(mut x) = guard {
if let Some(won) = x.get() {
// there are no guarantees that we will always get to observe a concurrent call
// to evict
if wanted_evicted.load(Ordering::Acquire) {
x.downgrade();
}
return Some(won);
}
}
None
}
async fn needs_download(&self) -> Result<Option<NeedsDownload>, std::io::Error> {
match tokio::fs::metadata(&self.path).await {
Ok(m) => Ok(self.is_file_present_and_good_size(&m).err()),
@@ -853,7 +903,7 @@ impl LayerInner {
fn is_file_present_and_good_size(&self, m: &std::fs::Metadata) -> Result<(), NeedsDownload> {
// in future, this should include sha2-256 validation of the file.
if !m.is_file() {
Err(NeedsDownload::NotFile)
Err(NeedsDownload::NotFile(m.file_type()))
} else if m.len() != self.desc.file_size {
Err(NeedsDownload::WrongSize {
actual: m.len(),
@@ -867,7 +917,9 @@ impl LayerInner {
fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
let layer_file_name = self.desc.filename().file_name();
let remote = self.get().is_none();
// this is not accurate: we could have the file locally but there was a cancellation
// and now we are not in sync, or we are currently downloading it.
let remote = self.inner.get().is_none();
let access_stats = self.access_stats.as_api_model(reset);
@@ -896,7 +948,7 @@ impl LayerInner {
}
/// `DownloadedLayer` is being dropped, so it calls this method.
fn on_downloaded_layer_drop(self: Arc<LayerInner>) {
fn on_downloaded_layer_drop(self: Arc<LayerInner>, version: usize) {
let gc = self.wanted_garbage_collected.load(Ordering::Acquire);
let evict = self.wanted_evicted.load(Ordering::Acquire);
let can_evict = self.have_remote_client;
@@ -904,15 +956,16 @@ impl LayerInner {
if gc {
// do nothing now, only in LayerInner::drop
} else if can_evict && evict {
let version = self.version.load(Ordering::Relaxed);
let span = tracing::info_span!(parent: None, "layer_evict", tenant_id = %self.desc.tenant_id, timeline_id = %self.desc.timeline_id, layer=%self);
let span = tracing::info_span!(parent: None, "layer_evict", tenant_id = %self.desc.tenant_id, timeline_id = %self.desc.timeline_id, layer=%self, %version);
// downgrade for queueing, in case there's a tear down already ongoing we should not
// hold it alive.
let this = Arc::downgrade(&self);
drop(self);
// NOTE: this scope *must* never call `self.inner.get` because evict_and_wait might
// drop while the `self.inner` is being locked, leading to a deadlock.
crate::task_mgr::BACKGROUND_RUNTIME.spawn_blocking(move || {
let _g = span.entered();
@@ -922,19 +975,15 @@ impl LayerInner {
LAYER_IMPL_METRICS.inc_eviction_cancelled(EvictionCancelled::LayerGone);
return;
};
this.evict_blocking(version);
match this.evict_blocking(version) {
Ok(()) => LAYER_IMPL_METRICS.inc_completed_evictions(),
Err(reason) => LAYER_IMPL_METRICS.inc_eviction_cancelled(reason),
}
});
}
}
fn evict_blocking(&self, version: usize) {
match self.evict_blocking0(version) {
Ok(()) => LAYER_IMPL_METRICS.inc_completed_evictions(),
Err(reason) => LAYER_IMPL_METRICS.inc_eviction_cancelled(reason),
}
}
fn evict_blocking0(&self, version: usize) -> Result<(), EvictionCancelled> {
fn evict_blocking(&self, only_version: usize) -> Result<(), EvictionCancelled> {
// deleted or detached timeline, don't do anything.
let Some(timeline) = self.timeline.upgrade() else {
return Err(EvictionCancelled::TimelineGone);
@@ -945,32 +994,34 @@ impl LayerInner {
let _permit = {
let maybe_downloaded = self.inner.get();
if version != self.version.load(Ordering::Relaxed) {
// downloadness-state has advanced, we might no longer be the latest eviction
// work; don't do anything.
//
// this is possible to get to by having:
//
// 1. wanted_evicted.store(true)
// 2. ResidentOrWantedEvicted::downgrade
// 3. DownloadedLayer::drop
// 4. LayerInner::get_or_maybe_download
// 5. LayerInner::evict_blocking
return Err(EvictionCancelled::VersionCheckFailed);
}
// free the DownloadedLayer allocation
match maybe_downloaded.map(|mut g| g.take_and_deinit()) {
Some((taken, permit)) => {
assert!(matches!(taken, ResidentOrWantedEvicted::WantedEvicted(_)));
permit
let (_weak, permit) = match maybe_downloaded {
Some(mut guard) => {
if let ResidentOrWantedEvicted::WantedEvicted(_weak, version) = &*guard {
if *version == only_version {
guard.take_and_deinit()
} else {
// this was not for us; maybe there's another eviction job
// TODO: does it make any sense to stall here? unique versions do not
// matter, we only want to make sure not to evict a resident, which we
// are not doing.
return Err(EvictionCancelled::VersionCheckFailed);
}
} else {
return Err(EvictionCancelled::AlreadyReinitialized);
}
}
None => {
unreachable!("we do the version checking for this exact reason")
// already deinitialized, perhaps get_or_maybe_download did this and is
// currently waiting to reinitialize it
return Err(EvictionCancelled::LostToDownload);
}
}
};
permit
};
// now accesses to inner.get_or_init wait on the semaphore or the `_permit`
self.access_stats.record_residence_event(
LayerResidenceStatus::Evicted,
LayerResidenceEventReason::ResidenceChange,
@@ -1003,11 +1054,14 @@ impl LayerInner {
Ok(())
}
Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
tracing::info!("failed to evict file from disk, it was already gone");
tracing::error!(
layer_size = %self.desc.file_size,
"failed to evict layer from disk, it was already gone (metrics will be inaccurate)"
);
Err(EvictionCancelled::FileNotFound)
}
Err(e) => {
tracing::warn!("failed to evict file from disk: {e:#}");
tracing::error!("failed to evict file from disk: {e:#}");
Err(EvictionCancelled::RemoveFailed)
}
};
@@ -1051,6 +1105,8 @@ enum DownloadError {
ContextAndConfigReallyDeniesDownloads,
#[error("downloading is really required but not allowed by this method")]
DownloadRequired,
#[error("layer path exists, but it is not a file: {0:?}")]
NotFile(std::fs::FileType),
/// Why no error here? Because it will be reported by page_service. We should have also done
/// retries already.
#[error("downloading evicted layer file failed")]
@@ -1066,7 +1122,7 @@ enum DownloadError {
#[derive(Debug, PartialEq)]
pub(crate) enum NeedsDownload {
NotFound,
NotFile,
NotFile(std::fs::FileType),
WrongSize { actual: u64, expected: u64 },
}
@@ -1074,7 +1130,7 @@ impl std::fmt::Display for NeedsDownload {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
NeedsDownload::NotFound => write!(f, "file was not found"),
NeedsDownload::NotFile => write!(f, "path is not a file"),
NeedsDownload::NotFile(ft) => write!(f, "path is not a file; {ft:?}"),
NeedsDownload::WrongSize { actual, expected } => {
write!(f, "file size mismatch {actual} vs. {expected}")
}
@@ -1085,7 +1141,10 @@ impl std::fmt::Display for NeedsDownload {
/// Existence of `DownloadedLayer` means that we have the file locally, and can later evict it.
pub(crate) struct DownloadedLayer {
owner: Weak<LayerInner>,
// Use tokio OnceCell as we do not need to deinitialize this, it'll just get dropped with the
// DownloadedLayer
kind: tokio::sync::OnceCell<anyhow::Result<LayerKind>>,
version: usize,
}
impl std::fmt::Debug for DownloadedLayer {
@@ -1093,6 +1152,7 @@ impl std::fmt::Debug for DownloadedLayer {
f.debug_struct("DownloadedLayer")
// owner omitted because it is always "Weak"
.field("kind", &self.kind)
.field("version", &self.version)
.finish()
}
}
@@ -1100,7 +1160,7 @@ impl std::fmt::Debug for DownloadedLayer {
impl Drop for DownloadedLayer {
fn drop(&mut self) {
if let Some(owner) = self.owner.upgrade() {
owner.on_downloaded_layer_drop();
owner.on_downloaded_layer_drop(self.version);
} else {
// no need to do anything, we are shutting down
}
@@ -1126,7 +1186,6 @@ impl DownloadedLayer {
"these are the same, just avoiding the upgrade"
);
// there is nothing async here, but it should be async
let res = if owner.desc.is_delta {
let summary = Some(delta_layer::Summary::expected(
owner.desc.tenant_id,
@@ -1225,6 +1284,8 @@ impl std::fmt::Debug for ResidentLayer {
impl ResidentLayer {
/// Release the eviction guard, converting back into a plain [`Layer`].
///
/// You can access the [`Layer`] also by using `as_ref`.
pub(crate) fn drop_eviction_guard(self) -> Layer {
self.into()
}
@@ -1280,7 +1341,7 @@ impl AsRef<Layer> for ResidentLayer {
}
}
/// Allow slimming down if we don't want the `2*usize` with eviction candidates?
/// Drop the eviction guard.
impl From<ResidentLayer> for Layer {
fn from(value: ResidentLayer) -> Self {
value.owner
@@ -1450,6 +1511,13 @@ impl LayerImplMetrics {
.unwrap()
.inc();
}
fn inc_broadcast_lagged(&self) {
self.rare_counters
.get_metric_with_label_values(&["broadcast_lagged"])
.unwrap()
.inc();
}
}
enum EvictionCancelled {
@@ -1458,6 +1526,11 @@ enum EvictionCancelled {
VersionCheckFailed,
FileNotFound,
RemoveFailed,
AlreadyReinitialized,
/// Not evicted because of a pending reinitialization
LostToDownload,
/// After eviction, there was a new layer access which cancelled the eviction.
UpgradedBackOnAccess,
}
impl EvictionCancelled {
@@ -1468,6 +1541,9 @@ impl EvictionCancelled {
EvictionCancelled::VersionCheckFailed => "version_check_fail",
EvictionCancelled::FileNotFound => "file_not_found",
EvictionCancelled::RemoveFailed => "remove_failed",
EvictionCancelled::AlreadyReinitialized => "already_reinitialized",
EvictionCancelled::LostToDownload => "lost_to_download",
EvictionCancelled::UpgradedBackOnAccess => "upgraded_back_on_access",
}
}
}

View File

@@ -23,7 +23,7 @@ use tokio::{
};
use tokio_util::sync::CancellationToken;
use tracing::*;
use utils::id::TenantTimelineId;
use utils::{id::TenantTimelineId, sync::gate::Gate};
use std::cmp::{max, min, Ordering};
use std::collections::{BinaryHeap, HashMap, HashSet};
@@ -310,6 +310,13 @@ pub struct Timeline {
/// Load or creation time information about the disk_consistent_lsn and when the loading
/// happened. Used for consumption metrics.
pub(crate) loaded_at: (Lsn, SystemTime),
/// Gate to prevent shutdown completing while I/O is still happening to this timeline's data
pub(crate) gate: Gate,
/// Cancellation token scoped to this timeline: anything doing long-running work relating
/// to the timeline should drop out when this token fires.
pub(crate) cancel: CancellationToken,
}
pub struct WalReceiverInfo {
@@ -786,7 +793,11 @@ impl Timeline {
// as an empty timeline. Also in unit tests, when we use the timeline
// as a simple key-value store, ignoring the datadir layout. Log the
// error but continue.
error!("could not compact, repartitioning keyspace failed: {err:?}");
//
// Suppress error when it's due to cancellation
if !self.cancel.is_cancelled() {
error!("could not compact, repartitioning keyspace failed: {err:?}");
}
}
};
@@ -884,7 +895,12 @@ impl Timeline {
pub async fn shutdown(self: &Arc<Self>, freeze_and_flush: bool) {
debug_assert_current_span_has_tenant_and_timeline_id();
// Signal any subscribers to our cancellation token to drop out
tracing::debug!("Cancelling CancellationToken");
self.cancel.cancel();
// prevent writes to the InMemoryLayer
tracing::debug!("Waiting for WalReceiverManager...");
task_mgr::shutdown_tasks(
Some(TaskKind::WalReceiverManager),
Some(self.tenant_id),
@@ -920,6 +936,16 @@ impl Timeline {
warn!("failed to await for frozen and flushed uploads: {e:#}");
}
}
// Page request handlers might be waiting for LSN to advance: they do not respect Timeline::cancel
// while doing so.
self.last_record_lsn.shutdown();
tracing::debug!("Waiting for tasks...");
task_mgr::shutdown_tasks(None, Some(self.tenant_id), Some(self.timeline_id)).await;
// Finally wait until any gate-holders are complete
self.gate.close().await;
}
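A hedged sketch of how request paths are expected to use the gate added above; the handler is hypothetical, but it mirrors the `gate.enter()` calls elsewhere in this diff:

    async fn read_something(timeline: &Timeline) -> anyhow::Result<()> {
        // Refuse new work once shutdown has started; otherwise hold the gate open
        // for the duration of the I/O so that `shutdown()` waits for us in `gate.close()`.
        let _guard = timeline
            .gate
            .enter()
            .map_err(|_| anyhow::anyhow!("Shutting down"))?;
        // ... read timeline data while holding `_guard` ...
        Ok(())
    }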
pub fn set_state(&self, new_state: TimelineState) {
@@ -1048,6 +1074,11 @@ impl Timeline {
/// Like [`evict_layer_batch`](Self::evict_layer_batch), but for just one layer.
/// Additional case `Ok(None)` covers the case where the layer could not be found by its `layer_file_name`.
pub async fn evict_layer(&self, layer_file_name: &str) -> anyhow::Result<Option<bool>> {
let _gate = self
.gate
.enter()
.map_err(|_| anyhow::anyhow!("Shutting down"))?;
let Some(local_layer) = self.find_layer(layer_file_name).await else {
return Ok(None);
};
@@ -1063,9 +1094,8 @@ impl Timeline {
.as_ref()
.ok_or_else(|| anyhow::anyhow!("remote storage not configured; cannot evict"))?;
let cancel = CancellationToken::new();
let results = self
.evict_layer_batch(remote_client, &[local_layer], &cancel)
.evict_layer_batch(remote_client, &[local_layer])
.await?;
assert_eq!(results.len(), 1);
let result: Option<Result<(), EvictionError>> = results.into_iter().next().unwrap();
@@ -1080,15 +1110,18 @@ impl Timeline {
pub(crate) async fn evict_layers(
&self,
layers_to_evict: &[Layer],
cancel: &CancellationToken,
) -> anyhow::Result<Vec<Option<Result<(), EvictionError>>>> {
let _gate = self
.gate
.enter()
.map_err(|_| anyhow::anyhow!("Shutting down"))?;
let remote_client = self
.remote_client
.as_ref()
.context("timeline must have RemoteTimelineClient")?;
self.evict_layer_batch(remote_client, layers_to_evict, cancel)
.await
self.evict_layer_batch(remote_client, layers_to_evict).await
}
/// Evict multiple layers at once, continuing through errors.
@@ -1109,7 +1142,6 @@ impl Timeline {
&self,
remote_client: &Arc<RemoteTimelineClient>,
layers_to_evict: &[Layer],
cancel: &CancellationToken,
) -> anyhow::Result<Vec<Option<Result<(), EvictionError>>>> {
// ensure that the layers have finished uploading
// (don't hold the layer_removal_cs while we do it, we're not removing anything yet)
@@ -1157,7 +1189,7 @@ impl Timeline {
};
tokio::select! {
_ = cancel.cancelled() => {},
_ = self.cancel.cancelled() => {},
_ = join => {}
}
@@ -1267,6 +1299,7 @@ impl Timeline {
initial_logical_size_can_start: Option<completion::Barrier>,
initial_logical_size_attempt: Option<completion::Completion>,
state: TimelineState,
cancel: CancellationToken,
) -> Arc<Self> {
let disk_consistent_lsn = metadata.disk_consistent_lsn();
let (state, _) = watch::channel(state);
@@ -1367,6 +1400,8 @@ impl Timeline {
initial_logical_size_can_start,
initial_logical_size_attempt: Mutex::new(initial_logical_size_attempt),
cancel,
gate: Gate::new(format!("Timeline<{tenant_id}/{timeline_id}>")),
};
result.repartition_threshold =
result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE;
@@ -2030,6 +2065,10 @@ impl Timeline {
let mut cont_lsn = Lsn(request_lsn.0 + 1);
'outer: loop {
if self.cancel.is_cancelled() {
return Err(PageReconstructError::Cancelled);
}
// The function should have updated 'state'
//info!("CALLED for {} at {}: {:?} with {} records, cached {}", key, cont_lsn, result, reconstruct_state.records.len(), cached_lsn);
match result {
@@ -2936,13 +2975,10 @@ struct CompactLevel0Phase1StatsBuilder {
new_deltas_size: Option<u64>,
}
#[serde_as]
#[derive(serde::Serialize)]
struct CompactLevel0Phase1Stats {
version: u64,
#[serde_as(as = "serde_with::DisplayFromStr")]
tenant_id: TenantId,
#[serde_as(as = "serde_with::DisplayFromStr")]
timeline_id: TimelineId,
read_lock_acquisition_micros: RecordedDuration,
read_lock_held_spawn_blocking_startup_micros: RecordedDuration,
@@ -4369,25 +4405,10 @@ mod tests {
.expect("should had been resident")
.drop_eviction_guard();
let cancel = tokio_util::sync::CancellationToken::new();
let batch = [layer];
let first = {
let cancel = cancel.child_token();
async {
let cancel = cancel;
timeline
.evict_layer_batch(&rc, &batch, &cancel)
.await
.unwrap()
}
};
let second = async {
timeline
.evict_layer_batch(&rc, &batch, &cancel)
.await
.unwrap()
};
let first = async { timeline.evict_layer_batch(&rc, &batch).await.unwrap() };
let second = async { timeline.evict_layer_batch(&rc, &batch).await.unwrap() };
let (first, second) = tokio::join!(first, second);

View File

@@ -17,6 +17,7 @@ use crate::{
deletion_queue::DeletionQueueClient,
task_mgr::{self, TaskKind},
tenant::{
debug_assert_current_span_has_tenant_and_timeline_id,
metadata::TimelineMetadata,
remote_timeline_client::{
self, PersistIndexPartWithDeletedFlagError, RemoteTimelineClient,
@@ -30,6 +31,11 @@ use super::{Timeline, TimelineResources};
/// Now that the Timeline is in Stopping state, request all the related tasks to shut down.
async fn stop_tasks(timeline: &Timeline) -> Result<(), DeleteTimelineError> {
debug_assert_current_span_has_tenant_and_timeline_id();
// Notify any timeline work to drop out of loops/requests
tracing::debug!("Cancelling CancellationToken");
timeline.cancel.cancel();
// Stop the walreceiver first.
debug!("waiting for wal receiver to shutdown");
let maybe_started_walreceiver = { timeline.walreceiver.lock().unwrap().take() };
@@ -74,6 +80,11 @@ async fn stop_tasks(timeline: &Timeline) -> Result<(), DeleteTimelineError> {
"failpoint: timeline-delete-before-index-deleted-at"
))?
});
tracing::debug!("Waiting for gate...");
timeline.gate.close().await;
tracing::debug!("Shutdown complete");
Ok(())
}

View File

@@ -277,10 +277,7 @@ impl Timeline {
Some(c) => c,
};
let results = match self
.evict_layer_batch(remote_client, &candidates, cancel)
.await
{
let results = match self.evict_layer_batch(remote_client, &candidates).await {
Err(pre_err) => {
stats.errors += candidates.len();
error!("could not do any evictions: {pre_err:#}");
@@ -344,20 +341,7 @@ impl Timeline {
// Make one of the tenant's timelines draw the short straw and run the calculation.
// The others wait until the calculation is done so that they take into account the
// imitated accesses that the winner made.
//
// It is critical we are responsive to cancellation here. Otherwise, we deadlock with
// tenant deletion (holds TENANTS in read mode) any other task that attempts to
// acquire TENANTS in write mode before we here call get_tenant.
// See https://github.com/neondatabase/neon/issues/5284.
let res = tokio::select! {
_ = cancel.cancelled() => {
return ControlFlow::Break(());
}
res = crate::tenant::mgr::get_tenant(self.tenant_id, true) => {
res
}
};
let tenant = match res {
let tenant = match crate::tenant::mgr::get_tenant(self.tenant_id, true) {
Ok(t) => t,
Err(_) => {
return ControlFlow::Break(());

View File

@@ -426,7 +426,7 @@ impl ConnectionManagerState {
timeline,
new_sk.wal_source_connconf,
events_sender,
cancellation,
cancellation.clone(),
connect_timeout,
ctx,
node_id,
@@ -447,7 +447,14 @@ impl ConnectionManagerState {
}
WalReceiverError::Other(e) => {
// give out an error to have task_mgr give it a really verbose logging
Err(e).context("walreceiver connection handling failure")
if cancellation.is_cancelled() {
// Ideally we would learn about this via some path other than Other, but
// that requires refactoring all the intermediate layers of ingest code
// that only emit anyhow::Error
Ok(())
} else {
Err(e).context("walreceiver connection handling failure")
}
}
}
}

View File

@@ -19,6 +19,7 @@ use std::io::{Error, ErrorKind, Seek, SeekFrom};
use std::os::unix::fs::FileExt;
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
use std::sync::{RwLock, RwLockWriteGuard};
use utils::fs_ext;
///
/// A virtual file descriptor. You can use this just like std::fs::File, but internally
@@ -173,37 +174,78 @@ impl OpenFiles {
}
}
#[derive(Debug, thiserror::Error)]
pub enum CrashsafeOverwriteError {
#[error("final path has no parent dir")]
FinalPathHasNoParentDir,
#[error("remove tempfile")]
RemovePreviousTempfile(#[source] std::io::Error),
#[error("create tempfile")]
CreateTempfile(#[source] std::io::Error),
#[error("write tempfile")]
WriteContents(#[source] std::io::Error),
#[error("sync tempfile")]
SyncTempfile(#[source] std::io::Error),
#[error("rename tempfile to final path")]
RenameTempfileToFinalPath(#[source] std::io::Error),
#[error("open final path parent dir")]
OpenFinalPathParentDir(#[source] std::io::Error),
#[error("sync final path parent dir")]
SyncFinalPathParentDir(#[source] std::io::Error),
/// Identify error types that should always terminate the process. Other
/// error types may be eligible for retry.
pub(crate) fn is_fatal_io_error(e: &std::io::Error) -> bool {
use nix::errno::Errno::*;
match e.raw_os_error().map(nix::errno::from_i32) {
Some(EIO) => {
// Terminate on EIO because we no longer trust the device to store
// data safely, or to uphold persistence guarantees on fsync.
true
}
Some(EROFS) => {
// Terminate on EROFS because a filesystem is usually remounted
// readonly when it has experienced some critical issue, so the same
// logic as EIO applies.
true
}
Some(EACCES) => {
// Terminate on EACCES because we should always have permissions
// for our own data dir: if we don't, then we can't do our job and
// need administrative intervention to fix permissions. Terminating
// is the best way to make sure we stop cleanly rather than going
// into infinite retry loops, and will make it clear to the outside
// world that we need help.
true
}
_ => {
// Treat all other local file I/O errors as retryable. This includes:
// - ENOSPC: we stay up and wait for eviction to free some space
// - EINVAL, EBADF, EBADFD: this is a code bug, not a filesystem/hardware issue
// - WriteZero, Interrupted: these are used internally by VirtualFile
false
}
}
}
impl CrashsafeOverwriteError {
/// Returns true iff the new contents are durably stored.
pub fn are_new_contents_durable(&self) -> bool {
/// Call this when the local filesystem gives us an error with an external
/// cause: this includes EIO, EROFS, and EACCES: all of these indicate either
/// bad storage or bad configuration, and we can't fix that from inside
/// a running process.
pub(crate) fn on_fatal_io_error(e: &std::io::Error, context: &str) -> ! {
tracing::error!("Fatal I/O error: {e}: {context})");
std::process::abort();
}
pub(crate) trait MaybeFatalIo<T> {
fn maybe_fatal_err(self, context: &str) -> std::io::Result<T>;
fn fatal_err(self, context: &str) -> T;
}
impl<T> MaybeFatalIo<T> for std::io::Result<T> {
/// Terminate the process if the result is an error of a fatal type, else pass it through
///
/// This is appropriate for writes, where we typically want to die on EIO/EACCES etc., but
/// not on ENOSPC.
fn maybe_fatal_err(self, context: &str) -> std::io::Result<T> {
if let Err(e) = &self {
if is_fatal_io_error(e) {
on_fatal_io_error(e, context);
}
}
self
}
/// Terminate the process on any I/O error.
///
/// This is appropriate for reads on files that we know exist: they should always work.
fn fatal_err(self, context: &str) -> T {
match self {
Self::FinalPathHasNoParentDir => false,
Self::RemovePreviousTempfile(_) => false,
Self::CreateTempfile(_) => false,
Self::WriteContents(_) => false,
Self::SyncTempfile(_) => false,
Self::RenameTempfileToFinalPath(_) => false,
Self::OpenFinalPathParentDir(_) => false,
Self::SyncFinalPathParentDir(_) => true,
Ok(v) => v,
Err(e) => {
on_fatal_io_error(&e, context);
}
}
}
}
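A short usage sketch for the helpers above (the functions and paths are hypothetical): `maybe_fatal_err` lets retryable errors such as ENOSPC bubble up but aborts on EIO/EROFS/EACCES, while `fatal_err` aborts on any error:

    fn write_layer_file(path: &std::path::Path, buf: &[u8]) -> std::io::Result<()> {
        // Retryable errors (e.g. ENOSPC) are returned to the caller; fatal ones abort.
        std::fs::write(path, buf).maybe_fatal_err("write layer file")
    }

    fn read_known_metadata(path: &std::path::Path) -> Vec<u8> {
        // The file is expected to exist, so any I/O error here is treated as fatal.
        std::fs::read(path).fatal_err("read metadata file")
    }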
@@ -284,15 +326,13 @@ impl VirtualFile {
final_path: &Utf8Path,
tmp_path: &Utf8Path,
content: &[u8],
) -> Result<(), CrashsafeOverwriteError> {
) -> std::io::Result<()> {
let Some(final_path_parent) = final_path.parent() else {
return Err(CrashsafeOverwriteError::FinalPathHasNoParentDir);
return Err(std::io::Error::from_raw_os_error(
nix::errno::Errno::EINVAL as i32,
));
};
match std::fs::remove_file(tmp_path) {
Ok(()) => {}
Err(e) if e.kind() == std::io::ErrorKind::NotFound => {}
Err(e) => return Err(CrashsafeOverwriteError::RemovePreviousTempfile(e)),
}
std::fs::remove_file(tmp_path).or_else(fs_ext::ignore_not_found)?;
let mut file = Self::open_with_options(
tmp_path,
OpenOptions::new()
@@ -301,31 +341,20 @@ impl VirtualFile {
// we bail out instead of causing damage.
.create_new(true),
)
.await
.map_err(CrashsafeOverwriteError::CreateTempfile)?;
file.write_all(content)
.await
.map_err(CrashsafeOverwriteError::WriteContents)?;
file.sync_all()
.await
.map_err(CrashsafeOverwriteError::SyncTempfile)?;
.await?;
file.write_all(content).await?;
file.sync_all().await?;
drop(file); // before the rename, that's important!
// renames are atomic
std::fs::rename(tmp_path, final_path)
.map_err(CrashsafeOverwriteError::RenameTempfileToFinalPath)?;
std::fs::rename(tmp_path, final_path)?;
// Only open final path parent dirfd now, so that this operation only
// ever holds one VirtualFile fd at a time. That's important because
// the current `find_victim_slot` impl might pick the same slot for both
// VirtualFiles, and it eventually does a blocking write lock instead of
// try_lock.
let final_parent_dirfd =
Self::open_with_options(final_path_parent, OpenOptions::new().read(true))
.await
.map_err(CrashsafeOverwriteError::OpenFinalPathParentDir)?;
final_parent_dirfd
.sync_all()
.await
.map_err(CrashsafeOverwriteError::SyncFinalPathParentDir)?;
Self::open_with_options(final_path_parent, OpenOptions::new().read(true)).await?;
final_parent_dirfd.sync_all().await?;
Ok(())
}
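The body above follows the usual crash-safe overwrite recipe. A condensed synchronous sketch of the same ordering (hypothetical helper, std only): write and fsync a temp file, atomically rename it over the destination, then fsync the parent directory so the rename itself survives a crash:

    use std::fs::{self, File, OpenOptions};
    use std::io::Write;
    use std::path::Path;

    fn overwrite_crashsafe(final_path: &Path, tmp_path: &Path, content: &[u8]) -> std::io::Result<()> {
        let parent = final_path
            .parent()
            .ok_or_else(|| std::io::Error::new(std::io::ErrorKind::InvalidInput, "no parent dir"))?;
        // Remove a temp file left over from an earlier crashed attempt, if any.
        match fs::remove_file(tmp_path) {
            Ok(()) => {}
            Err(e) if e.kind() == std::io::ErrorKind::NotFound => {}
            Err(e) => return Err(e),
        }
        let mut tmp = OpenOptions::new().write(true).create_new(true).open(tmp_path)?;
        tmp.write_all(content)?;
        tmp.sync_all()?; // make the contents durable before the rename
        drop(tmp); // close before renaming
        fs::rename(tmp_path, final_path)?; // atomic replacement
        File::open(parent)?.sync_all()?; // make the rename itself durable
        Ok(())
    }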

View File

@@ -857,7 +857,8 @@ impl WalRedoProcess {
let in_revents = stdin_pollfds[0].revents().unwrap();
if in_revents & (PollFlags::POLLERR | PollFlags::POLLOUT) != PollFlags::empty() {
nwrite += proc.stdin.write(&writebuf[nwrite..])?;
} else if in_revents.contains(PollFlags::POLLHUP) {
}
if in_revents.contains(PollFlags::POLLHUP) {
// We still have more data to write, but the process closed the pipe.
anyhow::bail!("WAL redo process closed its stdin unexpectedly");
}
@@ -907,7 +908,8 @@ impl WalRedoProcess {
let out_revents = stdout_pollfds[0].revents().unwrap();
if out_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() {
nresult += output.stdout.read(&mut resultbuf[nresult..])?;
} else if out_revents.contains(PollFlags::POLLHUP) {
}
if out_revents.contains(PollFlags::POLLHUP) {
anyhow::bail!("WAL redo process closed its stdout unexpectedly");
}
}
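The change above decouples the POLLHUP check from the data-transfer branch, because a pipe can report both data readiness and hangup in the same poll round. A reduced sketch of the pattern, assuming nix's `PollFlags` (the reader and buffer are placeholders):

    use std::io::Read;
    use nix::poll::PollFlags;

    fn drain_one_round(revents: PollFlags, reader: &mut impl Read, out: &mut Vec<u8>) -> anyhow::Result<()> {
        // Consume any available data first ...
        if revents.intersects(PollFlags::POLLERR | PollFlags::POLLIN) {
            let mut chunk = [0u8; 4096];
            let n = reader.read(&mut chunk)?;
            out.extend_from_slice(&chunk[..n]);
        }
        // ... and still notice the hangup within the same iteration.
        if revents.contains(PollFlags::POLLHUP) {
            anyhow::bail!("peer closed the pipe unexpectedly");
        }
        Ok(())
    }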

View File

@@ -88,7 +88,7 @@ static void StartProposerReplication(WalProposer *wp, StartReplicationCmd *cmd);
static void WalSndLoop(WalProposer *wp);
static void XLogBroadcastWalProposer(WalProposer *wp);
static void XLogWalPropWrite(char *buf, Size nbytes, XLogRecPtr recptr);
static void XLogWalPropWrite(WalProposer *wp, char *buf, Size nbytes, XLogRecPtr recptr);
static void XLogWalPropClose(XLogRecPtr recptr);
static void
@@ -1241,7 +1241,7 @@ WalProposerRecovery(Safekeeper *sk, TimeLineID timeline, XLogRecPtr startpos, XL
rec_end_lsn = rec_start_lsn + len - XLOG_HDR_SIZE;
/* write WAL to disk */
XLogWalPropWrite(&buf[XLOG_HDR_SIZE], len - XLOG_HDR_SIZE, rec_start_lsn);
XLogWalPropWrite(sk->wp, &buf[XLOG_HDR_SIZE], len - XLOG_HDR_SIZE, rec_start_lsn);
ereport(DEBUG1,
(errmsg("Recover message %X/%X length %d",
@@ -1283,11 +1283,24 @@ static XLogSegNo walpropSegNo = 0;
* Write XLOG data to disk.
*/
static void
XLogWalPropWrite(char *buf, Size nbytes, XLogRecPtr recptr)
XLogWalPropWrite(WalProposer *wp, char *buf, Size nbytes, XLogRecPtr recptr)
{
int startoff;
int byteswritten;
/*
* Apart from walproposer, the basebackup LSN page is also written out by
* postgres itself, which writes WAL only in whole pages; in the basebackup it is
* inherently dummy (only safekeepers have the historic WAL). Update the WAL buffers
* here so that the dummy page does not overwrite the correct one we download here.
* Ugly, but the alternatives are about as ugly. We won't need this if we switch
* to on-demand WAL download from safekeepers, without writing to disk.
*
* https://github.com/neondatabase/neon/issues/5749
*/
if (!wp->config->syncSafekeepers)
XLogUpdateWalBuffers(buf, recptr, nbytes);
while (nbytes > 0)
{
int segbytes;

View File

@@ -13,6 +13,7 @@ pub struct ConsoleError {
#[derive(Deserialize)]
pub struct GetRoleSecret {
pub role_secret: Box<str>,
pub allowed_ips: Option<Vec<Box<str>>>,
}
// Manually implement debug to omit sensitive info.
@@ -187,4 +188,31 @@ mod tests {
Ok(())
}
#[test]
fn parse_wake_compute() -> anyhow::Result<()> {
let json = json!({
"address": "0.0.0.0",
"aux": dummy_aux(),
});
let _: WakeCompute = serde_json::from_str(&json.to_string())?;
Ok(())
}
#[test]
fn parse_get_role_secret() -> anyhow::Result<()> {
// Empty `allowed_ips` field.
let json = json!({
"role_secret": "secret",
});
let _: GetRoleSecret = serde_json::from_str(&json.to_string())?;
// Non-empty `allowed_ips` field.
let json = json!({
"role_secret": "secret",
"allowed_ips": ["8.8.8.8"],
});
let _: GetRoleSecret = serde_json::from_str(&json.to_string())?;
Ok(())
}
}

View File

@@ -49,7 +49,7 @@ impl Api {
.endpoint
.get("proxy_get_role_secret")
.header("X-Request-ID", &request_id)
.header("Authorization", &self.jwt)
.header("Authorization", format!("Bearer {}", &self.jwt))
.query(&[("session_id", extra.session_id)])
.query(&[
("application_name", extra.application_name),
@@ -94,7 +94,7 @@ impl Api {
.endpoint
.get("proxy_wake_compute")
.header("X-Request-ID", &request_id)
.header("Authorization", &self.jwt)
.header("Authorization", format!("Bearer {}", &self.jwt))
.query(&[("session_id", extra.session_id)])
.query(&[
("application_name", extra.application_name),

View File

@@ -3,10 +3,9 @@
use std::ffi::CStr;
pub fn split_cstr(bytes: &[u8]) -> Option<(&CStr, &[u8])> {
let pos = bytes.iter().position(|&x| x == 0)?;
let (cstr, other) = bytes.split_at(pos + 1);
// SAFETY: we've already checked that there's a terminator
Some((unsafe { CStr::from_bytes_with_nul_unchecked(cstr) }, other))
let cstr = CStr::from_bytes_until_nul(bytes).ok()?;
let (_, other) = bytes.split_at(cstr.to_bytes_with_nul().len());
Some((cstr, other))
}
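A small usage sketch for the rewritten helper above, as a hypothetical test with a made-up input buffer:

    #[test]
    fn split_cstr_example() {
        let (cstr, rest) = split_cstr(b"user\0database\0").expect("input contains a NUL");
        assert_eq!(cstr.to_bytes(), b"user");
        assert_eq!(rest, b"database\0");
    }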
/// See <https://doc.rust-lang.org/std/primitive.slice.html#method.split_array_ref>.

View File

@@ -470,30 +470,26 @@ async fn query_to_json<T: GenericClient>(
}
.and_then(|s| s.parse::<i64>().ok());
let fields = if !rows.is_empty() {
rows[0]
.columns()
.iter()
.map(|c| {
json!({
"name": Value::String(c.name().to_owned()),
"dataTypeID": Value::Number(c.type_().oid().into()),
"tableID": c.table_oid(),
"columnID": c.column_id(),
"dataTypeSize": c.type_size(),
"dataTypeModifier": c.type_modifier(),
"format": "text",
})
})
.collect::<Vec<_>>()
} else {
Vec::new()
};
let mut fields = vec![];
let mut columns = vec![];
for c in row_stream.columns() {
fields.push(json!({
"name": Value::String(c.name().to_owned()),
"dataTypeID": Value::Number(c.type_().oid().into()),
"tableID": c.table_oid(),
"columnID": c.column_id(),
"dataTypeSize": c.type_size(),
"dataTypeModifier": c.type_modifier(),
"format": "text",
}));
columns.push(client.get_type(c.type_oid()).await?);
}
// convert rows to JSON
let rows = rows
.iter()
.map(|row| pg_text_row_to_json(row, raw_output, array_mode))
.map(|row| pg_text_row_to_json(row, &columns, raw_output, array_mode))
.collect::<Result<Vec<_>, _>>()?;
// resulting JSON format is based on the format of node-postgres result
@@ -514,22 +510,28 @@ async fn query_to_json<T: GenericClient>(
//
pub fn pg_text_row_to_json(
row: &Row,
columns: &[Type],
raw_output: bool,
array_mode: bool,
) -> Result<Value, anyhow::Error> {
let iter = row.columns().iter().enumerate().map(|(i, column)| {
let name = column.name();
let pg_value = row.as_text(i)?;
let json_value = if raw_output {
match pg_value {
Some(v) => Value::String(v.to_string()),
None => Value::Null,
}
} else {
pg_text_to_json(pg_value, column.type_())?
};
Ok((name.to_string(), json_value))
});
let iter = row
.columns()
.iter()
.zip(columns)
.enumerate()
.map(|(i, (column, typ))| {
let name = column.name();
let pg_value = row.as_text(i)?;
let json_value = if raw_output {
match pg_value {
Some(v) => Value::String(v.to_string()),
None => Value::Null,
}
} else {
pg_text_to_json(pg_value, typ)?
};
Ok((name.to_string(), json_value))
});
if array_mode {
// drop keys and aggregate into array

View File

@@ -33,6 +33,7 @@ reqwest = { workspace = true, default-features = false, features = ["rustls-tls"
aws-config = { workspace = true, default-features = false, features = ["rustls", "credentials-sso"] }
pageserver = { path = "../pageserver" }
remote_storage = { path = "../libs/remote_storage" }
tracing.workspace = true
tracing-subscriber.workspace = true

View File

@@ -1,13 +1,18 @@
use std::collections::HashSet;
use anyhow::Context;
use aws_sdk_s3::Client;
use aws_sdk_s3::{types::ObjectIdentifier, Client};
use tracing::{error, info, warn};
use utils::generation::Generation;
use crate::cloud_admin_api::BranchData;
use crate::{download_object_with_retries, list_objects_with_retries, RootTarget};
use crate::metadata_stream::stream_listing;
use crate::{download_object_with_retries, RootTarget};
use futures_util::{pin_mut, StreamExt};
use pageserver::tenant::remote_timeline_client::parse_remote_index_path;
use pageserver::tenant::storage_layer::LayerFileName;
use pageserver::tenant::IndexPart;
use remote_storage::RemotePath;
use utils::id::TenantTimelineId;
pub(crate) struct TimelineAnalysis {
@@ -68,6 +73,7 @@ pub(crate) async fn branch_cleanup_and_check_errors(
match s3_data.blob_data {
BlobDataParseResult::Parsed {
index_part,
index_part_generation,
mut s3_layers,
} => {
if !IndexPart::KNOWN_VERSIONS.contains(&index_part.get_version()) {
@@ -107,33 +113,62 @@ pub(crate) async fn branch_cleanup_and_check_errors(
))
}
if !s3_layers.remove(&layer) {
let layer_map_key = (layer, metadata.generation);
if !s3_layers.remove(&layer_map_key) {
// FIXME: this will emit false positives if an index was
// uploaded concurrently with our scan. To make this check
// correct, we need to try sending a HEAD request for the
// layer we think is missing.
result.errors.push(format!(
"index_part.json contains a layer {} that is not present in S3",
layer.file_name(),
"index_part.json contains a layer {}{} that is not present in remote storage",
layer_map_key.0.file_name(),
layer_map_key.1.get_suffix()
))
}
}
if !s3_layers.is_empty() {
let orphan_layers: Vec<(LayerFileName, Generation)> = s3_layers
.into_iter()
.filter(|(_layer_name, gen)|
// A layer is only considered orphaned if it has a generation below
// the index. If the generation is >= the index, then the layer may
// be an upload from a running pageserver, or even an upload from
// a new generation that didn't upload an index yet.
//
// Even so, a layer that is not referenced by the index could just
// be something enqueued for deletion, so while this check is valid
// for indicating that a layer is garbage, it is not an indicator
// of a problem.
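// Worked example: with the index at generation 5, a layer present only at
// generation 3 is counted as orphaned garbage, while a layer at generation 6
// is left alone because a newer pageserver may not have uploaded its index yet.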
gen < &index_part_generation)
.collect();
if !orphan_layers.is_empty() {
result.errors.push(format!(
"index_part.json does not contain layers from S3: {:?}",
s3_layers
orphan_layers
.iter()
.map(|layer_name| layer_name.file_name())
.map(|(layer_name, gen)| format!(
"{}{}",
layer_name.file_name(),
gen.get_suffix()
))
.collect::<Vec<_>>(),
));
result
.garbage_keys
.extend(s3_layers.iter().map(|layer_name| {
result.garbage_keys.extend(orphan_layers.iter().map(
|(layer_name, layer_gen)| {
let mut key = s3_root.timeline_root(id).prefix_in_bucket;
let delimiter = s3_root.delimiter();
if !key.ends_with(delimiter) {
key.push_str(delimiter);
}
key.push_str(&layer_name.file_name());
key.push_str(&format!(
"{}{}",
&layer_name.file_name(),
layer_gen.get_suffix()
));
key
}));
},
));
}
}
BlobDataParseResult::Incorrect(parse_errors) => result.errors.extend(
@@ -178,70 +213,97 @@ pub(crate) struct S3TimelineBlobData {
pub(crate) enum BlobDataParseResult {
Parsed {
index_part: IndexPart,
s3_layers: HashSet<LayerFileName>,
index_part_generation: Generation,
s3_layers: HashSet<(LayerFileName, Generation)>,
},
Incorrect(Vec<String>),
}
fn parse_layer_object_name(name: &str) -> Result<(LayerFileName, Generation), String> {
match name.rsplit_once('-') {
// FIXME: this is gross, just use a regex?
Some((layer_filename, gen)) if gen.len() == 8 => {
let layer = layer_filename.parse::<LayerFileName>()?;
let gen =
Generation::parse_suffix(gen).ok_or("Malformed generation suffix".to_string())?;
Ok((layer, gen))
}
_ => Ok((name.parse::<LayerFileName>()?, Generation::none())),
}
}
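
Editorial aside, not part of the diff: a self-contained sketch of the suffix-splitting convention that parse_layer_object_name relies on. The helper name and the sample keys below are hypothetical; only the rsplit_once-on-'-' check with an 8-character suffix mirrors the commit's logic.

// Hypothetical helper illustrating how an 8-character generation suffix is
// split off an object name; names without such a suffix are treated as legacy.
fn split_generation_suffix(name: &str) -> (&str, Option<&str>) {
    match name.rsplit_once('-') {
        Some((base, suffix)) if suffix.len() == 8 => (base, Some(suffix)),
        _ => (name, None),
    }
}

fn main() {
    // Made-up key names, for illustration only.
    assert_eq!(
        split_generation_suffix("somelayer-00000003"),
        ("somelayer", Some("00000003"))
    );
    assert_eq!(split_generation_suffix("legacylayer"), ("legacylayer", None));
}
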
pub(crate) async fn list_timeline_blobs(
s3_client: &Client,
id: TenantTimelineId,
s3_root: &RootTarget,
) -> anyhow::Result<S3TimelineBlobData> {
let mut s3_layers = HashSet::new();
let mut index_part_object = None;
let timeline_dir_target = s3_root.timeline_root(&id);
let mut continuation_token = None;
let mut errors = Vec::new();
let mut keys_to_remove = Vec::new();
loop {
let fetch_response =
list_objects_with_retries(s3_client, &timeline_dir_target, continuation_token.clone())
.await?;
let mut timeline_dir_target = s3_root.timeline_root(&id);
timeline_dir_target.delimiter = String::new();
let subdirectories = fetch_response.common_prefixes().unwrap_or_default();
if !subdirectories.is_empty() {
errors.push(format!(
"S3 list response should not contain any subdirectories, but got {subdirectories:?}"
));
}
let mut index_parts: Vec<ObjectIdentifier> = Vec::new();
for (object, key) in fetch_response
.contents()
.unwrap_or_default()
.iter()
.filter_map(|object| Some((object, object.key()?)))
{
let blob_name = key.strip_prefix(&timeline_dir_target.prefix_in_bucket);
match blob_name {
Some("index_part.json") => index_part_object = Some(object.clone()),
Some(maybe_layer_name) => match maybe_layer_name.parse::<LayerFileName>() {
Ok(new_layer) => {
s3_layers.insert(new_layer);
}
Err(e) => {
errors.push(
format!("S3 list response got an object with key {key} that is not a layer name: {e}"),
);
keys_to_remove.push(key.to_string());
}
},
None => {
errors.push(format!("S3 list response got an object with odd key {key}"));
let stream = stream_listing(s3_client, &timeline_dir_target);
pin_mut!(stream);
while let Some(obj) = stream.next().await {
let obj = obj?;
let key = match obj.key() {
Some(k) => k,
None => continue,
};
let blob_name = key.strip_prefix(&timeline_dir_target.prefix_in_bucket);
match blob_name {
Some(name) if name.starts_with("index_part.json") => {
tracing::info!("Index key {key}");
index_parts.push(obj)
}
Some(maybe_layer_name) => match parse_layer_object_name(maybe_layer_name) {
Ok((new_layer, gen)) => {
tracing::info!("Parsed layer key: {} {:?}", new_layer, gen);
s3_layers.insert((new_layer, gen));
}
Err(e) => {
tracing::info!("Error parsing key {maybe_layer_name}");
errors.push(
format!("S3 list response got an object with key {key} that is not a layer name: {e}"),
);
keys_to_remove.push(key.to_string());
}
},
None => {
tracing::info!("Peculiar key {}", key);
errors.push(format!("S3 list response got an object with odd key {key}"));
keys_to_remove.push(key.to_string());
}
}
match fetch_response.next_continuation_token {
Some(new_token) => continuation_token = Some(new_token),
None => break,
}
}
// Choose the index_part with the highest generation
let (index_part_object, index_part_generation) = match index_parts
.iter()
.filter_map(|k| {
let key = k.key().unwrap();
// Stripping the index key to the last part, because RemotePath doesn't
// like absolute paths, and depending on prefix_in_bucket it's possible
// for the keys we read back to start with a slash.
let basename = key.rsplit_once('/').unwrap().1;
parse_remote_index_path(RemotePath::from_string(basename).unwrap()).map(|g| (k, g))
})
.max_by_key(|i| i.1)
.map(|(k, g)| (k.clone(), g))
{
Some((key, gen)) => (Some(key), gen),
None => {
// Legacy/missing case: one or zero index parts, which did not have a generation
(index_parts.pop(), Generation::none())
}
};
if index_part_object.is_none() {
errors.push("S3 list response got no index_part.json file".to_string());
}
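
Another editorial sketch (hypothetical helper, made-up key names, not the commit's code): the "highest generation wins" selection above amounts to parsing the hex suffix of each index key and taking the maximum, falling back to generation 0 for a legacy index_part.json without a suffix.

// Hypothetical sketch of "choose the index_part with the highest generation".
fn pick_latest<'a>(keys: &[&'a str]) -> Option<(&'a str, u32)> {
    keys.iter()
        .map(|k| {
            let generation = k
                .rsplit_once('-')
                .and_then(|(_, suffix)| u32::from_str_radix(suffix, 16).ok())
                .unwrap_or(0);
            (*k, generation)
        })
        .max_by_key(|(_, generation)| *generation)
}

fn main() {
    let keys = [
        "index_part.json",
        "index_part.json-00000002",
        "index_part.json-00000005",
    ];
    assert_eq!(pick_latest(&keys), Some(("index_part.json-00000005", 5)));
}
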
@@ -261,6 +323,7 @@ pub(crate) async fn list_timeline_blobs(
return Ok(S3TimelineBlobData {
blob_data: BlobDataParseResult::Parsed {
index_part,
index_part_generation,
s3_layers,
},
keys_to_remove,

View File

@@ -5,6 +5,7 @@ use std::time::Duration;
use chrono::{DateTime, Utc};
use hex::FromHex;
use pageserver::tenant::Tenant;
use reqwest::{header, Client, StatusCode, Url};
use serde::Deserialize;
use tokio::sync::Semaphore;
@@ -118,13 +119,18 @@ fn from_nullable_id<'de, D>(deserializer: D) -> Result<TenantId, D::Error>
where
D: serde::de::Deserializer<'de>,
{
let id_str = String::deserialize(deserializer)?;
if id_str.is_empty() {
// This is a bogus value, but for the purposes of the scrubber all that
// matters is that it doesn't collide with any real IDs.
Ok(TenantId::from([0u8; 16]))
if deserializer.is_human_readable() {
let id_str = String::deserialize(deserializer)?;
if id_str.is_empty() {
// This is a bogus value, but for the purposes of the scrubber all that
// matters is that it doesn't collide with any real IDs.
Ok(TenantId::from([0u8; 16]))
} else {
TenantId::from_hex(&id_str).map_err(|e| serde::de::Error::custom(format!("{e}")))
}
} else {
TenantId::from_hex(&id_str).map_err(|e| serde::de::Error::custom(format!("{e}")))
let id_arr = <[u8; 16]>::deserialize(deserializer)?;
Ok(TenantId::from(id_arr))
}
}
@@ -153,7 +159,6 @@ pub struct ProjectData {
pub maintenance_set: Option<String>,
}
#[serde_with::serde_as]
#[derive(Debug, serde::Deserialize)]
pub struct BranchData {
pub id: BranchId,
@@ -161,12 +166,10 @@ pub struct BranchData {
pub updated_at: DateTime<Utc>,
pub name: String,
pub project_id: ProjectId,
#[serde_as(as = "serde_with::DisplayFromStr")]
pub timeline_id: TimelineId,
#[serde(default)]
pub parent_id: Option<BranchId>,
#[serde(default)]
#[serde_as(as = "Option<serde_with::DisplayFromStr>")]
pub parent_lsn: Option<Lsn>,
pub default: bool,
pub deleted: bool,

View File

@@ -34,6 +34,9 @@ const CLOUD_ADMIN_API_TOKEN_ENV_VAR: &str = "CLOUD_ADMIN_API_TOKEN";
#[derive(Debug, Clone)]
pub struct S3Target {
pub bucket_name: String,
/// This `prefix_in_bucket` is only equal to the PS/SK config of the same
/// name for the RootTarget: other instances of S3Target will have prefix_in_bucket
/// with extra parts.
pub prefix_in_bucket: String,
pub delimiter: String,
}
@@ -77,9 +80,13 @@ impl Display for NodeKind {
impl S3Target {
pub fn with_sub_segment(&self, new_segment: &str) -> Self {
let mut new_self = self.clone();
let _ = new_self.prefix_in_bucket.pop();
new_self.prefix_in_bucket =
[&new_self.prefix_in_bucket, new_segment, ""].join(&new_self.delimiter);
if new_self.prefix_in_bucket.is_empty() {
new_self.prefix_in_bucket = format!("/{}/", new_segment);
} else {
let _ = new_self.prefix_in_bucket.pop();
new_self.prefix_in_bucket =
[&new_self.prefix_in_bucket, new_segment, ""].join(&new_self.delimiter);
}
new_self
}
}
@@ -91,10 +98,10 @@ pub enum RootTarget {
}
impl RootTarget {
pub fn tenants_root(&self) -> &S3Target {
pub fn tenants_root(&self) -> S3Target {
match self {
Self::Pageserver(root) => root,
Self::Safekeeper(root) => root,
Self::Pageserver(root) => root.with_sub_segment(TENANTS_SEGMENT_NAME),
Self::Safekeeper(root) => root.with_sub_segment("wal"),
}
}
@@ -133,6 +140,7 @@ impl RootTarget {
pub struct BucketConfig {
pub region: String,
pub bucket: String,
pub prefix_in_bucket: Option<String>,
/// Use SSO if this is set, else rely on AWS_* environment vars
pub sso_account_id: Option<String>,
@@ -155,10 +163,12 @@ impl BucketConfig {
let sso_account_id = env::var("SSO_ACCOUNT_ID").ok();
let region = env::var("REGION").context("'REGION' param retrieval")?;
let bucket = env::var("BUCKET").context("'BUCKET' param retrieval")?;
let prefix_in_bucket = env::var("BUCKET_PREFIX").ok();
Ok(Self {
region,
bucket,
prefix_in_bucket,
sso_account_id,
})
}
@@ -191,14 +201,14 @@ pub fn init_logging(file_name: &str) -> WorkerGuard {
.with_target(false)
.with_ansi(false)
.with_writer(file_writer);
let stdout_logs = fmt::Layer::new()
.with_ansi(std::io::stdout().is_terminal())
let stderr_logs = fmt::Layer::new()
.with_ansi(std::io::stderr().is_terminal())
.with_target(false)
.with_writer(std::io::stdout);
.with_writer(std::io::stderr);
tracing_subscriber::registry()
.with(EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new("info")))
.with(file_logs)
.with(stdout_logs)
.with(stderr_logs)
.init();
guard
@@ -250,15 +260,20 @@ fn init_remote(
let bucket_region = Region::new(bucket_config.region);
let delimiter = "/".to_string();
let s3_client = Arc::new(init_s3_client(bucket_config.sso_account_id, bucket_region));
let s3_root = match node_kind {
NodeKind::Pageserver => RootTarget::Pageserver(S3Target {
bucket_name: bucket_config.bucket,
prefix_in_bucket: ["pageserver", "v1", TENANTS_SEGMENT_NAME, ""].join(&delimiter),
prefix_in_bucket: bucket_config
.prefix_in_bucket
.unwrap_or("pageserver/v1".to_string()),
delimiter,
}),
NodeKind::Safekeeper => RootTarget::Safekeeper(S3Target {
bucket_name: bucket_config.bucket,
prefix_in_bucket: ["safekeeper", "v1", "wal", ""].join(&delimiter),
prefix_in_bucket: bucket_config
.prefix_in_bucket
.unwrap_or("safekeeper/v1".to_string()),
delimiter,
}),
};

View File

@@ -31,7 +31,10 @@ enum Command {
#[arg(short, long, default_value_t = PurgeMode::DeletedOnly)]
mode: PurgeMode,
},
ScanMetadata {},
ScanMetadata {
#[arg(short, long, default_value_t = false)]
json: bool,
},
}
#[tokio::main]
@@ -54,13 +57,17 @@ async fn main() -> anyhow::Result<()> {
));
match cli.command {
Command::ScanMetadata {} => match scan_metadata(bucket_config).await {
Command::ScanMetadata { json } => match scan_metadata(bucket_config).await {
Err(e) => {
tracing::error!("Failed: {e}");
Err(e)
}
Ok(summary) => {
println!("{}", summary.summary_string());
if json {
println!("{}", serde_json::to_string(&summary).unwrap())
} else {
println!("{}", summary.summary_string());
}
if summary.is_fatal() {
Err(anyhow::anyhow!("Fatal scrub errors detected"))
} else {

View File

@@ -13,10 +13,10 @@ pub fn stream_tenants<'a>(
) -> impl Stream<Item = anyhow::Result<TenantId>> + 'a {
try_stream! {
let mut continuation_token = None;
let tenants_target = target.tenants_root();
loop {
let tenants_target = target.tenants_root();
let fetch_response =
list_objects_with_retries(s3_client, tenants_target, continuation_token.clone()).await?;
list_objects_with_retries(s3_client, &tenants_target, continuation_token.clone()).await?;
let new_entry_ids = fetch_response
.common_prefixes()

View File

@@ -10,8 +10,10 @@ use aws_sdk_s3::Client;
use futures_util::{pin_mut, StreamExt, TryStreamExt};
use histogram::Histogram;
use pageserver::tenant::IndexPart;
use serde::Serialize;
use utils::id::TenantTimelineId;
#[derive(Serialize)]
pub struct MetadataSummary {
count: usize,
with_errors: HashSet<TenantTimelineId>,
@@ -25,7 +27,9 @@ pub struct MetadataSummary {
}
/// A histogram plus minimum and maximum tracking
#[derive(Serialize)]
struct MinMaxHisto {
#[serde(skip)]
histo: Histogram,
min: u64,
max: u64,
@@ -109,6 +113,7 @@ impl MetadataSummary {
self.count += 1;
if let BlobDataParseResult::Parsed {
index_part,
index_part_generation: _,
s3_layers: _,
} = &data.blob_data
{

View File

@@ -47,6 +47,7 @@ pq_proto.workspace = true
remote_storage.workspace = true
safekeeper_api.workspace = true
storage_broker.workspace = true
tokio-stream.workspace = true
utils.workspace = true
workspace_hack.workspace = true

View File

@@ -13,7 +13,7 @@ use utils::{
};
/// Persistent consensus state of the acceptor.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
struct AcceptorStateV1 {
/// acceptor's last term it voted for (advanced in 1 phase)
term: Term,
@@ -21,7 +21,7 @@ struct AcceptorStateV1 {
epoch: Term,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
struct SafeKeeperStateV1 {
/// persistent acceptor state
acceptor_state: AcceptorStateV1,
@@ -50,7 +50,7 @@ pub struct ServerInfoV2 {
pub wal_seg_size: u32,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct SafeKeeperStateV2 {
/// persistent acceptor state
pub acceptor_state: AcceptorState,
@@ -81,7 +81,7 @@ pub struct ServerInfoV3 {
pub wal_seg_size: u32,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct SafeKeeperStateV3 {
/// persistent acceptor state
pub acceptor_state: AcceptorState,
@@ -101,7 +101,7 @@ pub struct SafeKeeperStateV3 {
pub wal_start_lsn: Lsn,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct SafeKeeperStateV4 {
#[serde(with = "hex")]
pub tenant_id: TenantId,
@@ -264,3 +264,245 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<SafeKeeperState>
}
bail!("unsupported safekeeper control file version {}", version)
}
#[cfg(test)]
mod tests {
use std::str::FromStr;
use utils::{id::NodeId, Hex};
use crate::safekeeper::PersistedPeerInfo;
use super::*;
#[test]
fn roundtrip_v1() {
let tenant_id = TenantId::from_str("cf0480929707ee75372337efaa5ecf96").unwrap();
let timeline_id = TimelineId::from_str("112ded66422aa5e953e5440fa5427ac4").unwrap();
let state = SafeKeeperStateV1 {
acceptor_state: AcceptorStateV1 {
term: 42,
epoch: 43,
},
server: ServerInfoV2 {
pg_version: 14,
system_id: 0x1234567887654321,
tenant_id,
timeline_id,
wal_seg_size: 0x12345678,
},
proposer_uuid: {
let mut arr = timeline_id.as_arr();
arr.reverse();
arr
},
commit_lsn: Lsn(1234567800),
truncate_lsn: Lsn(123456780),
wal_start_lsn: Lsn(1234567800 - 8),
};
let ser = state.ser().unwrap();
#[rustfmt::skip]
let expected = [
// term
0x2a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
// epoch
0x2b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
// pg_version
0x0e, 0x00, 0x00, 0x00,
// system_id
0x21, 0x43, 0x65, 0x87, 0x78, 0x56, 0x34, 0x12,
// tenant_id
0xcf, 0x04, 0x80, 0x92, 0x97, 0x07, 0xee, 0x75, 0x37, 0x23, 0x37, 0xef, 0xaa, 0x5e, 0xcf, 0x96,
// timeline_id
0x11, 0x2d, 0xed, 0x66, 0x42, 0x2a, 0xa5, 0xe9, 0x53, 0xe5, 0x44, 0x0f, 0xa5, 0x42, 0x7a, 0xc4,
// wal_seg_size
0x78, 0x56, 0x34, 0x12,
// proposer_uuid
0xc4, 0x7a, 0x42, 0xa5, 0x0f, 0x44, 0xe5, 0x53, 0xe9, 0xa5, 0x2a, 0x42, 0x66, 0xed, 0x2d, 0x11,
// commit_lsn
0x78, 0x02, 0x96, 0x49, 0x00, 0x00, 0x00, 0x00,
// truncate_lsn
0x0c, 0xcd, 0x5b, 0x07, 0x00, 0x00, 0x00, 0x00,
// wal_start_lsn
0x70, 0x02, 0x96, 0x49, 0x00, 0x00, 0x00, 0x00,
];
assert_eq!(Hex(&ser), Hex(&expected));
let deser = SafeKeeperStateV1::des(&ser).unwrap();
assert_eq!(state, deser);
}
#[test]
fn roundtrip_v2() {
let tenant_id = TenantId::from_str("cf0480929707ee75372337efaa5ecf96").unwrap();
let timeline_id = TimelineId::from_str("112ded66422aa5e953e5440fa5427ac4").unwrap();
let state = SafeKeeperStateV2 {
acceptor_state: AcceptorState {
term: 42,
term_history: TermHistory(vec![TermLsn {
lsn: Lsn(0x1),
term: 41,
}]),
},
server: ServerInfoV2 {
pg_version: 14,
system_id: 0x1234567887654321,
tenant_id,
timeline_id,
wal_seg_size: 0x12345678,
},
proposer_uuid: {
let mut arr = timeline_id.as_arr();
arr.reverse();
arr
},
commit_lsn: Lsn(1234567800),
truncate_lsn: Lsn(123456780),
wal_start_lsn: Lsn(1234567800 - 8),
};
let ser = state.ser().unwrap();
let expected = [
0x2a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x29, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x21, 0x43, 0x65, 0x87, 0x78, 0x56,
0x34, 0x12, 0xcf, 0x04, 0x80, 0x92, 0x97, 0x07, 0xee, 0x75, 0x37, 0x23, 0x37, 0xef,
0xaa, 0x5e, 0xcf, 0x96, 0x11, 0x2d, 0xed, 0x66, 0x42, 0x2a, 0xa5, 0xe9, 0x53, 0xe5,
0x44, 0x0f, 0xa5, 0x42, 0x7a, 0xc4, 0x78, 0x56, 0x34, 0x12, 0xc4, 0x7a, 0x42, 0xa5,
0x0f, 0x44, 0xe5, 0x53, 0xe9, 0xa5, 0x2a, 0x42, 0x66, 0xed, 0x2d, 0x11, 0x78, 0x02,
0x96, 0x49, 0x00, 0x00, 0x00, 0x00, 0x0c, 0xcd, 0x5b, 0x07, 0x00, 0x00, 0x00, 0x00,
0x70, 0x02, 0x96, 0x49, 0x00, 0x00, 0x00, 0x00,
];
assert_eq!(Hex(&ser), Hex(&expected));
let deser = SafeKeeperStateV2::des(&ser).unwrap();
assert_eq!(state, deser);
}
#[test]
fn roundtrip_v3() {
let tenant_id = TenantId::from_str("cf0480929707ee75372337efaa5ecf96").unwrap();
let timeline_id = TimelineId::from_str("112ded66422aa5e953e5440fa5427ac4").unwrap();
let state = SafeKeeperStateV3 {
acceptor_state: AcceptorState {
term: 42,
term_history: TermHistory(vec![TermLsn {
lsn: Lsn(0x1),
term: 41,
}]),
},
server: ServerInfoV3 {
pg_version: 14,
system_id: 0x1234567887654321,
tenant_id,
timeline_id,
wal_seg_size: 0x12345678,
},
proposer_uuid: {
let mut arr = timeline_id.as_arr();
arr.reverse();
arr
},
commit_lsn: Lsn(1234567800),
truncate_lsn: Lsn(123456780),
wal_start_lsn: Lsn(1234567800 - 8),
};
let ser = state.ser().unwrap();
let expected = [
0x2a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x29, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x21, 0x43, 0x65, 0x87, 0x78, 0x56,
0x34, 0x12, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x63, 0x66, 0x30, 0x34,
0x38, 0x30, 0x39, 0x32, 0x39, 0x37, 0x30, 0x37, 0x65, 0x65, 0x37, 0x35, 0x33, 0x37,
0x32, 0x33, 0x33, 0x37, 0x65, 0x66, 0x61, 0x61, 0x35, 0x65, 0x63, 0x66, 0x39, 0x36,
0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x31, 0x31, 0x32, 0x64, 0x65, 0x64,
0x36, 0x36, 0x34, 0x32, 0x32, 0x61, 0x61, 0x35, 0x65, 0x39, 0x35, 0x33, 0x65, 0x35,
0x34, 0x34, 0x30, 0x66, 0x61, 0x35, 0x34, 0x32, 0x37, 0x61, 0x63, 0x34, 0x78, 0x56,
0x34, 0x12, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x63, 0x34, 0x37, 0x61,
0x34, 0x32, 0x61, 0x35, 0x30, 0x66, 0x34, 0x34, 0x65, 0x35, 0x35, 0x33, 0x65, 0x39,
0x61, 0x35, 0x32, 0x61, 0x34, 0x32, 0x36, 0x36, 0x65, 0x64, 0x32, 0x64, 0x31, 0x31,
0x78, 0x02, 0x96, 0x49, 0x00, 0x00, 0x00, 0x00, 0x0c, 0xcd, 0x5b, 0x07, 0x00, 0x00,
0x00, 0x00, 0x70, 0x02, 0x96, 0x49, 0x00, 0x00, 0x00, 0x00,
];
assert_eq!(Hex(&ser), Hex(&expected));
let deser = SafeKeeperStateV3::des(&ser).unwrap();
assert_eq!(state, deser);
}
#[test]
fn roundtrip_v4() {
let tenant_id = TenantId::from_str("cf0480929707ee75372337efaa5ecf96").unwrap();
let timeline_id = TimelineId::from_str("112ded66422aa5e953e5440fa5427ac4").unwrap();
let state = SafeKeeperStateV4 {
tenant_id,
timeline_id,
acceptor_state: AcceptorState {
term: 42,
term_history: TermHistory(vec![TermLsn {
lsn: Lsn(0x1),
term: 41,
}]),
},
server: ServerInfo {
pg_version: 14,
system_id: 0x1234567887654321,
wal_seg_size: 0x12345678,
},
proposer_uuid: {
let mut arr = timeline_id.as_arr();
arr.reverse();
arr
},
peers: PersistedPeers(vec![(
NodeId(1),
PersistedPeerInfo {
backup_lsn: Lsn(1234567000),
term: 42,
flush_lsn: Lsn(1234567800 - 8),
commit_lsn: Lsn(1234567600),
},
)]),
commit_lsn: Lsn(1234567800),
s3_wal_lsn: Lsn(1234567300),
peer_horizon_lsn: Lsn(9999999),
remote_consistent_lsn: Lsn(1234560000),
};
let ser = state.ser().unwrap();
let expected = [
0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x63, 0x66, 0x30, 0x34, 0x38, 0x30,
0x39, 0x32, 0x39, 0x37, 0x30, 0x37, 0x65, 0x65, 0x37, 0x35, 0x33, 0x37, 0x32, 0x33,
0x33, 0x37, 0x65, 0x66, 0x61, 0x61, 0x35, 0x65, 0x63, 0x66, 0x39, 0x36, 0x20, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x31, 0x31, 0x32, 0x64, 0x65, 0x64, 0x36, 0x36,
0x34, 0x32, 0x32, 0x61, 0x61, 0x35, 0x65, 0x39, 0x35, 0x33, 0x65, 0x35, 0x34, 0x34,
0x30, 0x66, 0x61, 0x35, 0x34, 0x32, 0x37, 0x61, 0x63, 0x34, 0x2a, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x29, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x0e, 0x00, 0x00, 0x00, 0x21, 0x43, 0x65, 0x87, 0x78, 0x56, 0x34, 0x12, 0x78, 0x56,
0x34, 0x12, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x63, 0x34, 0x37, 0x61,
0x34, 0x32, 0x61, 0x35, 0x30, 0x66, 0x34, 0x34, 0x65, 0x35, 0x35, 0x33, 0x65, 0x39,
0x61, 0x35, 0x32, 0x61, 0x34, 0x32, 0x36, 0x36, 0x65, 0x64, 0x32, 0x64, 0x31, 0x31,
0x78, 0x02, 0x96, 0x49, 0x00, 0x00, 0x00, 0x00, 0x84, 0x00, 0x96, 0x49, 0x00, 0x00,
0x00, 0x00, 0x7f, 0x96, 0x98, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xe4, 0x95, 0x49,
0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x58, 0xff, 0x95, 0x49, 0x00, 0x00, 0x00, 0x00,
0x2a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x70, 0x02, 0x96, 0x49, 0x00, 0x00,
0x00, 0x00, 0xb0, 0x01, 0x96, 0x49, 0x00, 0x00, 0x00, 0x00,
];
assert_eq!(Hex(&ser), Hex(&expected));
let deser = SafeKeeperStateV4::des(&ser).unwrap();
assert_eq!(state, deser);
}
}

View File

@@ -5,6 +5,7 @@ use std::fs::DirEntry;
use std::io::BufReader;
use std::io::Read;
use std::path::PathBuf;
use std::sync::Arc;
use anyhow::Result;
use camino::Utf8Path;
@@ -13,7 +14,6 @@ use postgres_ffi::XLogSegNo;
use serde::Deserialize;
use serde::Serialize;
use serde_with::{serde_as, DisplayFromStr};
use utils::id::NodeId;
use utils::id::TenantTimelineId;
use utils::id::{TenantId, TimelineId};
@@ -28,7 +28,7 @@ use crate::send_wal::WalSenderState;
use crate::GlobalTimelines;
/// Various filters that influence the resulting JSON output.
#[derive(Debug, Serialize, Deserialize)]
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct Args {
/// Dump all available safekeeper state. False by default.
pub dump_all: bool,
@@ -53,15 +53,76 @@ pub struct Args {
}
/// Response for debug dump request.
#[derive(Debug, Serialize, Deserialize)]
#[derive(Debug, Serialize)]
pub struct Response {
pub start_time: DateTime<Utc>,
pub finish_time: DateTime<Utc>,
pub timelines: Vec<Timeline>,
pub timelines: Vec<TimelineDumpSer>,
pub timelines_count: usize,
pub config: Config,
}
pub struct TimelineDumpSer {
pub tli: Arc<crate::timeline::Timeline>,
pub args: Args,
pub runtime: Arc<tokio::runtime::Runtime>,
}
impl std::fmt::Debug for TimelineDumpSer {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("TimelineDumpSer")
.field("tli", &self.tli.ttid)
.field("args", &self.args)
.finish()
}
}
impl Serialize for TimelineDumpSer {
fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
let dump = self
.runtime
.block_on(build_from_tli_dump(self.tli.clone(), self.args.clone()));
dump.serialize(serializer)
}
}
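
Editorial sketch of the pattern this Serialize impl introduces, with placeholder types: the expensive dump is built lazily inside serialize() by blocking on a dedicated current-thread runtime. This is only safe because the real serialization is driven from a blocking thread (see the spawn_blocking handler further below), never from an async worker thread. LazyDump and its build() method are assumptions for illustration, not the actual safekeeper types.

use std::sync::Arc;

use serde::{Serialize, Serializer};

// Placeholder type standing in for the real timeline dump.
struct LazyDump {
    runtime: Arc<tokio::runtime::Runtime>,
}

impl LazyDump {
    async fn build(&self) -> Vec<String> {
        // Stand-in for the async work that assembles the dump.
        vec!["timeline".to_string()]
    }
}

impl Serialize for LazyDump {
    fn serialize<S: Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
        // Build the dump only when it is actually serialized.
        let dump = self.runtime.block_on(self.build());
        dump.serialize(serializer)
    }
}

fn main() {
    let runtime = Arc::new(
        tokio::runtime::Builder::new_current_thread()
            .build()
            .unwrap(),
    );
    let dump = LazyDump { runtime };
    println!("{}", serde_json::to_string(&dump).unwrap());
}
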
async fn build_from_tli_dump(timeline: Arc<crate::timeline::Timeline>, args: Args) -> Timeline {
let control_file = if args.dump_control_file {
let mut state = timeline.get_state().await.1;
if !args.dump_term_history {
state.acceptor_state.term_history = TermHistory(vec![]);
}
Some(state)
} else {
None
};
let memory = if args.dump_memory {
Some(timeline.memory_dump().await)
} else {
None
};
let disk_content = if args.dump_disk_content {
// build_disk_content can fail, but we don't want to fail the whole
// request because of that.
build_disk_content(&timeline.timeline_dir).ok()
} else {
None
};
Timeline {
tenant_id: timeline.ttid.tenant_id,
timeline_id: timeline.ttid.timeline_id,
control_file,
memory,
disk_content,
}
}
/// Safekeeper configuration.
#[derive(Debug, Serialize, Deserialize)]
pub struct Config {
@@ -74,12 +135,9 @@ pub struct Config {
pub wal_backup_enabled: bool,
}
#[serde_as]
#[derive(Debug, Serialize, Deserialize)]
pub struct Timeline {
#[serde_as(as = "DisplayFromStr")]
pub tenant_id: TenantId,
#[serde_as(as = "DisplayFromStr")]
pub timeline_id: TimelineId,
pub control_file: Option<SafeKeeperState>,
pub memory: Option<Memory>,
@@ -140,8 +198,12 @@ pub async fn build(args: Args) -> Result<Response> {
GlobalTimelines::get_all()
};
// TODO: return Stream instead of Vec
let mut timelines = Vec::new();
let runtime = Arc::new(
tokio::runtime::Builder::new_current_thread()
.build()
.unwrap(),
);
for tli in ptrs_snapshot {
let ttid = tli.ttid;
if let Some(tenant_id) = args.tenant_id {
@@ -155,38 +217,11 @@ pub async fn build(args: Args) -> Result<Response> {
}
}
let control_file = if args.dump_control_file {
let mut state = tli.get_state().await.1;
if !args.dump_term_history {
state.acceptor_state.term_history = TermHistory(vec![]);
}
Some(state)
} else {
None
};
let memory = if args.dump_memory {
Some(tli.memory_dump().await)
} else {
None
};
let disk_content = if args.dump_disk_content {
// build_disk_content can fail, but we don't want to fail the whole
// request because of that.
build_disk_content(&tli.timeline_dir).ok()
} else {
None
};
let timeline = Timeline {
tenant_id: ttid.tenant_id,
timeline_id: ttid.timeline_id,
control_file,
memory,
disk_content,
};
timelines.push(timeline);
timelines.push(TimelineDumpSer {
tli,
args: args.clone(),
runtime: runtime.clone(),
});
}
let config = GlobalTimelines::get_global_config();

View File

@@ -86,6 +86,41 @@ paths:
default:
$ref: "#/components/responses/GenericError"
/v1/tenant/{tenant_id}/timeline/{source_timeline_id}/copy:
parameters:
- name: tenant_id
in: path
required: true
schema:
type: string
format: hex
- name: source_timeline_id
in: path
required: true
schema:
type: string
format: hex
post:
tags:
- "Timeline"
summary: Register new timeline as copy of existing timeline
description: ""
operationId: v1CopyTenantTimeline
requestBody:
content:
application/json:
schema:
$ref: "#/components/schemas/TimelineCopyRequest"
responses:
"201":
description: Timeline created
# TODO: return timeline info?
"403":
$ref: "#/components/responses/ForbiddenError"
default:
$ref: "#/components/responses/GenericError"
/v1/tenant/{tenant_id}/timeline/{timeline_id}:
parameters:
@@ -210,6 +245,18 @@ components:
type: integer
minimum: 0
TimelineCopyRequest:
type: object
required:
- target_timeline_id
- until_lsn
properties:
target_timeline_id:
type: string
format: hex
until_lsn:
type: string
SkTimelineInfo:
type: object
required:

View File

@@ -4,7 +4,6 @@ use once_cell::sync::Lazy;
use postgres_ffi::WAL_SEGMENT_SIZE;
use safekeeper_api::models::SkTimelineInfo;
use serde::{Deserialize, Serialize};
use serde_with::{serde_as, DisplayFromStr};
use std::collections::{HashMap, HashSet};
use std::fmt;
use std::str::FromStr;
@@ -13,7 +12,12 @@ use storage_broker::proto::SafekeeperTimelineInfo;
use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId;
use tokio::fs::File;
use tokio::io::AsyncReadExt;
use utils::http::endpoint::request_span;
use std::io::Write as _;
use tokio::sync::mpsc;
use tokio_stream::wrappers::ReceiverStream;
use tracing::info_span;
use utils::http::endpoint::{request_span, ChannelWriter};
use crate::receive_wal::WalReceiverState;
use crate::safekeeper::Term;
@@ -62,11 +66,9 @@ fn get_conf(request: &Request<Body>) -> &SafeKeeperConf {
/// Same as TermLsn, but serializes LSN using display serializer
/// in Postgres format, i.e. 0/FFFFFFFF. Used only for the API response.
#[serde_as]
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
pub struct TermSwitchApiEntry {
pub term: Term,
#[serde_as(as = "DisplayFromStr")]
pub lsn: Lsn,
}
@@ -88,28 +90,18 @@ pub struct AcceptorStateStatus {
}
/// Info about timeline on safekeeper ready for reporting.
#[serde_as]
#[derive(Debug, Serialize, Deserialize)]
pub struct TimelineStatus {
#[serde_as(as = "DisplayFromStr")]
pub tenant_id: TenantId,
#[serde_as(as = "DisplayFromStr")]
pub timeline_id: TimelineId,
pub acceptor_state: AcceptorStateStatus,
pub pg_info: ServerInfo,
#[serde_as(as = "DisplayFromStr")]
pub flush_lsn: Lsn,
#[serde_as(as = "DisplayFromStr")]
pub timeline_start_lsn: Lsn,
#[serde_as(as = "DisplayFromStr")]
pub local_start_lsn: Lsn,
#[serde_as(as = "DisplayFromStr")]
pub commit_lsn: Lsn,
#[serde_as(as = "DisplayFromStr")]
pub backup_lsn: Lsn,
#[serde_as(as = "DisplayFromStr")]
pub peer_horizon_lsn: Lsn,
#[serde_as(as = "DisplayFromStr")]
pub remote_consistent_lsn: Lsn,
pub peers: Vec<PeerInfo>,
pub walsenders: Vec<WalSenderState>,
@@ -373,8 +365,52 @@ async fn dump_debug_handler(mut request: Request<Body>) -> Result<Response<Body>
.await
.map_err(ApiError::InternalServerError)?;
// TODO: use streaming response
json_response(StatusCode::OK, resp)
let started_at = std::time::Instant::now();
let (tx, rx) = mpsc::channel(1);
let body = Body::wrap_stream(ReceiverStream::new(rx));
let mut writer = ChannelWriter::new(128 * 1024, tx);
let response = Response::builder()
.status(200)
.header(hyper::header::CONTENT_TYPE, "application/octet-stream")
.body(body)
.unwrap();
let span = info_span!("blocking");
tokio::task::spawn_blocking(move || {
let _span = span.entered();
let res = serde_json::to_writer(&mut writer, &resp)
.map_err(std::io::Error::from)
.and_then(|_| writer.flush());
match res {
Ok(()) => {
tracing::info!(
bytes = writer.flushed_bytes(),
elapsed_ms = started_at.elapsed().as_millis(),
"responded /v1/debug_dump"
);
}
Err(e) => {
tracing::warn!("failed to write out /v1/debug_dump response: {e:#}");
// semantics of this error are quite... unclear. we want to error the stream out to
// abort the response to somehow notify the client that we failed.
//
// though, most likely the reason for failure is that the receiver is already gone.
drop(
writer
.tx
.blocking_send(Err(std::io::ErrorKind::BrokenPipe.into())),
);
}
}
});
Ok(response)
}
/// Safekeeper http router.

View File

@@ -44,8 +44,11 @@ pub struct AppendLogicalMessage {
// fields from AppendRequestHeader
pub term: Term,
#[serde(with = "utils::lsn::serde_as_u64")]
pub epoch_start_lsn: Lsn,
#[serde(with = "utils::lsn::serde_as_u64")]
pub begin_lsn: Lsn,
#[serde(with = "utils::lsn::serde_as_u64")]
pub truncate_lsn: Lsn,
pub pg_version: u32,
}
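
Editorial sketch of what a `#[serde(with = ...)]` module such as the `utils::lsn::serde_as_u64` referenced above is expected to provide: paired `serialize`/`deserialize` functions for the field type. The Lsn newtype and Msg struct here are assumptions for illustration, not the actual utils code.

use serde::{Deserialize, Deserializer, Serialize, Serializer};

// Assumed shape of the LSN newtype, for illustration only.
pub struct Lsn(pub u64);

// A `with`-module must expose `serialize` and `deserialize` with these shapes.
pub mod serde_as_u64 {
    use super::*;

    pub fn serialize<S: Serializer>(lsn: &Lsn, serializer: S) -> Result<S::Ok, S::Error> {
        lsn.0.serialize(serializer)
    }

    pub fn deserialize<'de, D: Deserializer<'de>>(deserializer: D) -> Result<Lsn, D::Error> {
        u64::deserialize(deserializer).map(Lsn)
    }
}

#[derive(Serialize, Deserialize)]
struct Msg {
    #[serde(with = "serde_as_u64")]
    begin_lsn: Lsn,
}

fn main() {
    let json = serde_json::to_string(&Msg { begin_lsn: Lsn(42) }).unwrap();
    assert_eq!(json, r#"{"begin_lsn":42}"#);
}
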

View File

@@ -1,3 +1,4 @@
use chrono::{DateTime, Utc};
use serde::{Deserialize, Serialize};
use anyhow::{bail, Context, Result};
@@ -5,8 +6,6 @@ use tokio::io::AsyncWriteExt;
use tracing::info;
use utils::id::{TenantId, TenantTimelineId, TimelineId};
use serde_with::{serde_as, DisplayFromStr};
use crate::{
control_file, debug_dump,
http::routes::TimelineStatus,
@@ -15,12 +14,9 @@ use crate::{
};
/// Info about timeline on safekeeper ready for reporting.
#[serde_as]
#[derive(Debug, Serialize, Deserialize)]
pub struct Request {
#[serde_as(as = "DisplayFromStr")]
pub tenant_id: TenantId,
#[serde_as(as = "DisplayFromStr")]
pub timeline_id: TimelineId,
pub http_hosts: Vec<String>,
}
@@ -32,6 +28,16 @@ pub struct Response {
// TODO: add more fields?
}
/// Response for debug dump request.
#[derive(Debug, Serialize, Deserialize)]
pub struct DebugDumpResponse {
pub start_time: DateTime<Utc>,
pub finish_time: DateTime<Utc>,
pub timelines: Vec<debug_dump::Timeline>,
pub timelines_count: usize,
pub config: debug_dump::Config,
}
/// Find the most advanced safekeeper and pull timeline from it.
pub async fn handle_request(request: Request) -> Result<Response> {
let existing_tli = GlobalTimelines::get(TenantTimelineId::new(
@@ -103,7 +109,7 @@ async fn pull_timeline(status: TimelineStatus, host: String) -> Result<Response>
// Implementing our own scp over HTTP.
// First, we need to fetch the list of files from the safekeeper.
let dump: debug_dump::Response = client
let dump: DebugDumpResponse = client
.get(format!(
"{}/v1/debug_dump?dump_all=true&tenant_id={}&timeline_id={}",
host, status.tenant_id, status.timeline_id

View File

@@ -52,7 +52,7 @@ impl From<(Term, Lsn)> for TermLsn {
}
}
#[derive(Clone, Serialize, Deserialize)]
#[derive(Clone, Serialize, Deserialize, PartialEq)]
pub struct TermHistory(pub Vec<TermLsn>);
impl TermHistory {
@@ -178,7 +178,7 @@ impl fmt::Debug for TermHistory {
pub type PgUuid = [u8; 16];
/// Persistent consensus state of the acceptor.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct AcceptorState {
/// acceptor's last term it voted for (advanced in 1 phase)
pub term: Term,
@@ -209,16 +209,16 @@ pub struct ServerInfo {
pub wal_seg_size: u32,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct PersistedPeerInfo {
/// LSN up to which safekeeper offloaded WAL to s3.
backup_lsn: Lsn,
pub backup_lsn: Lsn,
/// Term of the last entry.
term: Term,
pub term: Term,
/// LSN of the last record.
flush_lsn: Lsn,
pub flush_lsn: Lsn,
/// Up to which LSN safekeeper regards its WAL as committed.
commit_lsn: Lsn,
pub commit_lsn: Lsn,
}
impl PersistedPeerInfo {
@@ -232,12 +232,12 @@ impl PersistedPeerInfo {
}
}
#[derive(Debug, Clone, Serialize, Deserialize)]
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct PersistedPeers(pub Vec<(NodeId, PersistedPeerInfo)>);
/// Persistent information stored on safekeeper node
/// On disk data is prefixed by magic and format version and followed by checksum.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct SafeKeeperState {
#[serde(with = "hex")]
pub tenant_id: TenantId,
@@ -1096,7 +1096,7 @@ mod tests {
use super::*;
use crate::wal_storage::Storage;
use std::{ops::Deref, time::Instant};
use std::{ops::Deref, str::FromStr, time::Instant};
// fake storage for tests
struct InMemoryState {
@@ -1314,4 +1314,98 @@ mod tests {
})
);
}
#[test]
fn test_sk_state_bincode_serde_roundtrip() {
use utils::Hex;
let tenant_id = TenantId::from_str("cf0480929707ee75372337efaa5ecf96").unwrap();
let timeline_id = TimelineId::from_str("112ded66422aa5e953e5440fa5427ac4").unwrap();
let state = SafeKeeperState {
tenant_id,
timeline_id,
acceptor_state: AcceptorState {
term: 42,
term_history: TermHistory(vec![TermLsn {
lsn: Lsn(0x1),
term: 41,
}]),
},
server: ServerInfo {
pg_version: 14,
system_id: 0x1234567887654321,
wal_seg_size: 0x12345678,
},
proposer_uuid: {
let mut arr = timeline_id.as_arr();
arr.reverse();
arr
},
timeline_start_lsn: Lsn(0x12345600),
local_start_lsn: Lsn(0x12),
commit_lsn: Lsn(1234567800),
backup_lsn: Lsn(1234567300),
peer_horizon_lsn: Lsn(9999999),
remote_consistent_lsn: Lsn(1234560000),
peers: PersistedPeers(vec![(
NodeId(1),
PersistedPeerInfo {
backup_lsn: Lsn(1234567000),
term: 42,
flush_lsn: Lsn(1234567800 - 8),
commit_lsn: Lsn(1234567600),
},
)]),
};
let ser = state.ser().unwrap();
#[rustfmt::skip]
let expected = [
// tenant_id as length prefixed hex
0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x63, 0x66, 0x30, 0x34, 0x38, 0x30, 0x39, 0x32, 0x39, 0x37, 0x30, 0x37, 0x65, 0x65, 0x37, 0x35, 0x33, 0x37, 0x32, 0x33, 0x33, 0x37, 0x65, 0x66, 0x61, 0x61, 0x35, 0x65, 0x63, 0x66, 0x39, 0x36,
// timeline_id as length prefixed hex
0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x31, 0x31, 0x32, 0x64, 0x65, 0x64, 0x36, 0x36, 0x34, 0x32, 0x32, 0x61, 0x61, 0x35, 0x65, 0x39, 0x35, 0x33, 0x65, 0x35, 0x34, 0x34, 0x30, 0x66, 0x61, 0x35, 0x34, 0x32, 0x37, 0x61, 0x63, 0x34,
// term
0x2a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
// length prefix
0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
// unsure why this order is swapped
0x29, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
// pg_version
0x0e, 0x00, 0x00, 0x00,
// systemid
0x21, 0x43, 0x65, 0x87, 0x78, 0x56, 0x34, 0x12,
// wal_seg_size
0x78, 0x56, 0x34, 0x12,
// pguuid as length prefixed hex
0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x63, 0x34, 0x37, 0x61, 0x34, 0x32, 0x61, 0x35, 0x30, 0x66, 0x34, 0x34, 0x65, 0x35, 0x35, 0x33, 0x65, 0x39, 0x61, 0x35, 0x32, 0x61, 0x34, 0x32, 0x36, 0x36, 0x65, 0x64, 0x32, 0x64, 0x31, 0x31,
// timeline_start_lsn
0x00, 0x56, 0x34, 0x12, 0x00, 0x00, 0x00, 0x00,
0x12, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x78, 0x02, 0x96, 0x49, 0x00, 0x00, 0x00, 0x00,
0x84, 0x00, 0x96, 0x49, 0x00, 0x00, 0x00, 0x00,
0x7f, 0x96, 0x98, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0xe4, 0x95, 0x49, 0x00, 0x00, 0x00, 0x00,
// length prefix for persistentpeers
0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
// nodeid
0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
// backuplsn
0x58, 0xff, 0x95, 0x49, 0x00, 0x00, 0x00, 0x00,
0x2a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x70, 0x02, 0x96, 0x49, 0x00, 0x00, 0x00, 0x00,
0xb0, 0x01, 0x96, 0x49, 0x00, 0x00, 0x00, 0x00,
];
assert_eq!(Hex(&ser), Hex(&expected));
let deser = SafeKeeperState::des(&ser).unwrap();
assert_eq!(deser, state);
}
}

View File

@@ -16,7 +16,6 @@ use postgres_ffi::get_current_timestamp;
use postgres_ffi::{TimestampTz, MAX_SEND_SIZE};
use pq_proto::{BeMessage, WalSndKeepAlive, XLogDataBody};
use serde::{Deserialize, Serialize};
use serde_with::{serde_as, DisplayFromStr};
use tokio::io::{AsyncRead, AsyncWrite};
use utils::id::TenantTimelineId;
use utils::lsn::AtomicLsn;
@@ -313,10 +312,8 @@ impl WalSendersShared {
}
// Serialized is used only for pretty printing in json.
#[serde_as]
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct WalSenderState {
#[serde_as(as = "DisplayFromStr")]
ttid: TenantTimelineId,
addr: SocketAddr,
conn_id: ConnectionId,

View File

@@ -5,10 +5,8 @@ use anyhow::{anyhow, bail, Result};
use camino::Utf8PathBuf;
use postgres_ffi::XLogSegNo;
use serde::{Deserialize, Serialize};
use serde_with::serde_as;
use tokio::fs;
use serde_with::DisplayFromStr;
use std::cmp::max;
use std::sync::Arc;
use std::time::Duration;
@@ -42,7 +40,6 @@ use crate::SafeKeeperConf;
use crate::{debug_dump, wal_storage};
/// Things safekeeper should know about timeline state on peers.
#[serde_as]
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PeerInfo {
pub sk_id: NodeId,
@@ -50,13 +47,10 @@ pub struct PeerInfo {
/// Term of the last entry.
pub last_log_term: Term,
/// LSN of the last record.
#[serde_as(as = "DisplayFromStr")]
pub flush_lsn: Lsn,
#[serde_as(as = "DisplayFromStr")]
pub commit_lsn: Lsn,
/// Since which LSN safekeeper has WAL. TODO: remove this once we fill new
/// sk since backup_lsn.
#[serde_as(as = "DisplayFromStr")]
pub local_start_lsn: Lsn,
/// When info was received. Serde annotations are not very useful but make
/// the code compile -- we don't rely on this field externally.

View File

@@ -81,7 +81,6 @@ FALLBACK_DURATION = {
"test_runner/performance/test_seqscans.py::test_seqscans[vanilla-100000-100-0]": 0.55,
"test_runner/performance/test_seqscans.py::test_seqscans[vanilla-10000000-1-0]": 12.189,
"test_runner/performance/test_seqscans.py::test_seqscans[vanilla-10000000-1-4]": 13.899,
"test_runner/performance/test_startup.py::test_startup": 890.114,
"test_runner/performance/test_startup.py::test_startup_simple": 2.51,
"test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[neon_off-10-5-5]": 527.245,
"test_runner/performance/test_wal_backpressure.py::test_heavy_write_workload[neon_on-10-5-5]": 583.46,

View File

@@ -626,6 +626,8 @@ class NeonEnvBuilder:
sk.stop(immediate=True)
for pageserver in self.env.pageservers:
pageserver.assert_no_metric_errors()
pageserver.stop(immediate=True)
if self.env.attachment_service is not None:
@@ -1784,6 +1786,21 @@ class NeonPageserver(PgProtocol):
assert not errors
def assert_no_metric_errors(self):
"""
Certain metrics should _always_ be zero: they track conditions that indicate a bug.
"""
if not self.running:
log.info(f"Skipping metrics check on pageserver {self.id}, it is not running")
return
for metric in [
"pageserver_tenant_manager_unexpected_errors_total",
"pageserver_deletion_queue_unexpected_errors_total",
]:
value = self.http_client().get_metric_value(metric)
assert value == 0, f"Nonzero {metric} == {value}"
def log_contains(self, pattern: str) -> Optional[str]:
"""Check that the pageserver log contains a line that matches the given regex"""
logfile = open(os.path.join(self.workdir, "pageserver.log"), "r")
@@ -2868,7 +2885,7 @@ class SafekeeperHttpClient(requests.Session):
params = params or {}
res = self.get(f"http://localhost:{self.port}/v1/debug_dump", params=params)
res.raise_for_status()
res_json = res.json()
res_json = json.loads(res.text)
assert isinstance(res_json, dict)
return res_json
@@ -2968,24 +2985,33 @@ class S3Scrubber:
self.env = env
self.log_dir = log_dir
def scrubber_cli(self, args, timeout):
def scrubber_cli(self, args: list[str], timeout) -> str:
assert isinstance(self.env.pageserver_remote_storage, S3Storage)
s3_storage = self.env.pageserver_remote_storage
env = {
"REGION": s3_storage.bucket_region,
"BUCKET": s3_storage.bucket_name,
"BUCKET_PREFIX": s3_storage.prefix_in_bucket,
"RUST_LOG": "DEBUG",
}
env.update(s3_storage.access_env_vars())
if s3_storage.endpoint is not None:
env.update({"AWS_ENDPOINT_URL": s3_storage.endpoint})
base_args = [self.env.neon_binpath / "s3_scrubber"]
base_args = [str(self.env.neon_binpath / "s3_scrubber")]
args = base_args + args
(output_path, _, status_code) = subprocess_capture(
self.log_dir, args, echo_stderr=True, echo_stdout=True, env=env, check=False
(output_path, stdout, status_code) = subprocess_capture(
self.log_dir,
args,
echo_stderr=True,
echo_stdout=True,
env=env,
check=False,
capture_stdout=True,
timeout=timeout,
)
if status_code:
log.warning(f"Scrub command {args} failed")
@@ -2994,8 +3020,18 @@ class S3Scrubber:
raise RuntimeError("Remote storage scrub failed")
def scan_metadata(self):
self.scrubber_cli(["scan-metadata"], timeout=30)
assert stdout is not None
return stdout
def scan_metadata(self) -> Any:
stdout = self.scrubber_cli(["scan-metadata", "--json"], timeout=30)
try:
return json.loads(stdout)
except:
log.error("Failed to decode JSON output from `scan-metadata`. Dumping stdout:")
log.error(stdout)
raise
def get_test_output_dir(request: FixtureRequest, top_output_dir: Path) -> Path:

View File

@@ -249,7 +249,7 @@ def assert_prefix_empty(neon_env_builder: "NeonEnvBuilder", prefix: Optional[str
# this has been seen in the wild by tests with the below contradicting logging
# https://neon-github-public-dev.s3.amazonaws.com/reports/pr-5322/6207777020/index.html#suites/3556ed71f2d69272a7014df6dcb02317/53b5c368b5a68865
# this seems like a mock_s3 issue
log.warn(
log.warning(
f"contrading ListObjectsV2 response with KeyCount={keys} and Contents={objects}, CommonPrefixes={common_prefixes}, assuming this means KeyCount=0"
)
keys = 0
@@ -257,7 +257,7 @@ def assert_prefix_empty(neon_env_builder: "NeonEnvBuilder", prefix: Optional[str
# this has been seen in one case with mock_s3:
# https://neon-github-public-dev.s3.amazonaws.com/reports/pr-4938/6000769714/index.html#suites/3556ed71f2d69272a7014df6dcb02317/ca01e4f4d8d9a11f
# looking at moto impl, it might be there's a race with common prefix (sub directory) not going away with deletes
log.warn(
log.warning(
f"contradicting ListObjectsV2 response with KeyCount={keys} and Contents={objects}, CommonPrefixes={common_prefixes}"
)

View File

@@ -35,6 +35,7 @@ def subprocess_capture(
echo_stderr=False,
echo_stdout=False,
capture_stdout=False,
timeout=None,
**kwargs: Any,
) -> Tuple[str, Optional[str], int]:
"""Run a process and bifurcate its output to files and the `log` logger
@@ -104,7 +105,7 @@ def subprocess_capture(
stderr_handler = OutputHandler(p.stderr, stderr_f, echo=echo_stderr, capture=False)
stderr_handler.start()
r = p.wait()
r = p.wait(timeout=timeout)
stdout_handler.join()
stderr_handler.join()

View File

@@ -1,8 +1,10 @@
from contextlib import closing
from fixtures.benchmark_fixture import MetricReport
from fixtures.compare_fixtures import NeonCompare, PgCompare
from fixtures.pageserver.utils import wait_tenant_status_404
from fixtures.pg_version import PgVersion
from fixtures.types import Lsn
#
@@ -18,6 +20,8 @@ from fixtures.pg_version import PgVersion
def test_bulk_insert(neon_with_baseline: PgCompare):
env = neon_with_baseline
start_lsn = Lsn(env.pg.safe_psql("SELECT pg_current_wal_lsn()")[0][0])
with closing(env.pg.connect()) as conn:
with conn.cursor() as cur:
cur.execute("create table huge (i int, j int);")
@@ -31,6 +35,13 @@ def test_bulk_insert(neon_with_baseline: PgCompare):
env.report_peak_memory_use()
env.report_size()
# Report amount of wal written. Useful for comparing vanilla wal format vs
# neon wal format, measuring neon write amplification, etc.
end_lsn = Lsn(env.pg.safe_psql("SELECT pg_current_wal_lsn()")[0][0])
wal_written_bytes = end_lsn - start_lsn
wal_written_mb = round(wal_written_bytes / (1024 * 1024))
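# For example, start_lsn 0/16B9188 and end_lsn 0/36B9188 differ by 0x2000000
# bytes, which rounds to 32 MB.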
env.zenbenchmark.record("wal_written", wal_written_mb, "MB", MetricReport.TEST_PARAM)
# When testing neon, also check how long it takes the pageserver to reingest the
# wal from safekeepers. If this number is close to total runtime, then the pageserver
# is the bottleneck.

View File

@@ -1,6 +1,3 @@
from contextlib import closing
import pytest
import requests
from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker
from fixtures.neon_fixtures import NeonEnvBuilder
@@ -81,49 +78,3 @@ def test_startup_simple(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenc
# Imitate optimizations that console would do for the second start
endpoint.respec(skip_pg_catalog_updates=True)
# This test sometimes runs for longer than the global 5 minute timeout.
@pytest.mark.timeout(900)
def test_startup(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker):
neon_env_builder.num_safekeepers = 3
env = neon_env_builder.init_start()
# Start
env.neon_cli.create_branch("test_startup")
with zenbenchmark.record_duration("startup_time"):
endpoint = env.endpoints.create_start("test_startup")
endpoint.safe_psql("select 1;")
# Restart
endpoint.stop_and_destroy()
with zenbenchmark.record_duration("restart_time"):
endpoint.create_start("test_startup")
endpoint.safe_psql("select 1;")
# Fill up
num_rows = 1000000 # 30 MB
num_tables = 100
with closing(endpoint.connect()) as conn:
with conn.cursor() as cur:
for i in range(num_tables):
cur.execute(f"create table t_{i} (i integer);")
cur.execute(f"insert into t_{i} values (generate_series(1,{num_rows}));")
# Read
with zenbenchmark.record_duration("read_time"):
endpoint.safe_psql("select * from t_0;")
# Read again
with zenbenchmark.record_duration("second_read_time"):
endpoint.safe_psql("select * from t_0;")
# Restart
endpoint.stop_and_destroy()
with zenbenchmark.record_duration("restart_with_data"):
endpoint.create_start("test_startup")
endpoint.safe_psql("select 1;")
# Read
with zenbenchmark.record_duration("read_after_restart"):
endpoint.safe_psql("select * from t_0;")

View File

@@ -72,7 +72,7 @@ class DdlForwardingContext:
self.dbs: Dict[str, str] = {}
self.roles: Dict[str, str] = {}
self.fail = False
endpoint = "/management/api/v2/roles_and_databases"
endpoint = "/test/roles_and_databases"
ddl_url = f"http://{host}:{port}{endpoint}"
self.pg.configure(
[

View File

@@ -1,11 +1,14 @@
import time
import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
NeonEnv,
logical_replication_sync,
wait_for_last_flush_lsn,
)
from fixtures.types import Lsn
from fixtures.utils import query_scalar
def test_logical_replication(neon_simple_env: NeonEnv, vanilla_pg):
@@ -147,3 +150,89 @@ COMMIT;
endpoint.start()
# it must be gone (but walproposer slot still exists, hence 1)
assert endpoint.safe_psql("select count(*) from pg_replication_slots")[0][0] == 1
# Test compute start at LSN page of which starts with contrecord
# https://github.com/neondatabase/neon/issues/5749
def test_wal_page_boundary_start(neon_simple_env: NeonEnv, vanilla_pg):
env = neon_simple_env
env.neon_cli.create_branch("init")
endpoint = env.endpoints.create_start("init")
tenant_id = endpoint.safe_psql("show neon.tenant_id")[0][0]
timeline_id = endpoint.safe_psql("show neon.timeline_id")[0][0]
cur = endpoint.connect().cursor()
cur.execute("create table t(key int, value text)")
cur.execute("CREATE TABLE replication_example(id SERIAL PRIMARY KEY, somedata int);")
cur.execute("insert into replication_example values (1, 2)")
cur.execute("create publication pub1 for table replication_example")
# now start subscriber
vanilla_pg.start()
vanilla_pg.safe_psql("create table t(pk integer primary key, value text)")
vanilla_pg.safe_psql("CREATE TABLE replication_example(id SERIAL PRIMARY KEY, somedata int);")
log.info(f"ep connstr is {endpoint.connstr()}, subscriber connstr {vanilla_pg.connstr()}")
connstr = endpoint.connstr().replace("'", "''")
vanilla_pg.safe_psql(f"create subscription sub1 connection '{connstr}' publication pub1")
logical_replication_sync(vanilla_pg, endpoint)
vanilla_pg.stop()
with endpoint.cursor() as cur:
# measure how much space a logical message takes. Sometimes the first attempt
# creates a huge message and then it stabilizes; no idea why.
for _ in range(3):
lsn_before = Lsn(query_scalar(cur, "select pg_current_wal_lsn()"))
log.info(f"current_lsn={lsn_before}")
# Non-transactional logical message doesn't write WAL, only XLogInsert's
# it, so use transactional. Which is a bit problematic as transactional
# necessitates commit record. Alternatively we can do smth like
# select neon_xlogflush(pg_current_wal_insert_lsn());
# but isn't much better + that particular call complains on 'xlog flush
# request 0/282C018 is not satisfied' as pg_current_wal_insert_lsn skips
# page headers.
payload = "blahblah"
cur.execute(f"select pg_logical_emit_message(true, 'pref', '{payload}')")
lsn_after_by_curr_wal_lsn = Lsn(query_scalar(cur, "select pg_current_wal_lsn()"))
lsn_diff = lsn_after_by_curr_wal_lsn - lsn_before
logical_message_base = lsn_after_by_curr_wal_lsn - lsn_before - len(payload)
log.info(
f"before {lsn_before}, after {lsn_after_by_curr_wal_lsn}, lsn diff is {lsn_diff}, base {logical_message_base}"
)
# and write logical message spanning exactly as we want
lsn_before = Lsn(query_scalar(cur, "select pg_current_wal_lsn()"))
log.info(f"current_lsn={lsn_before}")
curr_lsn = Lsn(query_scalar(cur, "select pg_current_wal_lsn()"))
offs = int(curr_lsn) % 8192
till_page = 8192 - offs
payload_len = (
till_page - logical_message_base - 8
) # not sure why 8 is here, it is deduced from experiments
log.info(f"current_lsn={curr_lsn}, offs {offs}, till_page {till_page}")
# payload_len above would go exactly till the page boundary; but we want contrecord, so make it slightly longer
payload_len += 8
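# Example: with curr_lsn at offset 0x1F00 within the 8192-byte page, till_page
# is 256 bytes, so the record is sized to spill just past the boundary and
# start a continuation record (contrecord) on the next page.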
cur.execute(f"select pg_logical_emit_message(true, 'pref', 'f{'a' * payload_len}')")
supposedly_contrecord_end = Lsn(query_scalar(cur, "select pg_current_wal_lsn()"))
log.info(f"supposedly_page_boundary={supposedly_contrecord_end}")
# The calculations to hit the page boundary are very fuzzy, so just
# ignore test if we fail to reach it.
if not (int(supposedly_contrecord_end) % 8192 == 32):
pytest.skip("missed page boundary, bad luck")
cur.execute("insert into replication_example values (2, 3)")
wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
endpoint.stop().start()
cur = endpoint.connect().cursor()
# this should flush current wal page
cur.execute("insert into replication_example values (3, 4)")
vanilla_pg.start()
logical_replication_sync(vanilla_pg, endpoint)
assert vanilla_pg.safe_psql(
"select sum(somedata) from replication_example"
) == endpoint.safe_psql("select sum(somedata) from replication_example")

View File

@@ -134,6 +134,9 @@ def test_cli_start_stop(neon_env_builder: NeonEnvBuilder):
env.neon_cli.pageserver_stop(env.pageserver.id)
env.neon_cli.safekeeper_stop()
# Keep NeonEnv state up to date, it usually owns starting/stopping services
env.pageserver.running = False
# Default start
res = env.neon_cli.raw_cli(["start"])
res.check_returncode()
@@ -155,6 +158,10 @@ def test_cli_start_stop_multi(neon_env_builder: NeonEnvBuilder):
env.neon_cli.pageserver_stop(env.BASE_PAGESERVER_ID)
env.neon_cli.pageserver_stop(env.BASE_PAGESERVER_ID + 1)
# Keep NeonEnv state up to date, it usually owns starting/stopping services
env.pageservers[0].running = False
env.pageservers[1].running = False
# Addressing a nonexistent ID throws
with pytest.raises(RuntimeError):
env.neon_cli.pageserver_stop(env.BASE_PAGESERVER_ID + 100)

View File

@@ -21,6 +21,7 @@ from fixtures.neon_fixtures import (
NeonEnv,
NeonEnvBuilder,
PgBin,
S3Scrubber,
last_flush_lsn_upload,
wait_for_last_flush_lsn,
)
@@ -234,8 +235,22 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder):
assert len(suffixed_objects) > 0
assert len(legacy_objects) > 0
# Flush through deletions to get a clean state for scrub: we are implicitly validating
# that our generations-enabled pageserver was able to do deletions of layers
# from earlier which don't have a generation.
env.pageserver.http_client().deletion_queue_flush(execute=True)
assert get_deletion_queue_unexpected_errors(env.pageserver.http_client()) == 0
# Having written a mixture of generation-aware and legacy index_part.json,
# ensure the scrubber handles the situation as expected.
metadata_summary = S3Scrubber(
neon_env_builder.test_output_dir, neon_env_builder
).scan_metadata()
assert metadata_summary["count"] == 1 # Scrubber should have seen our timeline
assert not metadata_summary["with_errors"]
assert not metadata_summary["with_warnings"]
def test_deferred_deletion(neon_env_builder: NeonEnvBuilder):
neon_env_builder.enable_generations = True

View File

@@ -20,6 +20,8 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder, generations: bool)
endpoint = env.endpoints.create_start("main")
pageserver_http = env.pageserver.http_client()
assert pageserver_http.get_metric_value("pageserver_tenant_manager_slots") == 1
pg_conn = endpoint.connect()
cur = pg_conn.cursor()
@@ -52,6 +54,9 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder, generations: bool)
env.pageserver.stop()
env.pageserver.start()
# We reloaded our tenant
assert pageserver_http.get_metric_value("pageserver_tenant_manager_slots") == 1
cur.execute("SELECT count(*) FROM foo")
assert cur.fetchone() == (100000,)

View File

@@ -17,6 +17,10 @@ def test_pageserver_restarts_under_worload(neon_simple_env: NeonEnv, pg_bin: PgB
n_restarts = 10
scale = 10
# Pageserver currently logs requests on non-active tenants at error level
# https://github.com/neondatabase/neon/issues/5784
env.pageserver.allowed_errors.append(".* will not become active. Current state: Stopping.*")
def run_pgbench(connstr: str):
log.info(f"Start a pgbench workload on pg {connstr}")
pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", connstr])

View File

@@ -432,3 +432,47 @@ def test_sql_over_http_pool_idle(static_proxy: NeonProxy):
query(200, "BEGIN")
pid2 = query(200, GET_CONNECTION_PID_QUERY)["rows"][0]["pid"]
assert pid1 != pid2
@pytest.mark.timeout(60)
def test_sql_over_http_pool_dos(static_proxy: NeonProxy):
static_proxy.safe_psql("create user http_auth with password 'http' superuser")
static_proxy.safe_psql("CREATE TYPE foo AS ENUM ('foo')")
def query(status: int, query: str) -> Any:
return static_proxy.http_query(
query,
[],
user="http_auth",
password="http",
expected_code=status,
)
# query generates 25 million rows - should hit the 10MB response limit quickly
response = query(
400,
"select * from generate_series(1, 5000) a cross join generate_series(1, 5000) b cross join (select 'foo'::foo) c;",
)
assert "response is too large (max is 10485760 bytes)" in response["message"]
def test_sql_over_http_pool_custom_types(static_proxy: NeonProxy):
static_proxy.safe_psql("create user http_auth with password 'http' superuser")
static_proxy.safe_psql("CREATE TYPE foo AS ENUM ('foo','bar','baz')")
def query(status: int, query: str) -> Any:
return static_proxy.http_query(
query,
[],
user="http_auth",
password="http",
expected_code=status,
)
response = query(
200,
"select array['foo'::foo, 'bar'::foo, 'baz'::foo] as data",
)
assert response["rows"][0]["data"] == ["foo", "bar", "baz"]
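Rough arithmetic behind the DoS test above (bytes-per-row is a guess; the 10485760-byte cap comes from the asserted error message):

rows = 5000 * 5000           # the two generate_series cross-joined: 25,000,000 rows
approx_bytes_per_row = 30    # rough estimate for one JSON-encoded row
assert rows * approx_bytes_per_row > 10485760  # far past the 10 MiB response cap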

View File

@@ -63,6 +63,9 @@ def test_tenant_delete_smoke(
conf=MANY_SMALL_LAYERS_TENANT_CONFIG,
)
# Default tenant and the one we created
assert ps_http.get_metric_value("pageserver_tenant_manager_slots") == 2
# create two timelines one being the parent of another
parent = None
for timeline in ["first", "second"]:
@@ -88,7 +91,9 @@ def test_tenant_delete_smoke(
iterations = poll_for_remote_storage_iterations(remote_storage_kind)
assert ps_http.get_metric_value("pageserver_tenant_manager_slots") == 2
tenant_delete_wait_completed(ps_http, tenant_id, iterations)
assert ps_http.get_metric_value("pageserver_tenant_manager_slots") == 1
tenant_path = env.pageserver.tenant_dir(tenant_id)
assert not tenant_path.exists()
@@ -104,6 +109,9 @@ def test_tenant_delete_smoke(
),
)
# Deletion updates the tenant count: the one default tenant remains
assert ps_http.get_metric_value("pageserver_tenant_manager_slots") == 1
class Check(enum.Enum):
RETRY_WITHOUT_RESTART = enum.auto()

View File

@@ -26,6 +26,16 @@ from fixtures.types import Lsn, TenantId, TimelineId
from fixtures.utils import query_scalar, wait_until
from prometheus_client.samples import Sample
# In tests that overlap endpoint activity with tenant attach/detach, there are
# a variety of warnings that the page service may emit when it cannot acquire
# an active tenant to serve a request
PERMIT_PAGE_SERVICE_ERRORS = [
".*page_service.*Tenant .* not found",
".*page_service.*Tenant .* is not active",
".*page_service.*cancelled",
".*page_service.*will not become active.*",
]
def do_gc_target(
pageserver_http: PageserverHttpClient, tenant_id: TenantId, timeline_id: TimelineId
@@ -60,12 +70,7 @@ def test_tenant_reattach(
# create new tenant
tenant_id, timeline_id = env.neon_cli.create_tenant()
# Attempts to connect from compute to pageserver while the tenant is
# temporarily detached produces these errors in the pageserver log.
env.pageserver.allowed_errors.append(f".*Tenant {tenant_id} not found.*")
env.pageserver.allowed_errors.append(
f".*Tenant {tenant_id} will not become active\\. Current state: Stopping.*"
)
env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS)
with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint:
with endpoint.cursor() as cur:
@@ -235,10 +240,7 @@ def test_tenant_reattach_while_busy(
# Attempts to connect from compute to pageserver while the tenant is
# temporarily detached produces these errors in the pageserver log.
env.pageserver.allowed_errors.append(f".*Tenant {tenant_id} not found.*")
env.pageserver.allowed_errors.append(
f".*Tenant {tenant_id} will not become active\\. Current state: Stopping.*"
)
env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS)
endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
@@ -259,7 +261,7 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder):
env = neon_env_builder.init_start()
pageserver_http = env.pageserver.http_client()
env.pageserver.allowed_errors.append(".*NotFound: Tenant .*")
env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS)
# first check for non existing tenant
tenant_id = TenantId.generate()
@@ -271,19 +273,9 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder):
assert excinfo.value.status_code == 404
# the error will be printed to the log too
env.pageserver.allowed_errors.append(".*NotFound: tenant *")
# create new tenant
tenant_id, timeline_id = env.neon_cli.create_tenant()
# Attempts to connect from compute to pageserver while the tenant is
# temporarily detached produces these errors in the pageserver log.
env.pageserver.allowed_errors.append(f".*Tenant {tenant_id} not found.*")
env.pageserver.allowed_errors.append(
f".*Tenant {tenant_id} will not become active\\. Current state: Stopping.*"
)
# assert tenant exists on disk
assert env.pageserver.tenant_dir(tenant_id).exists()
@@ -345,12 +337,7 @@ def test_tenant_detach_ignored_tenant(neon_simple_env: NeonEnv):
# create a new tenant
tenant_id, _ = env.neon_cli.create_tenant()
# Attempts to connect from compute to pageserver while the tenant is
# temporarily detached produces these errors in the pageserver log.
env.pageserver.allowed_errors.append(f".*Tenant {tenant_id} not found.*")
env.pageserver.allowed_errors.append(
f".*Tenant {tenant_id} will not become active\\. Current state: Stopping.*"
)
env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS)
# assert tenant exists on disk
assert env.pageserver.tenant_dir(tenant_id).exists()
@@ -401,12 +388,7 @@ def test_tenant_detach_regular_tenant(neon_simple_env: NeonEnv):
# create a new tenant
tenant_id, _ = env.neon_cli.create_tenant()
# Attempts to connect from compute to pageserver while the tenant is
# temporarily detached produces these errors in the pageserver log.
env.pageserver.allowed_errors.append(f".*Tenant {tenant_id} not found.*")
env.pageserver.allowed_errors.append(
f".*Tenant {tenant_id} will not become active\\. Current state: Stopping.*"
)
env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS)
# assert tenant exists on disk
assert env.pageserver.tenant_dir(tenant_id).exists()
@@ -453,12 +435,7 @@ def test_detach_while_attaching(
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
# Attempts to connect from compute to pageserver while the tenant is
# temporarily detached produces these errors in the pageserver log.
env.pageserver.allowed_errors.append(f".*Tenant {tenant_id} not found.*")
env.pageserver.allowed_errors.append(
f".*Tenant {tenant_id} will not become active\\. Current state: Stopping.*"
)
env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS)
# Create table, and insert some rows. Make it big enough that it doesn't fit in
# shared_buffers, otherwise the SELECT after restart will just return answer
@@ -593,12 +570,7 @@ def test_ignored_tenant_download_missing_layers(neon_env_builder: NeonEnvBuilder
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
# Attempts to connect from compute to pageserver while the tenant is
# temporarily detached produces these errors in the pageserver log.
env.pageserver.allowed_errors.append(f".*Tenant {tenant_id} not found.*")
env.pageserver.allowed_errors.append(
f".*Tenant {tenant_id} will not become active\\. Current state: Stopping.*"
)
env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS)
data_id = 1
data_secret = "very secret secret"
@@ -649,12 +621,7 @@ def test_load_attach_negatives(neon_env_builder: NeonEnvBuilder):
tenant_id = env.initial_tenant
# Attempts to connect from compute to pageserver while the tenant is
# temporarily detached produces these errors in the pageserver log.
env.pageserver.allowed_errors.append(f".*Tenant {tenant_id} not found.*")
env.pageserver.allowed_errors.append(
f".*Tenant {tenant_id} will not become active\\. Current state: Stopping.*"
)
env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS)
env.pageserver.allowed_errors.append(".*tenant .*? already exists, state:.*")
with pytest.raises(
@@ -693,12 +660,7 @@ def test_ignore_while_attaching(
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
# Attempts to connect from compute to pageserver while the tenant is
# temporarily detached produces these errors in the pageserver log.
env.pageserver.allowed_errors.append(f".*Tenant {tenant_id} not found.*")
env.pageserver.allowed_errors.append(
f".*Tenant {tenant_id} will not become active\\. Current state: Stopping.*"
)
env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS)
data_id = 1
data_secret = "very secret secret"

Some files were not shown because too many files have changed in this diff.