Merge pull request #6993 from neondatabase/rc/2024-03-04

Release 2024-03-04
This commit is contained in:
Christian Schwarz
2024-03-04 13:08:44 +01:00
committed by GitHub
100 changed files with 2205 additions and 639 deletions

8
Cargo.lock generated
View File

@@ -25,9 +25,9 @@ checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"
[[package]]
name = "ahash"
version = "0.8.5"
version = "0.8.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cd7d5a2cecb58716e47d67d5703a249964b14c7be1ec3cad3affc295b2d1c35d"
checksum = "d713b3834d76b85304d4d525563c1276e2e30dc97cc67bfb4585a4a29fc2c89f"
dependencies = [
"cfg-if",
"const-random",
@@ -1389,9 +1389,9 @@ dependencies = [
[[package]]
name = "crc32c"
version = "0.6.3"
version = "0.6.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3dfea2db42e9927a3845fb268a10a72faed6d416065f77873f05e411457c363e"
checksum = "89254598aa9b9fa608de44b3ae54c810f0f06d755e24c50177f1f8f31ff50ce2"
dependencies = [
"rustc_version",
]

View File

@@ -230,6 +230,8 @@ postgres=# select * from t;
> cargo neon stop
```
More advanced usages can be found at [Control Plane and Neon Local](./control_plane/README.md).
#### Handling build failures
If you encounter errors during setting up the initial tenant, it's best to stop everything (`cargo neon stop`) and remove the `.neon` directory. Then fix the problems, and start the setup again.

View File

@@ -18,8 +18,6 @@ use futures::future::join_all;
use futures::stream::FuturesUnordered;
use futures::StreamExt;
use postgres::{Client, NoTls};
use tokio;
use tokio_postgres;
use tracing::{debug, error, info, instrument, warn};
use utils::id::{TenantId, TimelineId};
use utils::lsn::Lsn;

View File

@@ -71,7 +71,7 @@ More specifically, here is an example ext_index.json
}
}
*/
use anyhow::{self, Result};
use anyhow::Result;
use anyhow::{bail, Context};
use bytes::Bytes;
use compute_api::spec::RemoteExtSpec;

View File

@@ -13,8 +13,6 @@ use compute_api::responses::{ComputeStatus, ComputeStatusResponse, GenericAPIErr
use anyhow::Result;
use hyper::service::{make_service_fn, service_fn};
use hyper::{Body, Method, Request, Response, Server, StatusCode};
use num_cpus;
use serde_json;
use tokio::task;
use tracing::{error, info, warn};
use tracing_utils::http::OtelName;

26
control_plane/README.md Normal file
View File

@@ -0,0 +1,26 @@
# Control Plane and Neon Local
This crate contains tools to start a Neon development environment locally. This utility can be used with the `cargo neon` command.
## Example: Start with Postgres 16
To create and start a local development environment with Postgres 16, you will need to provide `--pg-version` flag to 3 of the start-up commands.
```shell
cargo neon init --pg-version 16
cargo neon start
cargo neon tenant create --set-default --pg-version 16
cargo neon endpoint create main --pg-version 16
cargo neon endpoint start main
```
## Example: Create Test User and Database
By default, `cargo neon` starts an endpoint with `cloud_admin` and `postgres` database. If you want to have a role and a database similar to what we have on the cloud service, you can do it with the following commands when starting an endpoint.
```shell
cargo neon endpoint create main --pg-version 16 --update-catalog true
cargo neon endpoint start main --create-test-user true
```
The first command creates `neon_superuser` and necessary roles. The second command creates `test` user and `neondb` database. You will see a connection string that connects you to the test user after running the second command.

View File

@@ -0,0 +1,2 @@
ALTER TABLE tenant_shards ALTER generation SET NOT NULL;
ALTER TABLE tenant_shards ALTER generation_pageserver SET NOT NULL;

View File

@@ -0,0 +1,4 @@
ALTER TABLE tenant_shards ALTER generation DROP NOT NULL;
ALTER TABLE tenant_shards ALTER generation_pageserver DROP NOT NULL;

View File

@@ -1,9 +1,10 @@
use crate::reconciler::ReconcileError;
use crate::service::{Service, STARTUP_RECONCILE_TIMEOUT};
use crate::PlacementPolicy;
use hyper::{Body, Request, Response};
use hyper::{StatusCode, Uri};
use pageserver_api::models::{
TenantCreateRequest, TenantLocationConfigRequest, TenantShardSplitRequest,
TenantConfigRequest, TenantCreateRequest, TenantLocationConfigRequest, TenantShardSplitRequest,
TenantTimeTravelRequest, TimelineCreateRequest,
};
use pageserver_api::shard::TenantShardId;
@@ -117,9 +118,14 @@ async fn handle_tenant_create(
check_permissions(&req, Scope::PageServerApi)?;
let create_req = json_request::<TenantCreateRequest>(&mut req).await?;
// TODO: enable specifying this. Using Single as a default helps legacy tests to work (they
// have no expectation of HA).
let placement_policy = PlacementPolicy::Single;
json_response(
StatusCode::CREATED,
service.tenant_create(create_req).await?,
service.tenant_create(create_req, placement_policy).await?,
)
}
@@ -185,6 +191,27 @@ async fn handle_tenant_location_config(
)
}
async fn handle_tenant_config_set(
service: Arc<Service>,
mut req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::PageServerApi)?;
let config_req = json_request::<TenantConfigRequest>(&mut req).await?;
json_response(StatusCode::OK, service.tenant_config_set(config_req).await?)
}
async fn handle_tenant_config_get(
service: Arc<Service>,
req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
check_permissions(&req, Scope::PageServerApi)?;
json_response(StatusCode::OK, service.tenant_config_get(tenant_id)?)
}
async fn handle_tenant_time_travel_remote_storage(
service: Arc<Service>,
mut req: Request<Body>,
@@ -216,7 +243,15 @@ async fn handle_tenant_time_travel_remote_storage(
done_if_after_raw,
)
.await?;
json_response(StatusCode::OK, ())
}
async fn handle_tenant_secondary_download(
service: Arc<Service>,
req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
service.tenant_secondary_download(tenant_id).await?;
json_response(StatusCode::OK, ())
}
@@ -551,12 +586,21 @@ pub fn make_router(
.delete("/v1/tenant/:tenant_id", |r| {
tenant_service_handler(r, handle_tenant_delete)
})
.put("/v1/tenant/config", |r| {
tenant_service_handler(r, handle_tenant_config_set)
})
.get("/v1/tenant/:tenant_id/config", |r| {
tenant_service_handler(r, handle_tenant_config_get)
})
.put("/v1/tenant/:tenant_id/location_config", |r| {
tenant_service_handler(r, handle_tenant_location_config)
})
.put("/v1/tenant/:tenant_id/time_travel_remote_storage", |r| {
tenant_service_handler(r, handle_tenant_time_travel_remote_storage)
})
.post("/v1/tenant/:tenant_id/secondary/download", |r| {
tenant_service_handler(r, handle_tenant_secondary_download)
})
// Timeline operations
.delete("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
tenant_service_handler(r, handle_tenant_timeline_delete)

View File

@@ -13,14 +13,20 @@ mod schema;
pub mod service;
mod tenant_state;
#[derive(Clone, Serialize, Deserialize, Debug)]
#[derive(Clone, Serialize, Deserialize, Debug, PartialEq, Eq)]
enum PlacementPolicy {
/// Cheapest way to attach a tenant: just one pageserver, no secondary
Single,
/// Production-ready way to attach a tenant: one attached pageserver and
/// some number of secondaries.
Double(usize),
/// Do not attach to any pageservers
/// Create one secondary mode locations. This is useful when onboarding
/// a tenant, or for an idle tenant that we might want to bring online quickly.
Secondary,
/// Do not attach to any pageservers. This is appropriate for tenants that
/// have been idle for a long time, where we do not mind some delay in making
/// them available in future.
Detached,
}

View File

@@ -9,7 +9,7 @@ use attachment_service::http::make_router;
use attachment_service::metrics::preinitialize_metrics;
use attachment_service::persistence::Persistence;
use attachment_service::service::{Config, Service};
use aws_config::{self, BehaviorVersion, Region};
use aws_config::{BehaviorVersion, Region};
use camino::Utf8PathBuf;
use clap::Parser;
use diesel::Connection;
@@ -79,13 +79,38 @@ impl Secrets {
"neon-storage-controller-control-plane-jwt-token";
const PUBLIC_KEY_SECRET: &'static str = "neon-storage-controller-public-key";
const DATABASE_URL_ENV: &'static str = "DATABASE_URL";
const PAGESERVER_JWT_TOKEN_ENV: &'static str = "PAGESERVER_JWT_TOKEN";
const CONTROL_PLANE_JWT_TOKEN_ENV: &'static str = "CONTROL_PLANE_JWT_TOKEN";
const PUBLIC_KEY_ENV: &'static str = "PUBLIC_KEY";
/// Load secrets from, in order of preference:
/// - CLI args if database URL is provided on the CLI
/// - Environment variables if DATABASE_URL is set.
/// - AWS Secrets Manager secrets
async fn load(args: &Cli) -> anyhow::Result<Self> {
match &args.database_url {
Some(url) => Self::load_cli(url, args),
None => Self::load_aws_sm().await,
None => match std::env::var(Self::DATABASE_URL_ENV) {
Ok(database_url) => Self::load_env(database_url),
Err(_) => Self::load_aws_sm().await,
},
}
}
fn load_env(database_url: String) -> anyhow::Result<Self> {
let public_key = match std::env::var(Self::PUBLIC_KEY_ENV) {
Ok(public_key) => Some(JwtAuth::from_key(public_key).context("Loading public key")?),
Err(_) => None,
};
Ok(Self {
database_url,
public_key,
jwt_token: std::env::var(Self::PAGESERVER_JWT_TOKEN_ENV).ok(),
control_plane_jwt_token: std::env::var(Self::CONTROL_PLANE_JWT_TOKEN_ENV).ok(),
})
}
async fn load_aws_sm() -> anyhow::Result<Self> {
let Ok(region) = std::env::var("AWS_REGION") else {
anyhow::bail!("AWS_REGION is not set, cannot load secrets automatically: either set this, or use CLI args to supply secrets");

View File

@@ -7,8 +7,10 @@ use self::split_state::SplitState;
use camino::Utf8Path;
use camino::Utf8PathBuf;
use diesel::pg::PgConnection;
use diesel::prelude::*;
use diesel::Connection;
use diesel::{
Connection, ExpressionMethods, Insertable, QueryDsl, QueryResult, Queryable, RunQueryDsl,
Selectable, SelectableHelper,
};
use pageserver_api::controller_api::NodeSchedulingPolicy;
use pageserver_api::models::TenantConfig;
use pageserver_api::shard::{ShardCount, ShardNumber, TenantShardId};
@@ -331,7 +333,15 @@ impl Persistence {
shard_number: ShardNumber(tsp.shard_number as u8),
shard_count: ShardCount::new(tsp.shard_count as u8),
};
result.insert(tenant_shard_id, Generation::new(tsp.generation as u32));
let Some(g) = tsp.generation else {
// If the generation_pageserver column was non-NULL, then the generation column should also be non-NULL:
// we only set generation_pageserver when setting generation.
return Err(DatabaseError::Logical(
"Generation should always be set after incrementing".to_string(),
));
};
result.insert(tenant_shard_id, Generation::new(g as u32));
}
Ok(result)
@@ -364,7 +374,85 @@ impl Persistence {
})
.await?;
Ok(Generation::new(updated.generation as u32))
// Generation is always non-null in the rseult: if the generation column had been NULL, then we
// should have experienced an SQL Confilict error while executing a query that tries to increment it.
debug_assert!(updated.generation.is_some());
let Some(g) = updated.generation else {
return Err(DatabaseError::Logical(
"Generation should always be set after incrementing".to_string(),
)
.into());
};
Ok(Generation::new(g as u32))
}
/// For use when updating a persistent property of a tenant, such as its config or placement_policy.
///
/// Do not use this for settting generation, unless in the special onboarding code path (/location_config)
/// API: use [`Self::increment_generation`] instead. Setting the generation via this route is a one-time thing
/// that we only do the first time a tenant is set to an attached policy via /location_config.
pub(crate) async fn update_tenant_shard(
&self,
tenant_shard_id: TenantShardId,
input_placement_policy: PlacementPolicy,
input_config: TenantConfig,
input_generation: Option<Generation>,
) -> DatabaseResult<()> {
use crate::schema::tenant_shards::dsl::*;
self.with_conn(move |conn| {
let query = diesel::update(tenant_shards)
.filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
.filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
.filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32));
if let Some(input_generation) = input_generation {
// Update includes generation column
query
.set((
generation.eq(Some(input_generation.into().unwrap() as i32)),
placement_policy
.eq(serde_json::to_string(&input_placement_policy).unwrap()),
config.eq(serde_json::to_string(&input_config).unwrap()),
))
.execute(conn)?;
} else {
// Update does not include generation column
query
.set((
placement_policy
.eq(serde_json::to_string(&input_placement_policy).unwrap()),
config.eq(serde_json::to_string(&input_config).unwrap()),
))
.execute(conn)?;
}
Ok(())
})
.await?;
Ok(())
}
pub(crate) async fn update_tenant_config(
&self,
input_tenant_id: TenantId,
input_config: TenantConfig,
) -> DatabaseResult<()> {
use crate::schema::tenant_shards::dsl::*;
self.with_conn(move |conn| {
diesel::update(tenant_shards)
.filter(tenant_id.eq(input_tenant_id.to_string()))
.set((config.eq(serde_json::to_string(&input_config).unwrap()),))
.execute(conn)?;
Ok(())
})
.await?;
Ok(())
}
pub(crate) async fn detach(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> {
@@ -375,7 +463,7 @@ impl Persistence {
.filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
.filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32))
.set((
generation_pageserver.eq(i64::MAX),
generation_pageserver.eq(Option::<i64>::None),
placement_policy.eq(serde_json::to_string(&PlacementPolicy::Detached).unwrap()),
))
.execute(conn)?;
@@ -501,12 +589,15 @@ pub(crate) struct TenantShardPersistence {
pub(crate) shard_stripe_size: i32,
// Latest generation number: next time we attach, increment this
// and use the incremented number when attaching
pub(crate) generation: i32,
// and use the incremented number when attaching.
//
// Generation is only None when first onboarding a tenant, where it may
// be in PlacementPolicy::Secondary and therefore have no valid generation state.
pub(crate) generation: Option<i32>,
// Currently attached pageserver
#[serde(rename = "pageserver")]
pub(crate) generation_pageserver: i64,
pub(crate) generation_pageserver: Option<i64>,
#[serde(default)]
pub(crate) placement_policy: String,

View File

@@ -26,7 +26,7 @@ pub(super) struct Reconciler {
/// of a tenant's state from when we spawned a reconcile task.
pub(super) tenant_shard_id: TenantShardId,
pub(crate) shard: ShardIdentity,
pub(crate) generation: Generation,
pub(crate) generation: Option<Generation>,
pub(crate) intent: TargetState,
pub(crate) config: TenantConfig,
pub(crate) observed: ObservedState,
@@ -312,7 +312,7 @@ impl Reconciler {
&self.shard,
&self.config,
LocationConfigMode::AttachedStale,
Some(self.generation),
self.generation,
None,
);
self.location_config(origin_ps_id, stale_conf, Some(Duration::from_secs(10)))
@@ -335,16 +335,17 @@ impl Reconciler {
}
// Increment generation before attaching to new pageserver
self.generation = self
.persistence
.increment_generation(self.tenant_shard_id, dest_ps_id)
.await?;
self.generation = Some(
self.persistence
.increment_generation(self.tenant_shard_id, dest_ps_id)
.await?,
);
let dest_conf = build_location_config(
&self.shard,
&self.config,
LocationConfigMode::AttachedMulti,
Some(self.generation),
self.generation,
None,
);
@@ -401,7 +402,7 @@ impl Reconciler {
&self.shard,
&self.config,
LocationConfigMode::AttachedSingle,
Some(self.generation),
self.generation,
None,
);
self.location_config(dest_ps_id, dest_final_conf.clone(), None)
@@ -433,22 +434,62 @@ impl Reconciler {
// If the attached pageserver is not attached, do so now.
if let Some(node_id) = self.intent.attached {
let mut wanted_conf =
attached_location_conf(self.generation, &self.shard, &self.config);
// If we are in an attached policy, then generation must have been set (null generations
// are only present when a tenant is initially loaded with a secondary policy)
debug_assert!(self.generation.is_some());
let Some(generation) = self.generation else {
return Err(ReconcileError::Other(anyhow::anyhow!(
"Attempted to attach with NULL generation"
)));
};
let mut wanted_conf = attached_location_conf(generation, &self.shard, &self.config);
match self.observed.locations.get(&node_id) {
Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {
// Nothing to do
tracing::info!(%node_id, "Observed configuration already correct.")
}
_ => {
observed => {
// In all cases other than a matching observed configuration, we will
// reconcile this location. This includes locations with different configurations, as well
// as locations with unknown (None) observed state.
self.generation = self
.persistence
.increment_generation(self.tenant_shard_id, node_id)
.await?;
wanted_conf.generation = self.generation.into();
// The general case is to increment the generation. However, there are cases
// where this is not necessary:
// - if we are only updating the TenantConf part of the location
// - if we are only changing the attachment mode (e.g. going to attachedmulti or attachedstale)
// and the location was already in the correct generation
let increment_generation = match observed {
None => true,
Some(ObservedStateLocation { conf: None }) => true,
Some(ObservedStateLocation {
conf: Some(observed),
}) => {
let generations_match = observed.generation == wanted_conf.generation;
use LocationConfigMode::*;
let mode_transition_requires_gen_inc =
match (observed.mode, wanted_conf.mode) {
// Usually the short-lived attachment modes (multi and stale) are only used
// in the case of [`Self::live_migrate`], but it is simple to handle them correctly
// here too. Locations are allowed to go Single->Stale and Multi->Single within the same generation.
(AttachedSingle, AttachedStale) => false,
(AttachedMulti, AttachedSingle) => false,
(lhs, rhs) => lhs != rhs,
};
!generations_match || mode_transition_requires_gen_inc
}
};
if increment_generation {
let generation = self
.persistence
.increment_generation(self.tenant_shard_id, node_id)
.await?;
self.generation = Some(generation);
wanted_conf.generation = generation.into();
}
tracing::info!(%node_id, "Observed configuration requires update.");
self.location_config(node_id, wanted_conf, None).await?;
self.compute_notify().await?;

View File

@@ -284,7 +284,6 @@ pub(crate) mod test_utils {
#[cfg(test)]
mod tests {
use super::*;
use utils::id::NodeId;
use crate::tenant_state::IntentState;
#[test]

View File

@@ -17,8 +17,8 @@ diesel::table! {
shard_number -> Int4,
shard_count -> Int4,
shard_stripe_size -> Int4,
generation -> Int4,
generation_pageserver -> Int8,
generation -> Nullable<Int4>,
generation_pageserver -> Nullable<Int8>,
placement_policy -> Varchar,
splitting -> Int2,
config -> Text,

View File

@@ -14,10 +14,13 @@ use control_plane::attachment_service::{
use diesel::result::DatabaseErrorKind;
use futures::{stream::FuturesUnordered, StreamExt};
use hyper::StatusCode;
use pageserver_api::controller_api::{
NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, NodeSchedulingPolicy,
TenantCreateResponse, TenantCreateResponseShard, TenantLocateResponse,
TenantLocateResponseShard, TenantShardMigrateRequest, TenantShardMigrateResponse,
use pageserver_api::{
controller_api::{
NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, NodeSchedulingPolicy,
TenantCreateResponse, TenantCreateResponseShard, TenantLocateResponse,
TenantLocateResponseShard, TenantShardMigrateRequest, TenantShardMigrateResponse,
},
models::TenantConfigRequest,
};
use pageserver_api::{
models::{
@@ -65,6 +68,11 @@ const SHORT_RECONCILE_TIMEOUT: Duration = Duration::from_secs(5);
// some data in it.
const RECONCILE_TIMEOUT: Duration = Duration::from_secs(30);
// If we receive a call using Secondary mode initially, it will omit generation. We will initialize
// tenant shards into this generation, and as long as it remains in this generation, we will accept
// input generation from future requests as authoritative.
const INITIAL_GENERATION: Generation = Generation::new(0);
/// How long [`Service::startup_reconcile`] is allowed to take before it should give
/// up on unresponsive pageservers and proceed.
pub(crate) const STARTUP_RECONCILE_TIMEOUT: Duration = Duration::from_secs(30);
@@ -167,6 +175,21 @@ impl From<ReconcileWaitError> for ApiError {
}
}
#[allow(clippy::large_enum_variant)]
enum TenantCreateOrUpdate {
Create((TenantCreateRequest, PlacementPolicy)),
Update(Vec<ShardUpdate>),
}
struct ShardUpdate {
tenant_shard_id: TenantShardId,
placement_policy: PlacementPolicy,
tenant_config: TenantConfig,
/// If this is None, generation is not updated.
generation: Option<Generation>,
}
impl Service {
pub fn get_config(&self) -> &Config {
&self.config
@@ -571,6 +594,9 @@ impl Service {
// the shard so that a future [`TenantState::maybe_reconcile`] will try again.
tenant.pending_compute_notification = result.pending_compute_notification;
// Let the TenantState know it is idle.
tenant.reconcile_complete(result.sequence);
match result.result {
Ok(()) => {
for (node_id, loc) in &result.observed.locations {
@@ -661,8 +687,8 @@ impl Service {
// after when pageservers start up and register.
let mut node_ids = HashSet::new();
for tsp in &tenant_shard_persistence {
if tsp.generation_pageserver != i64::MAX {
node_ids.insert(tsp.generation_pageserver);
if let Some(node_id) = tsp.generation_pageserver {
node_ids.insert(node_id);
}
}
for node_id in node_ids {
@@ -699,18 +725,15 @@ impl Service {
// We will populate intent properly later in [`Self::startup_reconcile`], initially populate
// it with what we can infer: the node for which a generation was most recently issued.
let mut intent = IntentState::new();
if tsp.generation_pageserver != i64::MAX {
intent.set_attached(
&mut scheduler,
Some(NodeId(tsp.generation_pageserver as u64)),
);
if let Some(generation_pageserver) = tsp.generation_pageserver {
intent.set_attached(&mut scheduler, Some(NodeId(generation_pageserver as u64)));
}
let new_tenant = TenantState {
tenant_shard_id,
shard: shard_identity,
sequence: Sequence::initial(),
generation: Generation::new(tsp.generation as u32),
generation: tsp.generation.map(|g| Generation::new(g as u32)),
policy: serde_json::from_str(&tsp.placement_policy).unwrap(),
intent,
observed: ObservedState::new(),
@@ -790,8 +813,8 @@ impl Service {
shard_number: attach_req.tenant_shard_id.shard_number.0 as i32,
shard_count: attach_req.tenant_shard_id.shard_count.literal() as i32,
shard_stripe_size: 0,
generation: 0,
generation_pageserver: i64::MAX,
generation: Some(0),
generation_pageserver: None,
placement_policy: serde_json::to_string(&PlacementPolicy::default()).unwrap(),
config: serde_json::to_string(&TenantConfig::default()).unwrap(),
splitting: SplitState::default(),
@@ -846,7 +869,7 @@ impl Service {
.expect("Checked for existence above");
if let Some(new_generation) = new_generation {
tenant_state.generation = new_generation;
tenant_state.generation = Some(new_generation);
} else {
// This is a detach notification. We must update placement policy to avoid re-attaching
// during background scheduling/reconciliation, or during attachment service restart.
@@ -896,7 +919,7 @@ impl Service {
node_id,
ObservedStateLocation {
conf: Some(attached_location_conf(
tenant_state.generation,
tenant_state.generation.unwrap(),
&tenant_state.shard,
&tenant_state.config,
)),
@@ -910,7 +933,7 @@ impl Service {
Ok(AttachHookResponse {
gen: attach_req
.node_id
.map(|_| tenant_state.generation.into().unwrap()),
.map(|_| tenant_state.generation.expect("Test hook, not used on tenants that are mid-onboarding with a NULL generation").into().unwrap()),
})
}
@@ -923,7 +946,7 @@ impl Service {
attachment: tenant_state.and_then(|s| {
s.intent
.get_attached()
.map(|ps| (s.generation.into().unwrap(), ps))
.map(|ps| (s.generation.expect("Test hook, not used on tenants that are mid-onboarding with a NULL generation").into().unwrap(), ps))
}),
}
}
@@ -973,7 +996,17 @@ impl Service {
continue;
};
shard_state.generation = std::cmp::max(shard_state.generation, new_gen);
// If [`Persistence::re_attach`] selected this shard, it must have alread
// had a generation set.
debug_assert!(shard_state.generation.is_some());
let Some(old_gen) = shard_state.generation else {
// Should never happen: would only return incremented generation
// for a tenant that already had a non-null generation.
return Err(ApiError::InternalServerError(anyhow::anyhow!(
"Generation must be set while re-attaching"
)));
};
shard_state.generation = Some(std::cmp::max(old_gen, new_gen));
if let Some(observed) = shard_state
.observed
.locations
@@ -1003,7 +1036,7 @@ impl Service {
for req_tenant in validate_req.tenants {
if let Some(tenant_state) = locked.tenants.get(&req_tenant.id) {
let valid = tenant_state.generation == Generation::new(req_tenant.gen);
let valid = tenant_state.generation == Some(Generation::new(req_tenant.gen));
tracing::info!(
"handle_validate: {}(gen {}): valid={valid} (latest {:?})",
req_tenant.id,
@@ -1030,8 +1063,9 @@ impl Service {
pub(crate) async fn tenant_create(
&self,
create_req: TenantCreateRequest,
placement_policy: PlacementPolicy,
) -> Result<TenantCreateResponse, ApiError> {
let (response, waiters) = self.do_tenant_create(create_req).await?;
let (response, waiters) = self.do_tenant_create(create_req, placement_policy).await?;
self.await_waiters(waiters, SHORT_RECONCILE_TIMEOUT).await?;
Ok(response)
@@ -1040,6 +1074,7 @@ impl Service {
pub(crate) async fn do_tenant_create(
&self,
create_req: TenantCreateRequest,
placement_policy: PlacementPolicy,
) -> Result<(TenantCreateResponse, Vec<ReconcilerWaiter>), ApiError> {
// This service expects to handle sharding itself: it is an error to try and directly create
// a particular shard here.
@@ -1065,9 +1100,27 @@ impl Service {
})
.collect::<Vec<_>>();
// TODO: enable specifying this. Using Single as a default helps legacy tests to work (they
// have no expectation of HA).
let placement_policy: PlacementPolicy = PlacementPolicy::Single;
// If the caller specifies a None generation, it means "start from default". This is different
// to [`Self::tenant_location_config`], where a None generation is used to represent
// an incompletely-onboarded tenant.
let initial_generation = if matches!(placement_policy, PlacementPolicy::Secondary) {
tracing::info!(
"tenant_create: secondary mode, generation is_some={}",
create_req.generation.is_some()
);
create_req.generation.map(Generation::new)
} else {
tracing::info!(
"tenant_create: not secondary mode, generation is_some={}",
create_req.generation.is_some()
);
Some(
create_req
.generation
.map(Generation::new)
.unwrap_or(INITIAL_GENERATION),
)
};
// Ordering: we persist tenant shards before creating them on the pageserver. This enables a caller
// to clean up after themselves by issuing a tenant deletion if something goes wrong and we restart
@@ -1079,8 +1132,10 @@ impl Service {
shard_number: tenant_shard_id.shard_number.0 as i32,
shard_count: tenant_shard_id.shard_count.literal() as i32,
shard_stripe_size: create_req.shard_parameters.stripe_size.0 as i32,
generation: create_req.generation.map(|g| g as i32).unwrap_or(0),
generation_pageserver: i64::MAX,
generation: initial_generation.map(|g| g.into().unwrap() as i32),
// The pageserver is not known until scheduling happens: we will set this column when
// incrementing the generation the first time we attach to a pageserver.
generation_pageserver: None,
placement_policy: serde_json::to_string(&placement_policy).unwrap(),
config: serde_json::to_string(&create_req.config).unwrap(),
splitting: SplitState::default(),
@@ -1120,15 +1175,17 @@ impl Service {
))
})?;
response_shards.push(TenantCreateResponseShard {
shard_id: tenant_shard_id,
node_id: entry
if let Some(node_id) = entry.get().intent.get_attached() {
let generation = entry
.get()
.intent
.get_attached()
.expect("We just set pageserver if it was None"),
generation: entry.get().generation.into().unwrap(),
});
.generation
.expect("Generation is set when in attached mode");
response_shards.push(TenantCreateResponseShard {
shard_id: tenant_shard_id,
node_id: *node_id,
generation: generation.into().unwrap(),
});
}
continue;
}
@@ -1142,9 +1199,7 @@ impl Service {
placement_policy.clone(),
);
if let Some(create_gen) = create_req.generation {
state.generation = Generation::new(create_gen);
}
state.generation = initial_generation;
state.config = create_req.config.clone();
state.schedule(scheduler).map_err(|e| {
@@ -1153,14 +1208,18 @@ impl Service {
))
})?;
response_shards.push(TenantCreateResponseShard {
shard_id: tenant_shard_id,
node_id: state
.intent
.get_attached()
.expect("We just set pageserver if it was None"),
generation: state.generation.into().unwrap(),
});
// Only include shards in result if we are attaching: the purpose
// of the response is to tell the caller where the shards are attached.
if let Some(node_id) = state.intent.get_attached() {
let generation = state
.generation
.expect("Generation is set when in attached mode");
response_shards.push(TenantCreateResponseShard {
shard_id: tenant_shard_id,
node_id: *node_id,
generation: generation.into().unwrap(),
});
}
entry.insert(state)
}
};
@@ -1214,12 +1273,114 @@ impl Service {
Ok(())
}
/// This API is used by the cloud control plane to do coarse-grained control of tenants:
/// - Call with mode Attached* to upsert the tenant.
/// - Call with mode Detached to switch to PolicyMode::Detached
/// Part of [`Self::tenant_location_config`]: dissect an incoming location config request,
/// and transform it into either a tenant creation of a series of shard updates.
fn tenant_location_config_prepare(
&self,
tenant_id: TenantId,
req: TenantLocationConfigRequest,
) -> TenantCreateOrUpdate {
let mut updates = Vec::new();
let mut locked = self.inner.write().unwrap();
let (nodes, tenants, _scheduler) = locked.parts_mut();
// Use location config mode as an indicator of policy.
let placement_policy = match req.config.mode {
LocationConfigMode::Detached => PlacementPolicy::Detached,
LocationConfigMode::Secondary => PlacementPolicy::Secondary,
LocationConfigMode::AttachedMulti
| LocationConfigMode::AttachedSingle
| LocationConfigMode::AttachedStale => {
if nodes.len() > 1 {
PlacementPolicy::Double(1)
} else {
// Convenience for dev/test: if we just have one pageserver, import
// tenants into Single mode so that scheduling will succeed.
PlacementPolicy::Single
}
}
};
let mut create = true;
for (shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) {
// Saw an existing shard: this is not a creation
create = false;
// Shards may have initially been created by a Secondary request, where we
// would have left generation as None.
//
// We only update generation the first time we see an attached-mode request,
// and if there is no existing generation set. The caller is responsible for
// ensuring that no non-storage-controller pageserver ever uses a higher
// generation than they passed in here.
use LocationConfigMode::*;
let set_generation = match req.config.mode {
AttachedMulti | AttachedSingle | AttachedStale if shard.generation.is_none() => {
req.config.generation.map(Generation::new)
}
_ => None,
};
if shard.policy != placement_policy
|| shard.config != req.config.tenant_conf
|| set_generation.is_some()
{
updates.push(ShardUpdate {
tenant_shard_id: *shard_id,
placement_policy: placement_policy.clone(),
tenant_config: req.config.tenant_conf.clone(),
generation: set_generation,
});
}
}
if create {
use LocationConfigMode::*;
let generation = match req.config.mode {
AttachedMulti | AttachedSingle | AttachedStale => req.config.generation,
// If a caller provided a generation in a non-attached request, ignore it
// and leave our generation as None: this enables a subsequent update to set
// the generation when setting an attached mode for the first time.
_ => None,
};
TenantCreateOrUpdate::Create(
// Synthesize a creation request
(
TenantCreateRequest {
new_tenant_id: TenantShardId::unsharded(tenant_id),
generation,
shard_parameters: ShardParameters {
// Must preserve the incoming shard_count do distinguish unsharded (0)
// from single-sharded (1): this distinction appears in the S3 keys of the tenant.
count: req.tenant_id.shard_count,
// We only import un-sharded or single-sharded tenants, so stripe
// size can be made up arbitrarily here.
stripe_size: ShardParameters::DEFAULT_STRIPE_SIZE,
},
config: req.config.tenant_conf,
},
placement_policy,
),
)
} else {
TenantCreateOrUpdate::Update(updates)
}
}
/// This API is used by the cloud control plane to migrate unsharded tenants that it created
/// directly with pageservers into this service.
///
/// In future, calling with mode Secondary may switch to a detach-lite mode in which a tenant only has
/// secondary locations.
/// Cloud control plane MUST NOT continue issuing GENERATION NUMBERS for this tenant once it
/// has attempted to call this API. Failure to oblige to this rule may lead to S3 corruption.
/// Think of the first attempt to call this API as a transfer of absolute authority over the
/// tenant's source of generation numbers.
///
/// The mode in this request coarse-grained control of tenants:
/// - Call with mode Attached* to upsert the tenant.
/// - Call with mode Secondary to either onboard a tenant without attaching it, or
/// to set an existing tenant to PolicyMode::Secondary
/// - Call with mode Detached to switch to PolicyMode::Detached
pub(crate) async fn tenant_location_config(
&self,
tenant_id: TenantId,
@@ -1231,131 +1392,96 @@ impl Service {
)));
}
let mut waiters = Vec::new();
// First check if this is a creation or an update
let create_or_update = self.tenant_location_config_prepare(tenant_id, req);
let mut result = TenantLocationConfigResponse { shards: Vec::new() };
let maybe_create = {
let mut locked = self.inner.write().unwrap();
let result_tx = locked.result_tx.clone();
let compute_hook = locked.compute_hook.clone();
let (nodes, tenants, scheduler) = locked.parts_mut();
let waiters = match create_or_update {
TenantCreateOrUpdate::Create((create_req, placement_policy)) => {
let (create_resp, waiters) =
self.do_tenant_create(create_req, placement_policy).await?;
result.shards = create_resp
.shards
.into_iter()
.map(|s| TenantShardLocation {
node_id: s.node_id,
shard_id: s.shard_id,
})
.collect();
waiters
}
TenantCreateOrUpdate::Update(updates) => {
// Persist updates
// Ordering: write to the database before applying changes in-memory, so that
// we will not appear time-travel backwards on a restart.
for ShardUpdate {
tenant_shard_id,
placement_policy,
tenant_config,
generation,
} in &updates
{
self.persistence
.update_tenant_shard(
*tenant_shard_id,
placement_policy.clone(),
tenant_config.clone(),
*generation,
)
.await?;
}
// Maybe we have existing shards
let mut create = true;
for (shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) {
// Saw an existing shard: this is not a creation
create = false;
// Apply updates in-memory
let mut waiters = Vec::new();
{
let mut locked = self.inner.write().unwrap();
let result_tx = locked.result_tx.clone();
let compute_hook = locked.compute_hook.clone();
let (nodes, tenants, scheduler) = locked.parts_mut();
// Note that for existing tenants we do _not_ respect the generation in the request: this is likely
// to be stale. Once a tenant is created in this service, our view of generation is authoritative, and
// callers' generations may be ignored. This represents a one-way migration of tenants from the outer
// cloud control plane into this service.
for ShardUpdate {
tenant_shard_id,
placement_policy,
tenant_config,
generation: update_generation,
} in updates
{
let Some(shard) = tenants.get_mut(&tenant_shard_id) else {
tracing::warn!("Shard {tenant_shard_id} removed while updating");
continue;
};
// Use location config mode as an indicator of policy: if they ask for
// attached we go to default HA attached mode. If they ask for secondary
// we go to secondary-only mode. If they ask for detached we detach.
match req.config.mode {
LocationConfigMode::Detached => {
shard.policy = PlacementPolicy::Detached;
}
LocationConfigMode::Secondary => {
// TODO: implement secondary-only mode.
todo!();
}
LocationConfigMode::AttachedMulti
| LocationConfigMode::AttachedSingle
| LocationConfigMode::AttachedStale => {
// TODO: persistence for changes in policy
if nodes.len() > 1 {
shard.policy = PlacementPolicy::Double(1)
} else {
// Convenience for dev/test: if we just have one pageserver, import
// tenants into Single mode so that scheduling will succeed.
shard.policy = PlacementPolicy::Single
shard.policy = placement_policy;
shard.config = tenant_config;
if let Some(generation) = update_generation {
shard.generation = Some(generation);
}
shard.schedule(scheduler)?;
let maybe_waiter = shard.maybe_reconcile(
result_tx.clone(),
nodes,
&compute_hook,
&self.config,
&self.persistence,
&self.gate,
&self.cancel,
);
if let Some(waiter) = maybe_waiter {
waiters.push(waiter);
}
if let Some(node_id) = shard.intent.get_attached() {
result.shards.push(TenantShardLocation {
shard_id: tenant_shard_id,
node_id: *node_id,
})
}
}
}
shard.schedule(scheduler)?;
let maybe_waiter = shard.maybe_reconcile(
result_tx.clone(),
nodes,
&compute_hook,
&self.config,
&self.persistence,
&self.gate,
&self.cancel,
);
if let Some(waiter) = maybe_waiter {
waiters.push(waiter);
}
if let Some(node_id) = shard.intent.get_attached() {
result.shards.push(TenantShardLocation {
shard_id: *shard_id,
node_id: *node_id,
})
}
waiters
}
if create {
// Validate request mode
match req.config.mode {
LocationConfigMode::Detached | LocationConfigMode::Secondary => {
// When using this API to onboard an existing tenant to this service, it must start in
// an attached state, because we need the request to come with a generation
return Err(ApiError::BadRequest(anyhow::anyhow!(
"Imported tenant must be in attached mode"
)));
}
LocationConfigMode::AttachedMulti
| LocationConfigMode::AttachedSingle
| LocationConfigMode::AttachedStale => {
// Pass
}
}
// Validate request generation
let Some(generation) = req.config.generation else {
// We can only import attached tenants, because we need the request to come with a generation
return Err(ApiError::BadRequest(anyhow::anyhow!(
"Generation is mandatory when importing tenant"
)));
};
// Synthesize a creation request
Some(TenantCreateRequest {
new_tenant_id: TenantShardId::unsharded(tenant_id),
generation: Some(generation),
shard_parameters: ShardParameters {
// Must preserve the incoming shard_count do distinguish unsharded (0)
// from single-sharded (1): this distinction appears in the S3 keys of the tenant.
count: req.tenant_id.shard_count,
// We only import un-sharded or single-sharded tenants, so stripe
// size can be made up arbitrarily here.
stripe_size: ShardParameters::DEFAULT_STRIPE_SIZE,
},
config: req.config.tenant_conf,
})
} else {
None
}
};
let waiters = if let Some(create_req) = maybe_create {
let (create_resp, waiters) = self.do_tenant_create(create_req).await?;
result.shards = create_resp
.shards
.into_iter()
.map(|s| TenantShardLocation {
node_id: s.node_id,
shard_id: s.shard_id,
})
.collect();
waiters
} else {
waiters
};
if let Err(e) = self.await_waiters(waiters, SHORT_RECONCILE_TIMEOUT).await {
@@ -1375,6 +1501,91 @@ impl Service {
Ok(result)
}
pub(crate) async fn tenant_config_set(&self, req: TenantConfigRequest) -> Result<(), ApiError> {
let tenant_id = req.tenant_id;
let config = req.config;
self.persistence
.update_tenant_config(req.tenant_id, config.clone())
.await?;
let waiters = {
let mut waiters = Vec::new();
let mut locked = self.inner.write().unwrap();
let result_tx = locked.result_tx.clone();
let compute_hook = locked.compute_hook.clone();
let (nodes, tenants, _scheduler) = locked.parts_mut();
for (_shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) {
shard.config = config.clone();
if let Some(waiter) = shard.maybe_reconcile(
result_tx.clone(),
nodes,
&compute_hook,
&self.config,
&self.persistence,
&self.gate,
&self.cancel,
) {
waiters.push(waiter);
}
}
waiters
};
if let Err(e) = self.await_waiters(waiters, SHORT_RECONCILE_TIMEOUT).await {
// Treat this as success because we have stored the configuration. If e.g.
// a node was unavailable at this time, it should not stop us accepting a
// configuration change.
tracing::warn!(%tenant_id, "Accepted configuration update but reconciliation failed: {e}");
}
Ok(())
}
pub(crate) fn tenant_config_get(
&self,
tenant_id: TenantId,
) -> Result<HashMap<&str, serde_json::Value>, ApiError> {
let config = {
let locked = self.inner.read().unwrap();
match locked
.tenants
.range(TenantShardId::tenant_range(tenant_id))
.next()
{
Some((_tenant_shard_id, shard)) => shard.config.clone(),
None => {
return Err(ApiError::NotFound(
anyhow::anyhow!("Tenant not found").into(),
))
}
}
};
// Unlike the pageserver, we do not have a set of global defaults: the config is
// entirely per-tenant. Therefore the distinction between `tenant_specific_overrides`
// and `effective_config` in the response is meaningless, but we retain that syntax
// in order to remain compatible with the pageserver API.
let response = HashMap::from([
(
"tenant_specific_overrides",
serde_json::to_value(&config)
.context("serializing tenant specific overrides")
.map_err(ApiError::InternalServerError)?,
),
(
"effective_config",
serde_json::to_value(&config)
.context("serializing effective config")
.map_err(ApiError::InternalServerError)?,
),
]);
Ok(response)
}
pub(crate) async fn tenant_time_travel_remote_storage(
&self,
time_travel_req: &TenantTimeTravelRequest,
@@ -1460,6 +1671,60 @@ impl Service {
})?;
}
}
Ok(())
}
pub(crate) async fn tenant_secondary_download(
&self,
tenant_id: TenantId,
) -> Result<(), ApiError> {
// Acquire lock and yield the collection of shard-node tuples which we will send requests onward to
let targets = {
let locked = self.inner.read().unwrap();
let mut targets = Vec::new();
for (tenant_shard_id, shard) in
locked.tenants.range(TenantShardId::tenant_range(tenant_id))
{
for node_id in shard.intent.get_secondary() {
let node = locked
.nodes
.get(node_id)
.expect("Pageservers may not be deleted while referenced");
targets.push((*tenant_shard_id, node.clone()));
}
}
targets
};
// TODO: this API, and the underlying pageserver API, should take a timeout argument so that for long running
// downloads, they can return a clean 202 response instead of the HTTP client timing out.
// Issue concurrent requests to all shards' locations
let mut futs = FuturesUnordered::new();
for (tenant_shard_id, node) in targets {
let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref());
futs.push(async move {
let result = client.tenant_secondary_download(tenant_shard_id).await;
(result, node)
})
}
// Handle any errors returned by pageservers. This includes cases like this request racing with
// a scheduling operation, such that the tenant shard we're calling doesn't exist on that pageserver any more, as
// well as more general cases like 503s, 500s, or timeouts.
while let Some((result, node)) = futs.next().await {
let Err(e) = result else { continue };
// Secondary downloads are always advisory: if something fails, we nevertheless report success, so that whoever
// is calling us will proceed with whatever migration they're doing, albeit with a slightly less warm cache
// than they had hoped for.
tracing::warn!(
"Ignoring tenant secondary download error from pageserver {}: {e}",
node.id,
);
}
Ok(())
}
@@ -2039,8 +2304,8 @@ impl Service {
// Note: this generation is a placeholder, [`Persistence::begin_shard_split`] will
// populate the correct generation as part of its transaction, to protect us
// against racing with changes in the state of the parent.
generation: 0,
generation_pageserver: target.node.id.0 as i64,
generation: None,
generation_pageserver: Some(target.node.id.0 as i64),
placement_policy: serde_json::to_string(&policy).unwrap(),
// TODO: get the config out of the map
config: serde_json::to_string(&TenantConfig::default()).unwrap(),
@@ -2161,7 +2426,8 @@ impl Service {
.expect("It was present, we just split it");
let old_attached = old_state.intent.get_attached().unwrap();
old_state.intent.clear(scheduler);
(old_attached, old_state.generation, old_state.config.clone())
let generation = old_state.generation.expect("Shard must have been attached");
(old_attached, generation, old_state.config.clone())
};
for child in child_ids {
@@ -2182,7 +2448,7 @@ impl Service {
child_state.observed = ObservedState {
locations: child_observed,
};
child_state.generation = generation;
child_state.generation = Some(generation);
child_state.config = config.clone();
// The child's TenantState::splitting is intentionally left at the default value of Idle,
@@ -2247,6 +2513,7 @@ impl Service {
match shard.policy {
PlacementPolicy::Single => {
shard.intent.clear_secondary(scheduler);
shard.intent.set_attached(scheduler, Some(migrate_req.node_id));
}
PlacementPolicy::Double(_n) => {
// If our new attached node was a secondary, it no longer should be.
@@ -2256,6 +2523,12 @@ impl Service {
if let Some(old_attached) = old_attached {
shard.intent.push_secondary(scheduler, old_attached);
}
shard.intent.set_attached(scheduler, Some(migrate_req.node_id));
}
PlacementPolicy::Secondary => {
shard.intent.clear(scheduler);
shard.intent.push_secondary(scheduler, migrate_req.node_id);
}
PlacementPolicy::Detached => {
return Err(ApiError::BadRequest(anyhow::anyhow!(
@@ -2263,9 +2536,6 @@ impl Service {
)))
}
}
shard
.intent
.set_attached(scheduler, Some(migrate_req.node_id));
tracing::info!("Migrating: new intent {:?}", shard.intent);
shard.sequence = shard.sequence.next();
@@ -2593,7 +2863,7 @@ impl Service {
observed_loc.conf = None;
}
if tenant_state.intent.notify_offline(config_req.node_id) {
if tenant_state.intent.demote_attached(config_req.node_id) {
tenant_state.sequence = tenant_state.sequence.next();
match tenant_state.schedule(scheduler) {
Err(e) => {
@@ -2660,6 +2930,9 @@ impl Service {
/// Helper for methods that will try and call pageserver APIs for
/// a tenant, such as timeline CRUD: they cannot proceed unless the tenant
/// is attached somewhere.
///
/// TODO: this doesn't actually ensure attached unless the PlacementPolicy is
/// an attached policy. We should error out if it isn't.
fn ensure_attached_schedule(
&self,
mut locked: std::sync::RwLockWriteGuard<'_, ServiceState>,

View File

@@ -53,8 +53,11 @@ pub(crate) struct TenantState {
pub(crate) sequence: Sequence,
// Latest generation number: next time we attach, increment this
// and use the incremented number when attaching
pub(crate) generation: Generation,
// and use the incremented number when attaching.
//
// None represents an incompletely onboarded tenant via the [`Service::location_config`]
// API, where this tenant may only run in PlacementPolicy::Secondary.
pub(crate) generation: Option<Generation>,
// High level description of how the tenant should be set up. Provided
// externally.
@@ -181,6 +184,13 @@ impl IntentState {
}
}
/// Remove the last secondary node from the list of secondaries
pub(crate) fn pop_secondary(&mut self, scheduler: &mut Scheduler) {
if let Some(node_id) = self.secondary.pop() {
scheduler.node_dec_ref(node_id);
}
}
pub(crate) fn clear(&mut self, scheduler: &mut Scheduler) {
if let Some(old_attached) = self.attached.take() {
scheduler.node_dec_ref(old_attached);
@@ -208,11 +218,13 @@ impl IntentState {
&self.secondary
}
/// When a node goes offline, we update intents to avoid using it
/// as their attached pageserver.
/// If the node is in use as the attached location, demote it into
/// the list of secondary locations. This is used when a node goes offline,
/// and we want to use a different node for attachment, but not permanently
/// forget the location on the offline node.
///
/// Returns true if a change was made
pub(crate) fn notify_offline(&mut self, node_id: NodeId) -> bool {
pub(crate) fn demote_attached(&mut self, node_id: NodeId) -> bool {
if self.attached == Some(node_id) {
// TODO: when scheduler starts tracking attached + secondary counts separately, we will
// need to call into it here.
@@ -315,7 +327,7 @@ pub(crate) struct ReconcileResult {
pub(crate) result: Result<(), ReconcileError>,
pub(crate) tenant_shard_id: TenantShardId,
pub(crate) generation: Generation,
pub(crate) generation: Option<Generation>,
pub(crate) observed: ObservedState,
/// Set [`TenantState::pending_compute_notification`] from this flag
@@ -340,7 +352,7 @@ impl TenantState {
tenant_shard_id,
policy,
intent: IntentState::default(),
generation: Generation::new(0),
generation: Some(Generation::new(0)),
shard,
observed: ObservedState::default(),
config: TenantConfig::default(),
@@ -438,10 +450,16 @@ impl TenantState {
// more work on the same pageservers we're already using.
let mut modified = false;
// Add/remove nodes to fulfil policy
use PlacementPolicy::*;
match self.policy {
Single => {
// Should have exactly one attached, and zero secondaries
if !self.intent.secondary.is_empty() {
self.intent.clear_secondary(scheduler);
modified = true;
}
let (modified_attached, _attached_node_id) = self.schedule_attached(scheduler)?;
modified |= modified_attached;
@@ -451,6 +469,23 @@ impl TenantState {
}
}
Double(secondary_count) => {
let retain_secondaries = if self.intent.attached.is_none()
&& scheduler.node_preferred(&self.intent.secondary).is_some()
{
// If we have no attached, and one of the secondaries is elegible to be promoted, retain
// one more secondary than we usually would, as one of them will become attached futher down this function.
secondary_count + 1
} else {
secondary_count
};
while self.intent.secondary.len() > retain_secondaries {
// We have no particular preference for one secondary location over another: just
// arbitrarily drop from the end
self.intent.pop_secondary(scheduler);
modified = true;
}
// Should have exactly one attached, and N secondaries
let (modified_attached, attached_node_id) = self.schedule_attached(scheduler)?;
modified |= modified_attached;
@@ -463,15 +498,28 @@ impl TenantState {
modified = true;
}
}
Detached => {
// Should have no attached or secondary pageservers
if self.intent.attached.is_some() {
self.intent.set_attached(scheduler, None);
Secondary => {
if let Some(node_id) = self.intent.get_attached() {
// Populate secondary by demoting the attached node
self.intent.demote_attached(*node_id);
modified = true;
} else if self.intent.secondary.is_empty() {
// Populate secondary by scheduling a fresh node
let node_id = scheduler.schedule_shard(&[])?;
self.intent.push_secondary(scheduler, node_id);
modified = true;
}
if !self.intent.secondary.is_empty() {
self.intent.clear_secondary(scheduler);
while self.intent.secondary.len() > 1 {
// We have no particular preference for one secondary location over another: just
// arbitrarily drop from the end
self.intent.pop_secondary(scheduler);
modified = true;
}
}
Detached => {
// Never add locations in this mode
if self.intent.get_attached().is_some() || !self.intent.get_secondary().is_empty() {
self.intent.clear(scheduler);
modified = true;
}
}
@@ -518,7 +566,12 @@ impl TenantState {
fn dirty(&self) -> bool {
if let Some(node_id) = self.intent.attached {
let wanted_conf = attached_location_conf(self.generation, &self.shard, &self.config);
// Maybe panic: it is a severe bug if we try to attach while generation is null.
let generation = self
.generation
.expect("Attempted to enter attached state without a generation");
let wanted_conf = attached_location_conf(generation, &self.shard, &self.config);
match self.observed.locations.get(&node_id) {
Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {}
Some(_) | None => {
@@ -596,6 +649,10 @@ impl TenantState {
// Reconcile already in flight for the current sequence?
if let Some(handle) = &self.reconciler {
if handle.sequence == self.sequence {
tracing::info!(
"Reconciliation already in progress for sequence {:?}",
self.sequence,
);
return Some(ReconcilerWaiter {
tenant_shard_id: self.tenant_shard_id,
seq_wait: self.waiter.clone(),
@@ -615,6 +672,10 @@ impl TenantState {
return None;
};
// Advance the sequence before spawning a reconciler, so that sequence waiters
// can distinguish between before+after the reconcile completes.
self.sequence = self.sequence.next();
let reconciler_cancel = cancel.child_token();
let mut reconciler = Reconciler {
tenant_shard_id: self.tenant_shard_id,
@@ -716,6 +777,17 @@ impl TenantState {
})
}
/// Called when a ReconcileResult has been emitted and the service is updating
/// our state: if the result is from a sequence >= my ReconcileHandle, then drop
/// the handle to indicate there is no longer a reconciliation in progress.
pub(crate) fn reconcile_complete(&mut self, sequence: Sequence) {
if let Some(reconcile_handle) = &self.reconciler {
if reconcile_handle.sequence <= sequence {
self.reconciler = None;
}
}
}
// If we had any state at all referring to this node ID, drop it. Does not
// attempt to reschedule.
pub(crate) fn deref_node(&mut self, node_id: NodeId) {
@@ -736,13 +808,8 @@ impl TenantState {
shard_number: self.tenant_shard_id.shard_number.0 as i32,
shard_count: self.tenant_shard_id.shard_count.literal() as i32,
shard_stripe_size: self.shard.stripe_size.0 as i32,
generation: self.generation.into().unwrap_or(0) as i32,
generation_pageserver: self
.intent
.get_attached()
.map(|n| n.0 as i64)
.unwrap_or(i64::MAX),
generation: self.generation.map(|g| g.into().unwrap_or(0) as i32),
generation_pageserver: self.intent.get_attached().map(|n| n.0 as i64),
placement_policy: serde_json::to_string(&self.policy).unwrap(),
config: serde_json::to_string(&self.config).unwrap(),
splitting: SplitState::default(),
@@ -805,8 +872,10 @@ pub(crate) mod tests {
assert_ne!(attached_node_id, secondary_node_id);
// Notifying the attached node is offline should demote it to a secondary
let changed = tenant_state.intent.notify_offline(attached_node_id);
let changed = tenant_state.intent.demote_attached(attached_node_id);
assert!(changed);
assert!(tenant_state.intent.attached.is_none());
assert_eq!(tenant_state.intent.secondary.len(), 2);
// Update the scheduler state to indicate the node is offline
nodes.get_mut(&attached_node_id).unwrap().availability = NodeAvailability::Offline;

View File

@@ -200,7 +200,7 @@ impl AttachmentService {
"localhost",
"-p",
&format!("{}", self.postgres_port),
&DB_NAME,
DB_NAME,
])
.output()
.await

View File

@@ -605,7 +605,7 @@ impl Endpoint {
let conn_str = self.connstr("cloud_admin", "postgres");
println!("Starting postgres node at '{}'", conn_str);
if create_test_user {
let conn_str = self.connstr("user", "neondb");
let conn_str = self.connstr("test", "neondb");
println!("Also at '{}'", conn_str);
}
let mut cmd = Command::new(self.env.neon_distrib_dir.join("compute_ctl"));

View File

@@ -14,7 +14,6 @@ use byteorder::{BigEndian, ReadBytesExt};
use postgres_ffi::BLCKSZ;
use serde::{Deserialize, Serialize};
use serde_with::serde_as;
use strum_macros;
use utils::{
completion,
history_buffer::HistoryBufferWithDropCounter,
@@ -1077,7 +1076,6 @@ impl PagestreamBeMessage {
#[cfg(test)]
mod tests {
use bytes::Buf;
use serde_json::json;
use super::*;

View File

@@ -6,7 +6,6 @@ use crate::{
};
use hex::FromHex;
use serde::{Deserialize, Serialize};
use thiserror;
use utils::id::TenantId;
#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
@@ -656,10 +655,7 @@ fn key_to_shard_number(count: ShardCount, stripe_size: ShardStripeSize, key: &Ke
#[cfg(test)]
mod tests {
use std::str::FromStr;
use bincode;
use utils::{id::TenantId, Hex};
use utils::Hex;
use super::*;

View File

@@ -623,9 +623,7 @@ fn file_exists(file_path: &Utf8Path) -> anyhow::Result<bool> {
mod fs_tests {
use super::*;
use bytes::Bytes;
use camino_tempfile::tempdir;
use futures_util::Stream;
use std::{collections::HashMap, io::Write};
async fn read_and_check_metadata(

View File

@@ -1040,7 +1040,7 @@ mod tests {
Some("test/prefix/"),
Some("/test/prefix/"),
];
let expected_outputs = vec![
let expected_outputs = [
vec!["", "some/path", "some/path"],
vec!["/", "/some/path", "/some/path"],
vec![

View File

@@ -1,7 +1,6 @@
// For details about authentication see docs/authentication.md
use arc_swap::ArcSwap;
use serde;
use std::{borrow::Cow, fmt::Display, fs, sync::Arc};
use anyhow::Result;

View File

@@ -4,7 +4,9 @@ use tokio_util::task::{task_tracker::TaskTrackerToken, TaskTracker};
///
/// Can be cloned, moved and kept around in futures as "guard objects".
#[derive(Clone)]
pub struct Completion(TaskTrackerToken);
pub struct Completion {
_token: TaskTrackerToken,
}
/// Barrier will wait until all clones of [`Completion`] have been dropped.
#[derive(Clone)]
@@ -49,5 +51,5 @@ pub fn channel() -> (Completion, Barrier) {
tracker.close();
let token = tracker.token();
(Completion(token), Barrier(tracker))
(Completion { _token: token }, Barrier(tracker))
}

View File

@@ -45,7 +45,7 @@ impl Generation {
Self::Broken
}
pub fn new(v: u32) -> Self {
pub const fn new(v: u32) -> Self {
Self::Valid(v)
}

View File

@@ -9,7 +9,7 @@ use metrics::{register_int_counter, Encoder, IntCounter, TextEncoder};
use once_cell::sync::Lazy;
use routerify::ext::RequestExt;
use routerify::{Middleware, RequestInfo, Router, RouterBuilder};
use tracing::{self, debug, info, info_span, warn, Instrument};
use tracing::{debug, info, info_span, warn, Instrument};
use std::future::Future;
use std::str::FromStr;
@@ -156,6 +156,10 @@ pub struct ChannelWriter {
buffer: BytesMut,
pub tx: mpsc::Sender<std::io::Result<Bytes>>,
written: usize,
/// Time spent waiting for the channel to make progress. It is not the same as time to upload a
/// buffer because we cannot know anything about that, but this should allow us to understand
/// the actual time taken without the time spent `std::thread::park`ed.
wait_time: std::time::Duration,
}
impl ChannelWriter {
@@ -168,6 +172,7 @@ impl ChannelWriter {
buffer: BytesMut::with_capacity(buf_len).split_off(buf_len / 2),
tx,
written: 0,
wait_time: std::time::Duration::ZERO,
}
}
@@ -180,6 +185,8 @@ impl ChannelWriter {
tracing::trace!(n, "flushing");
let ready = self.buffer.split().freeze();
let wait_started_at = std::time::Instant::now();
// not ideal to call from blocking code to block_on, but we are sure that this
// operation does not spawn_blocking other tasks
let res: Result<(), ()> = tokio::runtime::Handle::current().block_on(async {
@@ -192,6 +199,9 @@ impl ChannelWriter {
// sending it to the client.
Ok(())
});
self.wait_time += wait_started_at.elapsed();
if res.is_err() {
return Err(std::io::ErrorKind::BrokenPipe.into());
}
@@ -202,6 +212,10 @@ impl ChannelWriter {
pub fn flushed_bytes(&self) -> usize {
self.written
}
pub fn wait_time(&self) -> std::time::Duration {
self.wait_time
}
}
impl std::io::Write for ChannelWriter {
@@ -252,22 +266,52 @@ async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<Body
let span = info_span!("blocking");
tokio::task::spawn_blocking(move || {
// there are situations where we lose scraped metrics under load, try to gather some clues
// since all nodes are queried this, keep the message count low.
let spawned_at = std::time::Instant::now();
let _span = span.entered();
let metrics = metrics::gather();
let gathered_at = std::time::Instant::now();
let res = encoder
.encode(&metrics, &mut writer)
.and_then(|_| writer.flush().map_err(|e| e.into()));
// this instant is not when we finally got the full response sent, sending is done by hyper
// in another task.
let encoded_at = std::time::Instant::now();
let spawned_in = spawned_at - started_at;
let collected_in = gathered_at - spawned_at;
// remove the wait time here in case the tcp connection was clogged
let encoded_in = encoded_at - gathered_at - writer.wait_time();
let total = encoded_at - started_at;
match res {
Ok(()) => {
tracing::info!(
bytes = writer.flushed_bytes(),
elapsed_ms = started_at.elapsed().as_millis(),
total_ms = total.as_millis(),
spawning_ms = spawned_in.as_millis(),
collection_ms = collected_in.as_millis(),
encoding_ms = encoded_in.as_millis(),
"responded /metrics"
);
}
Err(e) => {
tracing::warn!("failed to write out /metrics response: {e:#}");
// there is a chance that this error is not the BrokenPipe we generate in the writer
// for "closed connection", but it is highly unlikely.
tracing::warn!(
after_bytes = writer.flushed_bytes(),
total_ms = total.as_millis(),
spawning_ms = spawned_in.as_millis(),
collection_ms = collected_in.as_millis(),
encoding_ms = encoded_in.as_millis(),
"failed to write out /metrics response: {e:?}"
);
// semantics of this error are quite... unclear. we want to error the stream out to
// abort the response to somehow notify the client that we failed.
//

View File

@@ -415,7 +415,6 @@ mod tests {
use super::*;
use serde::ser::Serialize;
use serde_assert::{Deserializer, Serializer, Token, Tokens};
#[test]

View File

@@ -1,6 +1,6 @@
#![warn(missing_docs)]
use std::cmp::{Eq, Ordering, PartialOrd};
use std::cmp::{Eq, Ordering};
use std::collections::BinaryHeap;
use std::fmt::Debug;
use std::mem;
@@ -249,7 +249,6 @@ where
mod tests {
use super::*;
use std::sync::Arc;
use std::time::Duration;
impl MonotonicCounter<i32> for i32 {
fn cnt_advance(&mut self, val: i32) {

View File

@@ -221,7 +221,7 @@ impl RcuWaitList {
#[cfg(test)]
mod tests {
use super::*;
use std::sync::{Arc, Mutex};
use std::sync::Mutex;
use std::time::Duration;
#[tokio::test]

View File

@@ -239,7 +239,6 @@ mod tests {
use std::{
convert::Infallible,
pin::{pin, Pin},
sync::atomic::{AtomicUsize, Ordering},
time::Duration,
};

View File

@@ -6,7 +6,6 @@ use futures::future::BoxFuture;
use futures::{Stream, StreamExt};
use itertools::Itertools;
use pin_project_lite::pin_project;
use std::cmp::Ord;
use std::collections::BinaryHeap;
use std::collections::VecDeque;
use std::future::Future;

View File

@@ -20,7 +20,6 @@ use std::num::NonZeroUsize;
use std::str::FromStr;
use std::sync::Arc;
use std::time::Duration;
use toml_edit;
use toml_edit::{Document, Item};
use camino::{Utf8Path, Utf8PathBuf};
@@ -212,9 +211,9 @@ pub struct PageServerConf {
pub log_format: LogFormat,
/// Number of tenants which will be concurrently loaded from remote storage proactively on startup,
/// does not limit tenants loaded in response to client I/O. A lower value implicitly deprioritizes
/// loading such tenants, vs. other work in the system.
/// Number of tenants which will be concurrently loaded from remote storage proactively on startup or attach.
///
/// A lower value implicitly deprioritizes loading such tenants, vs. other work in the system.
pub concurrent_tenant_warmup: ConfigurableSemaphore,
/// Number of concurrent [`Tenant::gather_size_inputs`](crate::tenant::Tenant::gather_size_inputs) allowed.
@@ -1203,10 +1202,7 @@ impl ConfigurableSemaphore {
#[cfg(test)]
mod tests {
use std::{
fs,
num::{NonZeroU32, NonZeroUsize},
};
use std::{fs, num::NonZeroU32};
use camino_tempfile::{tempdir, Utf8TempDir};
use pageserver_api::models::EvictionPolicy;

View File

@@ -1,7 +1,5 @@
use super::*;
use std::collections::HashMap;
use std::time::SystemTime;
use utils::lsn::Lsn;
#[test]
fn startup_collected_timeline_metrics_before_advancing() {

View File

@@ -20,10 +20,9 @@ use remote_storage::{GenericRemoteStorage, RemotePath};
use serde::Deserialize;
use serde::Serialize;
use thiserror::Error;
use tokio;
use tokio_util::sync::CancellationToken;
use tracing::Instrument;
use tracing::{self, debug, error};
use tracing::{debug, error};
use utils::crashsafe::path_with_suffix_extension;
use utils::generation::Generation;
use utils::id::TimelineId;
@@ -726,7 +725,7 @@ mod test {
use camino::Utf8Path;
use hex_literal::hex;
use pageserver_api::shard::ShardIndex;
use std::{io::ErrorKind, time::Duration};
use std::io::ErrorKind;
use tracing::info;
use remote_storage::{RemoteStorageConfig, RemoteStorageKind};
@@ -735,10 +734,7 @@ mod test {
use crate::{
control_plane_client::RetryForeverError,
repository::Key,
tenant::{
harness::TenantHarness, remote_timeline_client::remote_timeline_path,
storage_layer::DeltaFileName,
},
tenant::{harness::TenantHarness, storage_layer::DeltaFileName},
};
use super::*;
@@ -1161,13 +1157,8 @@ mod test {
pub(crate) mod mock {
use tracing::info;
use crate::tenant::remote_timeline_client::remote_layer_path;
use super::*;
use std::sync::{
atomic::{AtomicUsize, Ordering},
Arc,
};
use std::sync::atomic::{AtomicUsize, Ordering};
pub struct ConsumerState {
rx: tokio::sync::mpsc::UnboundedReceiver<ListWriterQueueMessage>,

View File

@@ -58,6 +58,7 @@ use utils::{completion, id::TimelineId};
use crate::{
config::PageServerConf,
metrics::disk_usage_based_eviction::METRICS,
task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
tenant::{
self,
@@ -65,7 +66,6 @@ use crate::{
remote_timeline_client::LayerFileMetadata,
secondary::SecondaryTenant,
storage_layer::{AsLayerDesc, EvictionError, Layer, LayerFileName},
Timeline,
},
};
@@ -409,13 +409,23 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
"running disk usage based eviction due to pressure"
);
let candidates =
let (candidates, collection_time) = {
let started_at = std::time::Instant::now();
match collect_eviction_candidates(tenant_manager, eviction_order, cancel).await? {
EvictionCandidates::Cancelled => {
return Ok(IterationOutcome::Cancelled);
}
EvictionCandidates::Finished(partitioned) => partitioned,
};
EvictionCandidates::Finished(partitioned) => (partitioned, started_at.elapsed()),
}
};
METRICS.layers_collected.inc_by(candidates.len() as u64);
tracing::info!(
elapsed_ms = collection_time.as_millis(),
total_layers = candidates.len(),
"collection completed"
);
// Debug-log the list of candidates
let now = SystemTime::now();
@@ -446,9 +456,10 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
// the tenant's min-resident-size threshold, print a warning, and memorize the disk
// usage at that point, in 'usage_planned_min_resident_size_respecting'.
let selection = select_victims(&candidates, usage_pre);
let (evicted_amount, usage_planned) =
select_victims(&candidates, usage_pre).into_amount_and_planned();
let (evicted_amount, usage_planned) = selection.into_amount_and_planned();
METRICS.layers_selected.inc_by(evicted_amount as u64);
// phase2: evict layers
@@ -477,9 +488,15 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
if let Some(next) = next {
match next {
Ok(Ok(file_size)) => {
METRICS.layers_evicted.inc();
usage_assumed.add_available_bytes(file_size);
}
Ok(Err((file_size, EvictionError::NotFound | EvictionError::Downloaded))) => {
Ok(Err((
file_size,
EvictionError::NotFound
| EvictionError::Downloaded
| EvictionError::Timeout,
))) => {
evictions_failed.file_sizes += file_size;
evictions_failed.count += 1;
}
@@ -495,7 +512,10 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
// calling again when consumed_all is fine as evicted is fused.
let Some((_partition, candidate)) = evicted.next() else {
consumed_all = true;
if !consumed_all {
tracing::info!("all evictions started, waiting");
consumed_all = true;
}
continue;
};
@@ -503,11 +523,15 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
EvictionLayer::Attached(layer) => {
let file_size = layer.layer_desc().file_size;
js.spawn(async move {
layer
.evict_and_wait()
.await
.map(|()| file_size)
.map_err(|e| (file_size, e))
// have a low eviction waiting timeout because our LRU calculations go stale fast;
// also individual layer evictions could hang because of bugs and we do not want to
// pause disk_usage_based_eviction for such.
let timeout = std::time::Duration::from_secs(5);
match layer.evict_and_wait(timeout).await {
Ok(()) => Ok(file_size),
Err(e) => Err((file_size, e)),
}
});
}
EvictionLayer::Secondary(layer) => {
@@ -529,6 +553,30 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
(usage_assumed, evictions_failed)
};
let started_at = std::time::Instant::now();
let evict_layers = async move {
let mut evict_layers = std::pin::pin!(evict_layers);
let maximum_expected = std::time::Duration::from_secs(10);
let res = tokio::time::timeout(maximum_expected, &mut evict_layers).await;
let tuple = if let Ok(tuple) = res {
tuple
} else {
let elapsed = started_at.elapsed();
tracing::info!(elapsed_ms = elapsed.as_millis(), "still ongoing");
evict_layers.await
};
let elapsed = started_at.elapsed();
tracing::info!(elapsed_ms = elapsed.as_millis(), "completed");
tuple
};
let evict_layers =
evict_layers.instrument(tracing::info_span!("evict_layers", layers=%evicted_amount));
let (usage_assumed, evictions_failed) = tokio::select! {
tuple = evict_layers => { tuple },
_ = cancel.cancelled() => {
@@ -763,6 +811,8 @@ async fn collect_eviction_candidates(
eviction_order: EvictionOrder,
cancel: &CancellationToken,
) -> anyhow::Result<EvictionCandidates> {
const LOG_DURATION_THRESHOLD: std::time::Duration = std::time::Duration::from_secs(10);
// get a snapshot of the list of tenants
let tenants = tenant::mgr::list_tenants()
.await
@@ -791,6 +841,8 @@ async fn collect_eviction_candidates(
continue;
}
let started_at = std::time::Instant::now();
// collect layers from all timelines in this tenant
//
// If one of the timelines becomes `!is_active()` during the iteration,
@@ -805,6 +857,7 @@ async fn collect_eviction_candidates(
}
let info = tl.get_local_layers_for_disk_usage_eviction().await;
debug!(tenant_id=%tl.tenant_shard_id.tenant_id, shard_id=%tl.tenant_shard_id.shard_slug(), timeline_id=%tl.timeline_id, "timeline resident layers count: {}", info.resident_layers.len());
tenant_candidates.extend(info.resident_layers.into_iter());
max_layer_size = max_layer_size.max(info.max_layer_size.unwrap_or(0));
@@ -870,7 +923,25 @@ async fn collect_eviction_candidates(
(partition, candidate)
});
METRICS
.tenant_layer_count
.observe(tenant_candidates.len() as f64);
candidates.extend(tenant_candidates);
let elapsed = started_at.elapsed();
METRICS
.tenant_collection_time
.observe(elapsed.as_secs_f64());
if elapsed > LOG_DURATION_THRESHOLD {
tracing::info!(
tenant_id=%tenant.tenant_shard_id().tenant_id,
shard_id=%tenant.tenant_shard_id().shard_slug(),
elapsed_ms = elapsed.as_millis(),
"collection took longer than threshold"
);
}
}
// Note: the same tenant ID might be hit twice, if it transitions from attached to
@@ -885,11 +956,11 @@ async fn collect_eviction_candidates(
},
);
for secondary_tenant in secondary_tenants {
for tenant in secondary_tenants {
// for secondary tenants we use a sum of on_disk layers and already evicted layers. this is
// to prevent repeated disk usage based evictions from completely draining less often
// updating secondaries.
let (mut layer_info, total_layers) = secondary_tenant.get_layers_for_eviction();
let (mut layer_info, total_layers) = tenant.get_layers_for_eviction();
debug_assert!(
total_layers >= layer_info.resident_layers.len(),
@@ -897,6 +968,8 @@ async fn collect_eviction_candidates(
layer_info.resident_layers.len()
);
let started_at = std::time::Instant::now();
layer_info
.resident_layers
.sort_unstable_by_key(|layer_info| std::cmp::Reverse(layer_info.last_activity_ts));
@@ -918,9 +991,27 @@ async fn collect_eviction_candidates(
)
});
METRICS
.tenant_layer_count
.observe(tenant_candidates.len() as f64);
candidates.extend(tenant_candidates);
tokio::task::yield_now().await;
let elapsed = started_at.elapsed();
METRICS
.tenant_collection_time
.observe(elapsed.as_secs_f64());
if elapsed > LOG_DURATION_THRESHOLD {
tracing::info!(
tenant_id=%tenant.tenant_shard_id().tenant_id,
shard_id=%tenant.tenant_shard_id().shard_slug(),
elapsed_ms = elapsed.as_millis(),
"collection took longer than threshold"
);
}
}
debug_assert!(MinResidentSizePartition::Above < MinResidentSizePartition::Below,
@@ -997,30 +1088,6 @@ impl<U: Usage> VictimSelection<U> {
}
}
struct TimelineKey(Arc<Timeline>);
impl PartialEq for TimelineKey {
fn eq(&self, other: &Self) -> bool {
Arc::ptr_eq(&self.0, &other.0)
}
}
impl Eq for TimelineKey {}
impl std::hash::Hash for TimelineKey {
fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
Arc::as_ptr(&self.0).hash(state);
}
}
impl std::ops::Deref for TimelineKey {
type Target = Timeline;
fn deref(&self) -> &Self::Target {
self.0.as_ref()
}
}
/// A totally ordered f32 subset we can use with sorting functions.
pub(crate) mod finite_f32 {

View File

@@ -579,6 +579,12 @@ paths:
required: false
schema:
type: integer
- name: lazy
in: query
required: false
schema:
type: boolean
description: Set to true for attaches to queue up until activated by compute. Eager (false) is the default.
put:
description: |
Configures a _tenant location_, that is how a particular pageserver handles

View File

@@ -816,13 +816,7 @@ async fn tenant_attach_handler(
let tenant = state
.tenant_manager
.upsert_location(
tenant_shard_id,
location_conf,
None,
SpawnMode::Normal,
&ctx,
)
.upsert_location(tenant_shard_id, location_conf, None, SpawnMode::Eager, &ctx)
.await?;
let Some(tenant) = tenant else {
@@ -1418,6 +1412,7 @@ async fn put_tenant_location_config_handler(
let request_data: TenantLocationConfigRequest = json_request(&mut request).await?;
let flush = parse_query_param(&request, "flush_ms")?.map(Duration::from_millis);
let lazy = parse_query_param(&request, "lazy")?.unwrap_or(false);
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
@@ -1448,15 +1443,17 @@ async fn put_tenant_location_config_handler(
let location_conf =
LocationConf::try_from(&request_data.config).map_err(ApiError::BadRequest)?;
// lazy==true queues up for activation or jumps the queue like normal when a compute connects,
// similar to at startup ordering.
let spawn_mode = if lazy {
tenant::SpawnMode::Lazy
} else {
tenant::SpawnMode::Eager
};
let attached = state
.tenant_manager
.upsert_location(
tenant_shard_id,
location_conf,
flush,
tenant::SpawnMode::Normal,
&ctx,
)
.upsert_location(tenant_shard_id, location_conf, flush, spawn_mode, &ctx)
.await?
.is_some();

View File

@@ -1915,17 +1915,16 @@ impl Drop for TimelineMetrics {
let tenant_id = &self.tenant_id;
let timeline_id = &self.timeline_id;
let shard_id = &self.shard_id;
let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, shard_id, timeline_id]);
{
RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get());
let _ =
RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
}
let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
if let Some(metric) = Lazy::get(&DIRECTORY_ENTRIES_COUNT) {
let _ = metric.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
let _ = metric.remove_label_values(&[tenant_id, shard_id, timeline_id]);
}
let _ = EVICTIONS.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
let _ = EVICTIONS.remove_label_values(&[tenant_id, shard_id, timeline_id]);
self.evictions_with_low_residence_duration
.write()
@@ -2474,6 +2473,64 @@ pub(crate) mod tenant_throttling {
}
}
pub(crate) mod disk_usage_based_eviction {
use super::*;
pub(crate) struct Metrics {
pub(crate) tenant_collection_time: Histogram,
pub(crate) tenant_layer_count: Histogram,
pub(crate) layers_collected: IntCounter,
pub(crate) layers_selected: IntCounter,
pub(crate) layers_evicted: IntCounter,
}
impl Default for Metrics {
fn default() -> Self {
let tenant_collection_time = register_histogram!(
"pageserver_disk_usage_based_eviction_tenant_collection_seconds",
"Time spent collecting layers from a tenant -- not normalized by collected layer amount",
vec![0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0]
)
.unwrap();
let tenant_layer_count = register_histogram!(
"pageserver_disk_usage_based_eviction_tenant_collected_layers",
"Amount of layers gathered from a tenant",
vec![5.0, 50.0, 500.0, 5000.0, 50000.0]
)
.unwrap();
let layers_collected = register_int_counter!(
"pageserver_disk_usage_based_eviction_collected_layers_total",
"Amount of layers collected"
)
.unwrap();
let layers_selected = register_int_counter!(
"pageserver_disk_usage_based_eviction_select_layers_total",
"Amount of layers selected"
)
.unwrap();
let layers_evicted = register_int_counter!(
"pageserver_disk_usage_based_eviction_evicted_layers_total",
"Amount of layers successfully evicted"
)
.unwrap();
Self {
tenant_collection_time,
tenant_layer_count,
layers_collected,
layers_selected,
layers_evicted,
}
}
}
pub(crate) static METRICS: Lazy<Metrics> = Lazy::new(Metrics::default);
}
pub fn preinitialize_metrics() {
// Python tests need these and on some we do alerting.
//
@@ -2508,6 +2565,7 @@ pub fn preinitialize_metrics() {
Lazy::force(&TENANT_MANAGER);
Lazy::force(&crate::tenant::storage_layer::layer::LAYER_IMPL_METRICS);
Lazy::force(&disk_usage_based_eviction::METRICS);
// countervecs
[&BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT]

View File

@@ -73,7 +73,6 @@
use std::{
collections::{hash_map::Entry, HashMap},
convert::TryInto,
sync::{
atomic::{AtomicU64, AtomicU8, AtomicUsize, Ordering},
Arc, Weak,
@@ -262,7 +261,9 @@ pub struct PageCache {
size_metrics: &'static PageCacheSizeMetrics,
}
struct PinnedSlotsPermit(tokio::sync::OwnedSemaphorePermit);
struct PinnedSlotsPermit {
_permit: tokio::sync::OwnedSemaphorePermit,
}
///
/// PageReadGuard is a "lease" on a buffer, for reading. The page is kept locked
@@ -558,9 +559,9 @@ impl PageCache {
)
.await
{
Ok(res) => Ok(PinnedSlotsPermit(
res.expect("this semaphore is never closed"),
)),
Ok(res) => Ok(PinnedSlotsPermit {
_permit: res.expect("this semaphore is never closed"),
}),
Err(_timeout) => {
crate::metrics::page_cache_errors_inc(
crate::metrics::PageCacheErrorKind::AcquirePinnedSlotTimeout,

View File

@@ -27,7 +27,7 @@ use pageserver_api::models::{
};
use pageserver_api::shard::ShardIndex;
use pageserver_api::shard::ShardNumber;
use postgres_backend::{self, is_expected_io_error, AuthType, PostgresBackend, QueryError};
use postgres_backend::{is_expected_io_error, AuthType, PostgresBackend, QueryError};
use pq_proto::framed::ConnectionError;
use pq_proto::FeStartupPacket;
use pq_proto::{BeMessage, FeMessage, RowDescriptor};
@@ -44,7 +44,6 @@ use tokio::io::AsyncWriteExt;
use tokio::io::{AsyncRead, AsyncWrite};
use tokio_util::io::StreamReader;
use tokio_util::sync::CancellationToken;
use tracing::field;
use tracing::*;
use utils::id::ConnectionId;
use utils::sync::gate::GateGuard;
@@ -1115,7 +1114,10 @@ impl PageServerHandler {
ctx: &RequestContext,
) -> Result<PagestreamBeMessage, PageStreamError> {
let timeline = match self.get_cached_timeline_for_page(req) {
Ok(tl) => tl,
Ok(tl) => {
set_tracing_field_shard_id(tl);
tl
}
Err(key) => {
match self
.load_timeline_for_page(tenant_id, timeline_id, key)
@@ -1140,9 +1142,6 @@ impl PageServerHandler {
}
};
// load_timeline_for_page sets shard_id, but get_cached_timeline_for_page doesn't
set_tracing_field_shard_id(timeline);
let _timer = timeline
.query_metrics
.start_timer(metrics::SmgrQueryType::GetPageAtLsn);

View File

@@ -37,7 +37,6 @@ impl Value {
mod test {
use super::*;
use bytes::Bytes;
use utils::bin_ser::BeSer;
macro_rules! roundtrip {

View File

@@ -109,7 +109,6 @@ pub use pageserver_api::models::TenantState;
use tokio::sync::Semaphore;
static INIT_DB_SEMAPHORE: Lazy<Semaphore> = Lazy::new(|| Semaphore::new(8));
use toml_edit;
use utils::{
crashsafe,
generation::Generation,
@@ -227,7 +226,11 @@ pub(crate) struct TenantPreload {
/// When we spawn a tenant, there is a special mode for tenant creation that
/// avoids trying to read anything from remote storage.
pub(crate) enum SpawnMode {
Normal,
/// Activate as soon as possible
Eager,
/// Lazy activation in the background, with the option to skip the queue if the need comes up
Lazy,
/// Tenant has been created during the lifetime of this process
Create,
}
@@ -700,41 +703,37 @@ impl Tenant {
.and_then(|x| x.initial_tenant_load_remote.take());
enum AttachType<'a> {
// During pageserver startup, we are attaching this tenant lazily in the background
Warmup(tokio::sync::SemaphorePermit<'a>),
// During pageserver startup, we are attaching this tenant as soon as we can,
// because a client tried to access it.
/// We are attaching this tenant lazily in the background.
Warmup {
_permit: tokio::sync::SemaphorePermit<'a>,
during_startup: bool
},
/// We are attaching this tenant as soon as we can, because for example an
/// endpoint tried to access it.
OnDemand,
// During normal operations after startup, we are attaching a tenant.
/// During normal operations after startup, we are attaching a tenant, and
/// eager attach was requested.
Normal,
}
// Before doing any I/O, wait for either or:
// - A client to attempt to access to this tenant (on-demand loading)
// - A permit to become available in the warmup semaphore (background warmup)
//
// Some-ness of init_order is how we know if we're attaching during startup or later
// in process lifetime.
let attach_type = if init_order.is_some() {
let attach_type = if matches!(mode, SpawnMode::Lazy) {
// Before doing any I/O, wait for at least one of:
// - A client attempting to access to this tenant (on-demand loading)
// - A permit becoming available in the warmup semaphore (background warmup)
tokio::select!(
_ = tenant_clone.activate_now_sem.acquire() => {
permit = tenant_clone.activate_now_sem.acquire() => {
let _ = permit.expect("activate_now_sem is never closed");
tracing::info!("Activating tenant (on-demand)");
AttachType::OnDemand
},
permit_result = conf.concurrent_tenant_warmup.inner().acquire() => {
match permit_result {
Ok(p) => {
tracing::info!("Activating tenant (warmup)");
AttachType::Warmup(p)
}
Err(_) => {
// This is unexpected: the warmup semaphore should stay alive
// for the lifetime of init_order. Log a warning and proceed.
tracing::warn!("warmup_limit semaphore unexpectedly closed");
AttachType::Normal
}
permit = conf.concurrent_tenant_warmup.inner().acquire() => {
let _permit = permit.expect("concurrent_tenant_warmup semaphore is never closed");
tracing::info!("Activating tenant (warmup)");
AttachType::Warmup {
_permit,
during_startup: init_order.is_some()
}
}
_ = tenant_clone.cancel.cancelled() => {
// This is safe, but should be pretty rare: it is interesting if a tenant
@@ -749,6 +748,8 @@ impl Tenant {
},
)
} else {
// SpawnMode::{Create,Eager} always cause jumping ahead of the
// concurrent_tenant_warmup queue
AttachType::Normal
};
@@ -756,7 +757,7 @@ impl Tenant {
(SpawnMode::Create, _) => {
None
},
(SpawnMode::Normal, Some(remote_storage)) => {
(SpawnMode::Eager | SpawnMode::Lazy, Some(remote_storage)) => {
let _preload_timer = TENANT.preload.start_timer();
let res = tenant_clone
.preload(remote_storage, task_mgr::shutdown_token())
@@ -769,7 +770,7 @@ impl Tenant {
}
}
}
(SpawnMode::Normal, None) => {
(_, None) => {
let _preload_timer = TENANT.preload.start_timer();
None
}
@@ -828,7 +829,7 @@ impl Tenant {
let attached = {
let _attach_timer = match mode {
SpawnMode::Create => None,
SpawnMode::Normal => {Some(TENANT.attach.start_timer())}
SpawnMode::Eager | SpawnMode::Lazy => Some(TENANT.attach.start_timer()),
};
tenant_clone.attach(preload, mode, &ctx).await
};
@@ -850,7 +851,7 @@ impl Tenant {
// It also prevents the warmup proccess competing with the concurrency limit on
// logical size calculations: if logical size calculation semaphore is saturated,
// then warmup will wait for that before proceeding to the next tenant.
if let AttachType::Warmup(_permit) = attach_type {
if matches!(attach_type, AttachType::Warmup { during_startup: true, .. }) {
let mut futs: FuturesUnordered<_> = tenant_clone.timelines.lock().unwrap().values().cloned().map(|t| t.await_initial_logical_size()).collect();
tracing::info!("Waiting for initial logical sizes while warming up...");
while futs.next().await.is_some() {}
@@ -923,7 +924,7 @@ impl Tenant {
deleting: false,
timelines: HashMap::new(),
},
(None, SpawnMode::Normal) => {
(None, _) => {
anyhow::bail!("local-only deployment is no longer supported, https://github.com/neondatabase/neon/issues/5624");
}
};
@@ -2382,7 +2383,7 @@ impl Tenant {
self.tenant_shard_id,
self.generation,
self.shard_identity,
self.walredo_mgr.as_ref().map(Arc::clone),
self.walredo_mgr.clone(),
resources,
pg_version,
state,
@@ -3591,25 +3592,18 @@ pub async fn dump_layerfile_from_path(
#[cfg(test)]
pub(crate) mod harness {
use bytes::{Bytes, BytesMut};
use camino::Utf8PathBuf;
use once_cell::sync::OnceCell;
use pageserver_api::models::ShardParameters;
use pageserver_api::shard::ShardIndex;
use std::fs;
use std::sync::Arc;
use utils::logging;
use utils::lsn::Lsn;
use crate::deletion_queue::mock::MockDeletionQueue;
use crate::walredo::apply_neon;
use crate::{
config::PageServerConf, repository::Key, tenant::Tenant, walrecord::NeonWalRecord,
};
use crate::{repository::Key, walrecord::NeonWalRecord};
use super::*;
use crate::tenant::config::{TenantConf, TenantConfOpt};
use hex_literal::hex;
use utils::id::{TenantId, TimelineId};
use utils::id::TenantId;
pub const TIMELINE_ID: TimelineId =
TimelineId::from_array(hex!("11223344556677881122334455667788"));
@@ -3769,7 +3763,7 @@ pub(crate) mod harness {
let preload = tenant
.preload(&self.remote_storage, CancellationToken::new())
.await?;
tenant.attach(Some(preload), SpawnMode::Normal, ctx).await?;
tenant.attach(Some(preload), SpawnMode::Eager, ctx).await?;
tenant.state.send_replace(TenantState::Active);
for timeline in tenant.timelines.lock().unwrap().values() {
@@ -3838,10 +3832,8 @@ mod tests {
use crate::DEFAULT_PG_VERSION;
use bytes::BytesMut;
use hex_literal::hex;
use once_cell::sync::Lazy;
use pageserver_api::keyspace::KeySpace;
use rand::{thread_rng, Rng};
use tokio_util::sync::CancellationToken;
static TEST_KEY: Lazy<Key> =
Lazy::new(|| Key::from_slice(&hex!("010000000033333333444444445500000001")));

View File

@@ -52,7 +52,10 @@ pub mod defaults {
pub const DEFAULT_PITR_INTERVAL: &str = "7 days";
pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "10 seconds";
pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "10 seconds";
pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 10 * 1024 * 1024;
// The default limit on WAL lag should be set to avoid causing disconnects under high throughput
// scenarios: since the broker stats are updated ~1/s, a value of 1GiB should be sufficient for
// throughputs up to 1GiB/s per timeline.
pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 1024 * 1024 * 1024;
pub const DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD: &str = "24 hour";
pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;

View File

@@ -420,7 +420,7 @@ impl DeleteTenantFlow {
.expect("cant be stopping or broken");
tenant
.attach(preload, super::SpawnMode::Normal, ctx)
.attach(preload, super::SpawnMode::Eager, ctx)
.await
.context("attach")?;

View File

@@ -21,7 +21,6 @@
use byteorder::{ReadBytesExt, BE};
use bytes::{BufMut, Bytes, BytesMut};
use either::Either;
use hex;
use std::{cmp::Ordering, io, result};
use thiserror::Error;
use tracing::error;
@@ -700,8 +699,6 @@ impl<const L: usize> BuildNode<L> {
#[cfg(test)]
pub(crate) mod tests {
use super::*;
use crate::context::DownloadBehavior;
use crate::task_mgr::TaskKind;
use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReaderRef};
use rand::Rng;
use std::collections::BTreeMap;

View File

@@ -300,7 +300,7 @@ mod tests {
use super::*;
use crate::context::DownloadBehavior;
use crate::task_mgr::TaskKind;
use crate::tenant::block_io::{BlockCursor, BlockReaderRef};
use crate::tenant::block_io::BlockReaderRef;
use rand::{thread_rng, RngCore};
use std::fs;
use std::str::FromStr;

View File

@@ -595,7 +595,7 @@ pub async fn init_tenant_mgr(
shard_identity,
Some(init_order.clone()),
&TENANTS,
SpawnMode::Normal,
SpawnMode::Lazy,
&ctx,
) {
Ok(tenant) => {
@@ -1106,9 +1106,9 @@ impl TenantManager {
// Edge case: if we were called with SpawnMode::Create, but a Tenant already existed, then
// the caller thinks they're creating but the tenant already existed. We must switch to
// Normal mode so that when starting this Tenant we properly probe remote storage for timelines,
// Eager mode so that when starting this Tenant we properly probe remote storage for timelines,
// rather than assuming it to be empty.
spawn_mode = SpawnMode::Normal;
spawn_mode = SpawnMode::Eager;
}
Some(TenantSlot::Secondary(state)) => {
info!("Shutting down secondary tenant");
@@ -1300,7 +1300,7 @@ impl TenantManager {
shard_identity,
None,
self.tenants,
SpawnMode::Normal,
SpawnMode::Eager,
ctx,
)?;
@@ -1521,7 +1521,7 @@ impl TenantManager {
*child_shard,
child_location_conf,
None,
SpawnMode::Normal,
SpawnMode::Eager,
ctx,
)
.await?;
@@ -2064,7 +2064,7 @@ pub(crate) async fn load_tenant(
shard_identity,
None,
&TENANTS,
SpawnMode::Normal,
SpawnMode::Eager,
ctx,
)
.with_context(|| format!("Failed to schedule tenant processing in path {tenant_path:?}"))?;
@@ -2648,7 +2648,7 @@ pub(crate) async fn immediate_gc(
let tenant = guard
.get(&tenant_shard_id)
.map(Arc::clone)
.cloned()
.with_context(|| format!("tenant {tenant_shard_id}"))
.map_err(|e| ApiError::NotFound(e.into()))?;

View File

@@ -1791,14 +1791,12 @@ mod tests {
context::RequestContext,
tenant::{
harness::{TenantHarness, TIMELINE_ID},
storage_layer::Layer,
Generation, Tenant, Timeline,
Tenant, Timeline,
},
DEFAULT_PG_VERSION,
};
use std::collections::HashSet;
use utils::lsn::Lsn;
pub(super) fn dummy_contents(name: &str) -> Vec<u8> {
format!("contents for {name}").into()

View File

@@ -161,7 +161,7 @@ pub async fn download_layer_file<'a>(
const TEMP_DOWNLOAD_EXTENSION: &str = "temp_download";
pub fn is_temp_download_file(path: &Utf8Path) -> bool {
pub(crate) fn is_temp_download_file(path: &Utf8Path) -> bool {
let extension = path.extension();
match extension {
Some(TEMP_DOWNLOAD_EXTENSION) => true,

View File

@@ -32,7 +32,7 @@ use remote_storage::GenericRemoteStorage;
use tokio_util::sync::CancellationToken;
use tracing::instrument;
use utils::{completion::Barrier, fs_ext, id::TimelineId, sync::gate::Gate};
use utils::{completion::Barrier, id::TimelineId, sync::gate::Gate};
enum DownloadCommand {
Download(TenantShardId),
@@ -121,6 +121,10 @@ impl SecondaryTenant {
})
}
pub(crate) fn tenant_shard_id(&self) -> TenantShardId {
self.tenant_shard_id
}
pub(crate) async fn shutdown(&self) {
self.cancel.cancel();
@@ -164,16 +168,17 @@ impl SecondaryTenant {
self.detail.lock().unwrap().get_layers_for_eviction(self)
}
/// Cancellation safe, but on cancellation the eviction will go through
#[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%timeline_id, name=%name))]
pub(crate) async fn evict_layer(
&self,
self: &Arc<Self>,
conf: &PageServerConf,
timeline_id: TimelineId,
name: LayerFileName,
) {
debug_assert_current_span_has_tenant_id();
let _guard = match self.gate.enter() {
let guard = match self.gate.enter() {
Ok(g) => g,
Err(_) => {
tracing::debug!("Dropping layer evictions, secondary tenant shutting down",);
@@ -187,35 +192,57 @@ impl SecondaryTenant {
.timeline_path(&self.tenant_shard_id, &timeline_id)
.join(name.file_name());
// We tolerate ENOENT, because between planning eviction and executing
// it, the secondary downloader could have seen an updated heatmap that
// resulted in a layer being deleted.
// Other local I/O errors are process-fatal: these should never happen.
tokio::fs::remove_file(path)
.await
.or_else(fs_ext::ignore_not_found)
.fatal_err("Deleting layer during eviction");
let this = self.clone();
// Update the timeline's state. This does not have to be synchronized with
// the download process, because:
// - If downloader is racing with us to remove a file (e.g. because it is
// removed from heatmap), then our mutual .remove() operations will both
// succeed.
// - If downloader is racing with us to download the object (this would require
// multiple eviction iterations to race with multiple download iterations), then
// if we remove it from the state, the worst that happens is the downloader
// downloads it again before re-inserting, or we delete the file but it remains
// in the state map (in which case it will be downloaded if this secondary
// tenant transitions to attached and tries to access it)
//
// The important assumption here is that the secondary timeline state does not
// have to 100% match what is on disk, because it's a best-effort warming
// of the cache.
let mut detail = self.detail.lock().unwrap();
if let Some(timeline_detail) = detail.timelines.get_mut(&timeline_id) {
timeline_detail.on_disk_layers.remove(&name);
timeline_detail.evicted_at.insert(name, now);
}
// spawn it to be cancellation safe
tokio::task::spawn_blocking(move || {
let _guard = guard;
// We tolerate ENOENT, because between planning eviction and executing
// it, the secondary downloader could have seen an updated heatmap that
// resulted in a layer being deleted.
// Other local I/O errors are process-fatal: these should never happen.
let deleted = std::fs::remove_file(path);
let not_found = deleted
.as_ref()
.is_err_and(|x| x.kind() == std::io::ErrorKind::NotFound);
let deleted = if not_found {
false
} else {
deleted
.map(|()| true)
.fatal_err("Deleting layer during eviction")
};
if !deleted {
// skip updating accounting and putting perhaps later timestamp
return;
}
// Update the timeline's state. This does not have to be synchronized with
// the download process, because:
// - If downloader is racing with us to remove a file (e.g. because it is
// removed from heatmap), then our mutual .remove() operations will both
// succeed.
// - If downloader is racing with us to download the object (this would require
// multiple eviction iterations to race with multiple download iterations), then
// if we remove it from the state, the worst that happens is the downloader
// downloads it again before re-inserting, or we delete the file but it remains
// in the state map (in which case it will be downloaded if this secondary
// tenant transitions to attached and tries to access it)
//
// The important assumption here is that the secondary timeline state does not
// have to 100% match what is on disk, because it's a best-effort warming
// of the cache.
let mut detail = this.detail.lock().unwrap();
if let Some(timeline_detail) = detail.timelines.get_mut(&timeline_id) {
timeline_detail.on_disk_layers.remove(&name);
timeline_detail.evicted_at.insert(name, now);
}
})
.await
.expect("secondary eviction should not have panicked");
}
}

View File

@@ -16,7 +16,8 @@ use crate::{
config::SecondaryLocationConfig,
debug_assert_current_span_has_tenant_and_timeline_id,
remote_timeline_client::{
index::LayerFileMetadata, FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES,
index::LayerFileMetadata, is_temp_download_file, FAILED_DOWNLOAD_WARN_THRESHOLD,
FAILED_REMOTE_OP_RETRIES,
},
span::debug_assert_current_span_has_tenant_id,
storage_layer::LayerFileName,
@@ -788,7 +789,7 @@ async fn init_timeline_state(
// Secondary mode doesn't use local metadata files, but they might have been left behind by an attached tenant.
warn!(path=?dentry.path(), "found legacy metadata file, these should have been removed in load_tenant_config");
continue;
} else if crate::is_temporary(&file_path) {
} else if crate::is_temporary(&file_path) || is_temp_download_file(&file_path) {
// Temporary files are frequently left behind from restarting during downloads
tracing::info!("Cleaning up temporary file {file_path}");
if let Err(e) = tokio::fs::remove_file(&file_path)

View File

@@ -18,7 +18,6 @@ use crate::{
};
use futures::Future;
use md5;
use pageserver_api::shard::TenantShardId;
use rand::Rng;
use remote_storage::{GenericRemoteStorage, TimeoutOrCancel};

View File

@@ -72,7 +72,7 @@ where
/// the same ValueReconstructState struct in the next 'get_value_reconstruct_data'
/// call, to collect more records.
///
#[derive(Debug)]
#[derive(Debug, Default)]
pub struct ValueReconstructState {
pub records: Vec<(Lsn, NeonWalRecord)>,
pub img: Option<(Lsn, Bytes)>,

View File

@@ -43,7 +43,6 @@ use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX};
use anyhow::{anyhow, bail, ensure, Context, Result};
use bytes::{Bytes, BytesMut};
use camino::{Utf8Path, Utf8PathBuf};
use hex;
use pageserver_api::keyspace::KeySpace;
use pageserver_api::models::LayerAccessKind;
use pageserver_api::shard::TenantShardId;

View File

@@ -8,7 +8,7 @@ use pageserver_api::shard::ShardIndex;
use std::ops::Range;
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
use std::sync::{Arc, Weak};
use std::time::SystemTime;
use std::time::{Duration, SystemTime};
use tracing::Instrument;
use utils::lsn::Lsn;
use utils::sync::heavier_once_cell;
@@ -208,10 +208,15 @@ impl Layer {
/// If for a bad luck or blocking of the executor, we miss the actual eviction and the layer is
/// re-downloaded, [`EvictionError::Downloaded`] is returned.
///
/// Timeout is mandatory, because waiting for eviction is only needed for our tests; eviction
/// will happen regardless the future returned by this method completing unless there is a
/// read access (currently including [`Layer::keep_resident`]) before eviction gets to
/// complete.
///
/// Technically cancellation safe, but cancelling might shift the viewpoint of what generation
/// of download-evict cycle on retry.
pub(crate) async fn evict_and_wait(&self) -> Result<(), EvictionError> {
self.0.evict_and_wait().await
pub(crate) async fn evict_and_wait(&self, timeout: Duration) -> Result<(), EvictionError> {
self.0.evict_and_wait(timeout).await
}
/// Delete the layer file when the `self` gets dropped, also try to schedule a remote index upload
@@ -363,7 +368,7 @@ impl Layer {
///
/// Does not start local deletion, use [`Self::delete_on_drop`] for that
/// separatedly.
#[cfg(feature = "testing")]
#[cfg(any(feature = "testing", test))]
pub(crate) fn wait_drop(&self) -> impl std::future::Future<Output = ()> + 'static {
let mut rx = self.0.status.subscribe();
@@ -632,7 +637,7 @@ impl LayerInner {
/// Cancellation safe, however dropping the future and calling this method again might result
/// in a new attempt to evict OR join the previously started attempt.
pub(crate) async fn evict_and_wait(&self) -> Result<(), EvictionError> {
pub(crate) async fn evict_and_wait(&self, timeout: Duration) -> Result<(), EvictionError> {
use tokio::sync::broadcast::error::RecvError;
assert!(self.have_remote_client);
@@ -652,16 +657,22 @@ impl LayerInner {
if strong.is_some() {
// drop the DownloadedLayer outside of the holding the guard
drop(strong);
// idea here is that only one evicter should ever get to witness a strong reference,
// which means whenever get_or_maybe_download upgrades a weak, it must mark up a
// cancelled eviction and signal us, like it currently does.
//
// a second concurrent evict_and_wait will not see a strong reference.
LAYER_IMPL_METRICS.inc_started_evictions();
}
match rx.recv().await {
Ok(Status::Evicted) => Ok(()),
Ok(Status::Downloaded) => Err(EvictionError::Downloaded),
Err(RecvError::Closed) => {
match tokio::time::timeout(timeout, rx.recv()).await {
Ok(Ok(Status::Evicted)) => Ok(()),
Ok(Ok(Status::Downloaded)) => Err(EvictionError::Downloaded),
Ok(Err(RecvError::Closed)) => {
unreachable!("sender cannot be dropped while we are in &self method")
}
Err(RecvError::Lagged(_)) => {
Ok(Err(RecvError::Lagged(_))) => {
// this is quite unlikely, but we are blocking a lot in the async context, so
// we might be missing this because we are stuck on a LIFO slot on a thread
// which is busy blocking for a 1TB database create_image_layers.
@@ -674,6 +685,7 @@ impl LayerInner {
None => Ok(()),
}
}
Err(_timeout) => Err(EvictionError::Timeout),
}
}
@@ -1195,6 +1207,9 @@ pub(crate) enum EvictionError {
/// Evictions must always lose to downloads in races, and this time it happened.
#[error("layer was downloaded instead")]
Downloaded,
#[error("eviction did not happen within timeout")]
Timeout,
}
/// Error internal to the [`LayerInner::get_or_maybe_download`]

View File

@@ -1,13 +1,173 @@
use futures::StreamExt;
use pageserver_api::key::CONTROLFILE_KEY;
use tokio::task::JoinSet;
use tracing::Instrument;
use utils::{
completion::{self, Completion},
id::TimelineId,
};
use super::*;
use crate::task_mgr::BACKGROUND_RUNTIME;
use crate::tenant::harness::TenantHarness;
use crate::{context::DownloadBehavior, task_mgr::BACKGROUND_RUNTIME};
use crate::{task_mgr::TaskKind, tenant::harness::TenantHarness};
/// Used in tests to advance a future to wanted await point, and not futher.
const ADVANCE: std::time::Duration = std::time::Duration::from_secs(3600);
/// Used in tests to indicate forever long timeout; has to be longer than the amount of ADVANCE
/// timeout uses to advance futures.
const FOREVER: std::time::Duration = std::time::Duration::from_secs(ADVANCE.as_secs() * 24 * 7);
/// Demonstrate the API and resident -> evicted -> resident -> deleted transitions.
#[tokio::test]
async fn smoke_test() {
let handle = BACKGROUND_RUNTIME.handle();
let h = TenantHarness::create("smoke_test").unwrap();
let span = h.span();
let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1));
let (tenant, _) = h.load().await;
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Download);
let timeline = tenant
.create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
.await
.unwrap();
let layer = {
let mut layers = {
let layers = timeline.layers.read().await;
layers.resident_layers().collect::<Vec<_>>().await
};
assert_eq!(layers.len(), 1);
layers.swap_remove(0)
};
// all layers created at pageserver are like `layer`, initialized with strong
// Arc<DownloadedLayer>.
let img_before = {
let mut data = ValueReconstructState::default();
layer
.get_value_reconstruct_data(CONTROLFILE_KEY, Lsn(0x10)..Lsn(0x11), &mut data, &ctx)
.await
.unwrap();
data.img
.take()
.expect("tenant harness writes the control file")
};
// important part is evicting the layer, which can be done when there are no more ResidentLayer
// instances -- there currently are none, only two `Layer` values, one in the layermap and on
// in scope.
layer.evict_and_wait(FOREVER).await.unwrap();
// double-evict returns an error, which is valid if both eviction_task and disk usage based
// eviction would both evict the same layer at the same time.
let e = layer.evict_and_wait(FOREVER).await.unwrap_err();
assert!(matches!(e, EvictionError::NotFound));
// on accesses when the layer is evicted, it will automatically be downloaded.
let img_after = {
let mut data = ValueReconstructState::default();
layer
.get_value_reconstruct_data(CONTROLFILE_KEY, Lsn(0x10)..Lsn(0x11), &mut data, &ctx)
.instrument(download_span.clone())
.await
.unwrap();
data.img.take().unwrap()
};
assert_eq!(img_before, img_after);
// evict_and_wait can timeout, but it doesn't cancel the evicting itself
//
// ZERO for timeout does not work reliably, so first take up all spawn_blocking slots to
// artificially slow it down.
let helper = SpawnBlockingPoolHelper::consume_all_spawn_blocking_threads(handle).await;
match layer
.evict_and_wait(std::time::Duration::ZERO)
.await
.unwrap_err()
{
EvictionError::Timeout => {
// expected, but note that the eviction is "still ongoing"
helper.release().await;
// exhaust spawn_blocking pool to ensure it is now complete
SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(handle)
.await;
}
other => unreachable!("{other:?}"),
}
// only way to query if a layer is resident is to acquire a ResidentLayer instance.
// Layer::keep_resident never downloads, but it might initialize if the layer file is found
// downloaded locally.
let none = layer.keep_resident().await.unwrap();
assert!(
none.is_none(),
"Expected none, because eviction removed the local file, found: {none:?}"
);
// plain downloading is rarely needed
layer
.download_and_keep_resident()
.instrument(download_span)
.await
.unwrap();
// last important part is deletion on drop: gc and compaction use it for compacted L0 layers
// or fully garbage collected layers. deletion means deleting the local file, and scheduling a
// deletion of the already unlinked from index_part.json remote file.
//
// marking a layer to be deleted on drop is irreversible; there is no technical reason against
// reversiblity, but currently it is not needed so it is not provided.
layer.delete_on_drop();
let path = layer.local_path().to_owned();
// wait_drop produces an unconnected to Layer future which will resolve when the
// LayerInner::drop has completed.
let mut wait_drop = std::pin::pin!(layer.wait_drop());
// paused time doesn't really work well with timeouts and evict_and_wait, so delay pausing
// until here
tokio::time::pause();
tokio::time::timeout(ADVANCE, &mut wait_drop)
.await
.expect_err("should had timed out because two strong references exist");
tokio::fs::metadata(&path)
.await
.expect("the local layer file still exists");
let rtc = timeline.remote_client.as_ref().unwrap();
{
let layers = &[layer];
let mut g = timeline.layers.write().await;
g.finish_gc_timeline(layers);
// this just updates the remote_physical_size for demonstration purposes
rtc.schedule_gc_update(layers).unwrap();
}
// when strong references are dropped, the file is deleted and remote deletion is scheduled
wait_drop.await;
let e = tokio::fs::metadata(&path)
.await
.expect_err("the local file is deleted");
assert_eq!(e.kind(), std::io::ErrorKind::NotFound);
rtc.wait_completion().await.unwrap();
assert_eq!(rtc.get_remote_physical_size(), 0);
}
/// This test demonstrates a previous hang when a eviction and deletion were requested at the same
/// time. Now both of them complete per Arc drop semantics.
@@ -41,10 +201,10 @@ async fn evict_and_wait_on_wanted_deleted() {
let resident = layer.keep_resident().await.unwrap();
{
let mut evict_and_wait = std::pin::pin!(layer.evict_and_wait());
let mut evict_and_wait = std::pin::pin!(layer.evict_and_wait(FOREVER));
// drive the future to await on the status channel
tokio::time::timeout(std::time::Duration::from_secs(3600), &mut evict_and_wait)
tokio::time::timeout(ADVANCE, &mut evict_and_wait)
.await
.expect_err("should had been a timeout since we are holding the layer resident");
@@ -115,10 +275,10 @@ async fn residency_check_while_evict_and_wait_on_clogged_spawn_blocking() {
let resident = layer.keep_resident().await.unwrap();
let mut evict_and_wait = std::pin::pin!(layer.evict_and_wait());
let mut evict_and_wait = std::pin::pin!(layer.evict_and_wait(FOREVER));
// drive the future to await on the status channel
tokio::time::timeout(std::time::Duration::from_secs(3600), &mut evict_and_wait)
tokio::time::timeout(ADVANCE, &mut evict_and_wait)
.await
.expect_err("should had been a timeout since we are holding the layer resident");
assert_eq!(1, LAYER_IMPL_METRICS.started_evictions.get());
@@ -138,7 +298,7 @@ async fn residency_check_while_evict_and_wait_on_clogged_spawn_blocking() {
// because the keep_resident check alters wanted evicted without sending a message, we will
// never get completed
let e = tokio::time::timeout(std::time::Duration::from_secs(3600), &mut evict_and_wait)
let e = tokio::time::timeout(ADVANCE, &mut evict_and_wait)
.await
.expect("no timeout, because keep_resident re-initialized")
.expect_err("eviction should not have succeeded because re-initialized");
@@ -158,9 +318,10 @@ async fn residency_check_while_evict_and_wait_on_clogged_spawn_blocking() {
.sum::<u64>()
);
let mut second_eviction = std::pin::pin!(layer.evict_and_wait());
let mut second_eviction = std::pin::pin!(layer.evict_and_wait(FOREVER));
tokio::time::timeout(std::time::Duration::from_secs(3600), &mut second_eviction)
// advance to the wait on the queue
tokio::time::timeout(ADVANCE, &mut second_eviction)
.await
.expect_err("timeout because spawn_blocking is clogged");
@@ -171,7 +332,12 @@ async fn residency_check_while_evict_and_wait_on_clogged_spawn_blocking() {
helper.release().await;
tokio::time::timeout(std::time::Duration::from_secs(3600), &mut second_eviction)
// the second_eviction gets to run here
//
// synchronize to be *strictly* after the second_eviction spawn_blocking run
SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(handle).await;
tokio::time::timeout(ADVANCE, &mut second_eviction)
.await
.expect("eviction goes through now that spawn_blocking is unclogged")
.expect("eviction should succeed, because version matches");
@@ -261,3 +427,49 @@ impl SpawnBlockingPoolHelper {
.await
}
}
#[test]
fn spawn_blocking_pool_helper_actually_works() {
// create a custom runtime for which we know and control how many blocking threads it has
//
// because the amount is not configurable for our helper, expect the same amount as
// BACKGROUND_RUNTIME using the tokio defaults would have.
let rt = tokio::runtime::Builder::new_current_thread()
.max_blocking_threads(512)
.enable_all()
.build()
.unwrap();
let handle = rt.handle();
rt.block_on(async move {
// this will not return until all threads are spun up and actually executing the code
// waiting on `consumed` to be `SpawnBlockingPoolHelper::release`'d.
let consumed = SpawnBlockingPoolHelper::consume_all_spawn_blocking_threads(handle).await;
println!("consumed");
let mut jh = std::pin::pin!(tokio::task::spawn_blocking(move || {
// this will not get to run before we release
}));
println!("spawned");
tokio::time::timeout(std::time::Duration::from_secs(1), &mut jh)
.await
.expect_err("the task should not have gotten to run yet");
println!("tried to join");
consumed.release().await;
println!("released");
tokio::time::timeout(std::time::Duration::from_secs(1), jh)
.await
.expect("no timeout")
.expect("no join error");
println!("joined");
});
}

View File

@@ -1512,10 +1512,14 @@ impl Timeline {
return Ok(None);
};
match local_layer.evict_and_wait().await {
// curl has this by default
let timeout = std::time::Duration::from_secs(120);
match local_layer.evict_and_wait(timeout).await {
Ok(()) => Ok(Some(true)),
Err(EvictionError::NotFound) => Ok(Some(false)),
Err(EvictionError::Downloaded) => Ok(Some(false)),
Err(EvictionError::Timeout) => Ok(Some(false)),
}
}
}
@@ -5157,8 +5161,7 @@ mod tests {
let harness =
TenantHarness::create("two_layer_eviction_attempts_at_the_same_time").unwrap();
let ctx = any_context();
let tenant = harness.do_try_load(&ctx).await.unwrap();
let (tenant, ctx) = harness.load().await;
let timeline = tenant
.create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
.await
@@ -5172,8 +5175,10 @@ mod tests {
.expect("should had been resident")
.drop_eviction_guard();
let first = async { layer.evict_and_wait().await };
let second = async { layer.evict_and_wait().await };
let forever = std::time::Duration::from_secs(120);
let first = layer.evict_and_wait(forever);
let second = layer.evict_and_wait(forever);
let (first, second) = tokio::join!(first, second);
@@ -5192,12 +5197,6 @@ mod tests {
}
}
fn any_context() -> crate::context::RequestContext {
use crate::context::*;
use crate::task_mgr::*;
RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error)
}
async fn find_some_layer(timeline: &Timeline) -> Layer {
let layers = timeline.layers.read().await;
let desc = layers

View File

@@ -75,14 +75,13 @@ impl Timeline {
let keyspace = self.collect_keyspace(end_lsn, ctx).await?;
let mut adaptor = TimelineAdaptor::new(self, (end_lsn, keyspace));
let ctx_adaptor = RequestContextAdaptor(ctx.clone());
pageserver_compaction::compact_tiered::compact_tiered(
&mut adaptor,
end_lsn,
target_file_size,
fanout,
&ctx_adaptor,
ctx,
)
.await?;
@@ -143,13 +142,13 @@ impl CompactionJobExecutor for TimelineAdaptor {
type DeltaLayer = ResidentDeltaLayer;
type ImageLayer = ResidentImageLayer;
type RequestContext = RequestContextAdaptor;
type RequestContext = crate::context::RequestContext;
async fn get_layers(
&mut self,
key_range: &Range<Key>,
lsn_range: &Range<Lsn>,
_ctx: &RequestContextAdaptor,
_ctx: &RequestContext,
) -> anyhow::Result<Vec<OwnArc<PersistentLayerDesc>>> {
self.flush_updates().await?;
@@ -170,7 +169,7 @@ impl CompactionJobExecutor for TimelineAdaptor {
&mut self,
key_range: &Range<Key>,
lsn: Lsn,
_ctx: &RequestContextAdaptor,
_ctx: &RequestContext,
) -> anyhow::Result<Vec<Range<Key>>> {
if lsn == self.keyspace.0 {
Ok(pageserver_compaction::helpers::intersect_keyspace(
@@ -206,7 +205,7 @@ impl CompactionJobExecutor for TimelineAdaptor {
&mut self,
lsn: Lsn,
key_range: &Range<Key>,
ctx: &RequestContextAdaptor,
ctx: &RequestContext,
) -> anyhow::Result<()> {
Ok(self.create_image_impl(lsn, key_range, ctx).await?)
}
@@ -216,7 +215,7 @@ impl CompactionJobExecutor for TimelineAdaptor {
lsn_range: &Range<Lsn>,
key_range: &Range<Key>,
input_layers: &[ResidentDeltaLayer],
ctx: &RequestContextAdaptor,
ctx: &RequestContext,
) -> anyhow::Result<()> {
debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end);
@@ -287,7 +286,7 @@ impl CompactionJobExecutor for TimelineAdaptor {
async fn delete_layer(
&mut self,
layer: &OwnArc<PersistentLayerDesc>,
_ctx: &RequestContextAdaptor,
_ctx: &RequestContext,
) -> anyhow::Result<()> {
self.layers_to_delete.push(layer.clone().0);
Ok(())
@@ -299,7 +298,7 @@ impl TimelineAdaptor {
&mut self,
lsn: Lsn,
key_range: &Range<Key>,
ctx: &RequestContextAdaptor,
ctx: &RequestContext,
) -> Result<(), PageReconstructError> {
let timer = self.timeline.metrics.create_images_time_histo.start_timer();
@@ -361,17 +360,7 @@ impl TimelineAdaptor {
}
}
pub struct RequestContextAdaptor(pub RequestContext);
impl std::ops::Deref for RequestContextAdaptor {
type Target = RequestContext;
fn deref(&self) -> &Self::Target {
&self.0
}
}
impl CompactionRequestContext for RequestContextAdaptor {}
impl CompactionRequestContext for crate::context::RequestContext {}
#[derive(Debug, Clone)]
pub struct OwnArc<T>(pub Arc<T>);
@@ -449,10 +438,7 @@ impl CompactionLayer<Key> for ResidentDeltaLayer {
impl CompactionDeltaLayer<TimelineAdaptor> for ResidentDeltaLayer {
type DeltaEntry<'a> = DeltaEntry<'a>;
async fn load_keys<'a>(
&self,
ctx: &RequestContextAdaptor,
) -> anyhow::Result<Vec<DeltaEntry<'_>>> {
async fn load_keys<'a>(&self, ctx: &RequestContext) -> anyhow::Result<Vec<DeltaEntry<'_>>> {
self.0.load_keys(ctx).await
}
}

View File

@@ -204,6 +204,7 @@ impl Timeline {
evicted: usize,
errors: usize,
not_evictable: usize,
timeouts: usize,
#[allow(dead_code)]
skipped_for_shutdown: usize,
}
@@ -267,7 +268,11 @@ impl Timeline {
let layer = guard.drop_eviction_guard();
if no_activity_for > p.threshold {
// this could cause a lot of allocations in some cases
js.spawn(async move { layer.evict_and_wait().await });
js.spawn(async move {
layer
.evict_and_wait(std::time::Duration::from_secs(5))
.await
});
stats.candidates += 1;
}
}
@@ -280,6 +285,9 @@ impl Timeline {
Ok(Err(EvictionError::NotFound | EvictionError::Downloaded)) => {
stats.not_evictable += 1;
}
Ok(Err(EvictionError::Timeout)) => {
stats.timeouts += 1;
}
Err(je) if je.is_cancelled() => unreachable!("not used"),
Err(je) if je.is_panic() => {
/* already logged */
@@ -295,7 +303,8 @@ impl Timeline {
stats = join_all => {
if stats.candidates == stats.not_evictable {
debug!(stats=?stats, "eviction iteration complete");
} else if stats.errors > 0 || stats.not_evictable > 0 {
} else if stats.errors > 0 || stats.not_evictable > 0 || stats.timeouts > 0 {
// reminder: timeouts are not eviction cancellations
warn!(stats=?stats, "eviction iteration complete");
} else {
info!(stats=?stats, "eviction iteration complete");

View File

@@ -1667,8 +1667,6 @@ mod tests {
use super::*;
use crate::tenant::harness::*;
use crate::tenant::remote_timeline_client::{remote_initdb_archive_path, INITDB_PATH};
use crate::tenant::Timeline;
use postgres_ffi::v14::xlog_utils::SIZEOF_CHECKPOINT;
use postgres_ffi::RELSEG_SIZE;
use crate::DEFAULT_PG_VERSION;

View File

@@ -252,8 +252,6 @@ mod test {
use super::*;
use std::collections::HashMap;
use crate::{pgdatadir_mapping::AuxFilesDirectory, walrecord::NeonWalRecord};
/// Test [`apply_in_neon`]'s handling of NeonWalRecord::AuxFile
#[test]
fn apply_aux_file_deltas() -> anyhow::Result<()> {

View File

@@ -1,7 +1,5 @@
use tracing;
use tracing::error;
use tracing::info;
use tracing::instrument;
use tracing::{error, info};
use crate::metrics::WalRedoKillCause;
use crate::metrics::WAL_REDO_PROCESS_COUNTERS;

View File

@@ -21,7 +21,7 @@ SHLIB_LINK_INTERNAL = $(libpq)
SHLIB_LINK = -lcurl
EXTENSION = neon
DATA = neon--1.0.sql neon--1.0--1.1.sql neon--1.1--1.2.sql
DATA = neon--1.0.sql neon--1.0--1.1.sql neon--1.1--1.2.sql neon--1.2--1.3.sql
PGFILEDESC = "neon - cloud storage for PostgreSQL"
EXTRA_CLEAN = \

View File

@@ -25,6 +25,8 @@
#include "funcapi.h"
#include "miscadmin.h"
#include "pagestore_client.h"
#include "common/hashfn.h"
#include "lib/hyperloglog.h"
#include "pgstat.h"
#include "postmaster/bgworker.h"
#include RELFILEINFO_HDR
@@ -60,6 +62,7 @@
#define BLOCKS_PER_CHUNK 128 /* 1Mb chunk */
#define MB ((uint64)1024*1024)
#define HYPER_LOG_LOG_BIT_WIDTH 10
#define SIZE_MB_TO_CHUNKS(size) ((uint32)((size) * MB / BLCKSZ / BLOCKS_PER_CHUNK))
typedef struct FileCacheEntry
@@ -84,6 +87,8 @@ typedef struct FileCacheControl
uint64 writes;
dlist_head lru; /* double linked list for LRU replacement
* algorithm */
hyperLogLogState wss_estimation; /* estimation of wroking set size */
uint8_t hyperloglog_hashes[(1 << HYPER_LOG_LOG_BIT_WIDTH) + 1];
} FileCacheControl;
static HTAB *lfc_hash;
@@ -232,6 +237,14 @@ lfc_shmem_startup(void)
lfc_ctl->writes = 0;
dlist_init(&lfc_ctl->lru);
/* Initialize hyper-log-log structure for estimating working set size */
initHyperLogLog(&lfc_ctl->wss_estimation, HYPER_LOG_LOG_BIT_WIDTH);
/* We need hashes in shared memory */
pfree(lfc_ctl->wss_estimation.hashesArr);
memset(lfc_ctl->hyperloglog_hashes, 0, sizeof lfc_ctl->hyperloglog_hashes);
lfc_ctl->wss_estimation.hashesArr = lfc_ctl->hyperloglog_hashes;
/* Recreate file cache on restart */
fd = BasicOpenFile(lfc_path, O_RDWR | O_CREAT | O_TRUNC);
if (fd < 0)
@@ -529,6 +542,11 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
}
entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL);
/* Approximate working set */
tag.blockNum = blkno;
addHyperLogLog(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag)));
if (entry == NULL || (entry->bitmap[chunk_offs >> 5] & (1 << (chunk_offs & 31))) == 0)
{
/* Page is not cached */
@@ -967,3 +985,21 @@ local_cache_pages(PG_FUNCTION_ARGS)
else
SRF_RETURN_DONE(funcctx);
}
PG_FUNCTION_INFO_V1(approximate_working_set_size);
Datum
approximate_working_set_size(PG_FUNCTION_ARGS)
{
int32 dc = -1;
if (lfc_size_limit != 0)
{
bool reset = PG_GETARG_BOOL(0);
LWLockAcquire(lfc_lock, reset ? LW_EXCLUSIVE : LW_SHARED);
dc = (int32) estimateHyperLogLog(&lfc_ctl->wss_estimation);
if (reset)
memset(lfc_ctl->hyperloglog_hashes, 0, sizeof lfc_ctl->hyperloglog_hashes);
LWLockRelease(lfc_lock);
}
PG_RETURN_INT32(dc);
}

View File

@@ -0,0 +1,9 @@
\echo Use "ALTER EXTENSION neon UPDATE TO '1.3'" to load this file. \quit
CREATE FUNCTION approximate_working_set_size(reset bool)
RETURNS integer
AS 'MODULE_PATHNAME', 'approximate_working_set_size'
LANGUAGE C PARALLEL SAFE;
GRANT EXECUTE ON FUNCTION approximate_working_set_size(bool) TO pg_monitor;

View File

@@ -1,6 +1,6 @@
# neon extension
comment = 'cloud storage for PostgreSQL'
default_version = '1.2'
default_version = '1.3'
module_pathname = '$libdir/neon'
relocatable = true
trusted = true

View File

@@ -13,7 +13,7 @@ use proxy::proxy::run_until_cancelled;
use tokio::net::TcpListener;
use anyhow::{anyhow, bail, ensure, Context};
use clap::{self, Arg};
use clap::Arg;
use futures::TryFutureExt;
use proxy::console::messages::MetricsAuxInfo;
use proxy::stream::{PqStream, Stream};

View File

@@ -358,8 +358,7 @@ impl Cache for ProjectInfoCacheImpl {
#[cfg(test)]
mod tests {
use super::*;
use crate::{console::AuthSecret, scram::ServerSecret};
use std::{sync::Arc, time::Duration};
use crate::scram::ServerSecret;
#[tokio::test]
async fn test_project_info_cache_settings() {

View File

@@ -4,7 +4,7 @@ use crate::{
};
use anyhow::Context;
use once_cell::sync::Lazy;
use postgres_backend::{self, AuthType, PostgresBackend, PostgresBackendTCP, QueryError};
use postgres_backend::{AuthType, PostgresBackend, PostgresBackendTCP, QueryError};
use pq_proto::{BeMessage, SINGLE_COL_ROWDESC};
use std::{convert::Infallible, future};
use tokio::net::{TcpListener, TcpStream};

View File

@@ -16,7 +16,7 @@ use crate::console::provider::{CachedAllowedIps, CachedRoleSecret, ConsoleBacken
use crate::console::{self, CachedNodeInfo, NodeInfo};
use crate::error::ErrorKind;
use crate::proxy::retry::{retry_after, NUM_RETRIES_CONNECT};
use crate::{auth, http, sasl, scram};
use crate::{http, sasl, scram};
use anyhow::{bail, Context};
use async_trait::async_trait;
use rstest::rstest;

View File

@@ -11,7 +11,6 @@ use bytes::{Bytes, BytesMut};
use futures::{SinkExt, StreamExt};
use postgres_protocol::message::frontend;
use tokio::io::{AsyncReadExt, DuplexStream};
use tokio_postgres::config::SslMode;
use tokio_postgres::tls::TlsConnect;
use tokio_util::codec::{Decoder, Encoder};

View File

@@ -667,7 +667,6 @@ impl<C: ClientInnerExt> Drop for Client<C> {
#[cfg(test)]
mod tests {
use env_logger;
use std::{mem, sync::atomic::AtomicBool};
use super::*;

View File

@@ -19,8 +19,6 @@ use utils::{bin_ser::LeSer, id::TenantTimelineId};
use crate::SafeKeeperConf;
use std::convert::TryInto;
pub const SK_MAGIC: u32 = 0xcafeceefu32;
pub const SK_FORMAT_VERSION: u32 = 7;
@@ -219,12 +217,9 @@ impl Storage for FileStorage {
#[cfg(test)]
mod test {
use super::FileStorage;
use super::*;
use crate::SafeKeeperConf;
use anyhow::Result;
use tokio::fs;
use utils::{id::TenantTimelineId, lsn::Lsn};
use utils::lsn::Lsn;
fn stub_conf() -> SafeKeeperConf {
let workdir = camino_tempfile::tempdir().unwrap().into_path();

View File

@@ -2,8 +2,7 @@
//! protocol commands.
use anyhow::Context;
use std::str::FromStr;
use std::str::{self};
use std::str::{self, FromStr};
use std::sync::Arc;
use tokio::io::{AsyncRead, AsyncWrite};
use tracing::{debug, info, info_span, Instrument};
@@ -16,8 +15,8 @@ use crate::safekeeper::Term;
use crate::timeline::TimelineError;
use crate::wal_service::ConnectionId;
use crate::{GlobalTimelines, SafeKeeperConf};
use postgres_backend::PostgresBackend;
use postgres_backend::QueryError;
use postgres_backend::{self, PostgresBackend};
use postgres_ffi::PG_TLI;
use pq_proto::{BeMessage, FeStartupPacket, RowDescriptor, INT4_OID, TEXT_OID};
use regex::Regex;

View File

@@ -2180,6 +2180,11 @@ class NeonAttachmentService(MetricsGetter):
self.stop(immediate=True)
@dataclass
class LogCursor:
_line_no: int
class NeonPageserver(PgProtocol):
"""
An object representing a running pageserver.
@@ -2343,7 +2348,18 @@ class NeonPageserver(PgProtocol):
value = self.http_client().get_metric_value(metric)
assert value == 0, f"Nonzero {metric} == {value}"
def log_contains(self, pattern: str) -> Optional[str]:
def assert_log_contains(
self, pattern: str, offset: None | LogCursor = None
) -> Tuple[str, LogCursor]:
"""Convenient for use inside wait_until()"""
res = self.log_contains(pattern, offset=offset)
assert res is not None
return res
def log_contains(
self, pattern: str, offset: None | LogCursor = None
) -> Optional[Tuple[str, LogCursor]]:
"""Check that the pageserver log contains a line that matches the given regex"""
logfile = self.workdir / "pageserver.log"
if not logfile.exists():
@@ -2357,12 +2373,17 @@ class NeonPageserver(PgProtocol):
# no guarantee it is already present in the log file. This hasn't
# been a problem in practice, our python tests are not fast enough
# to hit that race condition.
skip_until_line_no = 0 if offset is None else offset._line_no
cur_line_no = 0
with logfile.open("r") as f:
for line in f:
if cur_line_no < skip_until_line_no:
cur_line_no += 1
continue
if contains_re.search(line):
# found it!
return line
cur_line_no += 1
return (line, LogCursor(cur_line_no))
return None
def tenant_attach(

View File

@@ -286,7 +286,11 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
self.verbose_error(res)
def tenant_location_conf(
self, tenant_id: Union[TenantId, TenantShardId], location_conf=dict[str, Any], flush_ms=None
self,
tenant_id: Union[TenantId, TenantShardId],
location_conf=dict[str, Any],
flush_ms=None,
lazy: Optional[bool] = None,
):
body = location_conf.copy()
body["tenant_id"] = str(tenant_id)
@@ -295,6 +299,9 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
if flush_ms is not None:
params["flush_ms"] = str(flush_ms)
if lazy is not None:
params["lazy"] = "true" if lazy else "false"
res = self.put(
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/location_config",
json=body,

View File

@@ -20,7 +20,7 @@ def assert_tenant_state(
tenant: TenantId,
expected_state: str,
message: Optional[str] = None,
):
) -> None:
tenant_status = pageserver_http.tenant_status(tenant)
log.info(f"tenant_status: {tenant_status}")
assert tenant_status["state"]["slug"] == expected_state, message or tenant_status
@@ -206,8 +206,8 @@ def wait_for_last_record_lsn(
return current_lsn
if i % 10 == 0:
log.info(
"waiting for last_record_lsn to reach {}, now {}, iteration {}".format(
lsn, current_lsn, i + 1
"{}/{} waiting for last_record_lsn to reach {}, now {}, iteration {}".format(
tenant, timeline, lsn, current_lsn, i + 1
)
)
time.sleep(0.1)
@@ -292,7 +292,7 @@ def timeline_delete_wait_completed(
iterations: int = 20,
interval: Optional[float] = None,
**delete_args,
):
) -> None:
pageserver_http.timeline_delete(tenant_id=tenant_id, timeline_id=timeline_id, **delete_args)
wait_timeline_detail_404(pageserver_http, tenant_id, timeline_id, iterations, interval)
@@ -302,7 +302,7 @@ def assert_prefix_empty(
remote_storage: Optional[RemoteStorage],
prefix: Optional[str] = None,
allowed_postfix: Optional[str] = None,
):
) -> None:
assert remote_storage is not None
response = list_prefix(remote_storage, prefix)
keys = response["KeyCount"]

View File

@@ -369,7 +369,12 @@ def start_in_background(
return spawned_process
def wait_until(number_of_iterations: int, interval: float, func: Fn):
WaitUntilRet = TypeVar("WaitUntilRet")
def wait_until(
number_of_iterations: int, interval: float, func: Callable[[], WaitUntilRet]
) -> WaitUntilRet:
"""
Wait until 'func' returns successfully, without exception. Returns the
last return value from the function.
@@ -387,6 +392,18 @@ def wait_until(number_of_iterations: int, interval: float, func: Fn):
raise Exception("timed out while waiting for %s" % func) from last_exception
def assert_eq(a, b) -> None:
assert a == b
def assert_gt(a, b) -> None:
assert a > b
def assert_ge(a, b) -> None:
assert a >= b
def run_pg_bench_small(pg_bin: "PgBin", connstr: str):
"""
Fast way to populate data.

View File

@@ -63,10 +63,11 @@ def negative_env(neon_env_builder: NeonEnvBuilder) -> Generator[NegativeTests, N
]
)
def log_contains_bad_request():
env.pageserver.log_contains(".*Error processing HTTP request: Bad request")
wait_until(50, 0.1, log_contains_bad_request)
wait_until(
50,
0.1,
lambda: env.pageserver.assert_log_contains(".*Error processing HTTP request: Bad request"),
)
def test_null_body(negative_env: NegativeTests):

View File

@@ -200,7 +200,7 @@ class EvictionEnv:
tenant_ps.http_client().timeline_wait_logical_size(tenant_id, timeline_id)
def statvfs_called():
assert pageserver.log_contains(".*running mocked statvfs.*")
pageserver.assert_log_contains(".*running mocked statvfs.*")
# we most likely have already completed multiple runs
wait_until(10, 1, statvfs_called)
@@ -533,7 +533,7 @@ def test_pageserver_falls_back_to_global_lru(eviction_env: EvictionEnv, order: E
assert actual_change >= target, "eviction must always evict more than target"
time.sleep(1) # give log time to flush
assert env.neon_env.pageserver.log_contains(GLOBAL_LRU_LOG_LINE)
env.neon_env.pageserver.assert_log_contains(GLOBAL_LRU_LOG_LINE)
env.neon_env.pageserver.allowed_errors.append(".*" + GLOBAL_LRU_LOG_LINE)
@@ -767,7 +767,7 @@ def test_statvfs_error_handling(eviction_env: EvictionEnv):
eviction_order=EvictionOrder.ABSOLUTE_ORDER,
)
assert env.neon_env.pageserver.log_contains(".*statvfs failed.*EIO")
env.neon_env.pageserver.assert_log_contains(".*statvfs failed.*EIO")
env.neon_env.pageserver.allowed_errors.append(".*statvfs failed.*EIO")
@@ -801,10 +801,9 @@ def test_statvfs_pressure_usage(eviction_env: EvictionEnv):
eviction_order=EvictionOrder.ABSOLUTE_ORDER,
)
def relieved_log_message():
assert env.neon_env.pageserver.log_contains(".*disk usage pressure relieved")
wait_until(10, 1, relieved_log_message)
wait_until(
10, 1, lambda: env.neon_env.pageserver.assert_log_contains(".*disk usage pressure relieved")
)
def less_than_max_usage_pct():
post_eviction_total_size, _, _ = env.timelines_du(env.pageserver)
@@ -845,10 +844,9 @@ def test_statvfs_pressure_min_avail_bytes(eviction_env: EvictionEnv):
eviction_order=EvictionOrder.ABSOLUTE_ORDER,
)
def relieved_log_message():
assert env.neon_env.pageserver.log_contains(".*disk usage pressure relieved")
wait_until(10, 1, relieved_log_message)
wait_until(
10, 1, lambda: env.neon_env.pageserver.assert_log_contains(".*disk usage pressure relieved")
)
def more_than_min_avail_bytes_freed():
post_eviction_total_size, _, _ = env.timelines_du(env.pageserver)

View File

@@ -36,7 +36,7 @@ def test_duplicate_layers(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
pg_bin.run_capture(["pgbench", "-i", "-s1", connstr])
time.sleep(10) # let compaction to be performed
assert env.pageserver.log_contains("compact-level0-phase1-return-same")
env.pageserver.assert_log_contains("compact-level0-phase1-return-same")
def test_actually_duplicated_l1(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):

View File

@@ -0,0 +1,84 @@
from pathlib import Path
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnv
def test_explain_with_lfc_stats(neon_simple_env: NeonEnv):
env = neon_simple_env
cache_dir = Path(env.repo_dir) / "file_cache"
cache_dir.mkdir(exist_ok=True)
branchname = "test_explain_with_lfc_stats"
env.neon_cli.create_branch(branchname, "empty")
log.info(f"Creating endopint with 1MB shared_buffers and 64 MB LFC for branch {branchname}")
endpoint = env.endpoints.create_start(
branchname,
config_lines=[
"shared_buffers='1MB'",
f"neon.file_cache_path='{cache_dir}/file.cache'",
"neon.max_file_cache_size='128MB'",
"neon.file_cache_size_limit='64MB'",
],
)
cur = endpoint.connect().cursor()
log.info(f"preparing some data in {endpoint.connstr()}")
ddl = """
CREATE TABLE pgbench_accounts (
aid bigint NOT NULL,
bid integer,
abalance integer,
filler character(84),
-- more web-app like columns
text_column_plain TEXT DEFAULT repeat('NeonIsCool', 5),
jsonb_column_extended JSONB DEFAULT ('{ "tell everyone": [' || repeat('{"Neon": "IsCool"},',9) || ' {"Neon": "IsCool"}]}')::jsonb
)
WITH (fillfactor='100');
"""
cur.execute(ddl)
cur.execute(
"insert into pgbench_accounts(aid,bid,abalance,filler) select aid, (aid - 1) / 100000 + 1, 0, '' from generate_series(1, 100000) as aid;"
)
log.info(f"warming up caches with sequential scan in {endpoint.connstr()}")
cur.execute("SELECT * FROM pgbench_accounts WHERE abalance > 0")
log.info("running explain analyze without LFC values to verify they do not show up in the plan")
cur.execute("EXPLAIN (ANALYZE, BUFFERS) SELECT * FROM pgbench_accounts WHERE abalance > 0")
rows = cur.fetchall()
plan = "\n".join(r[0] for r in rows)
log.debug(plan)
assert "Seq Scan on pgbench_accounts" in plan
assert "Buffers: shared hit" in plan
assert "File cache: hits=" not in plan
log.info("running explain analyze WITH LFC values to verify they do now show up")
cur.execute(
"EXPLAIN (ANALYZE, BUFFERS,FILECACHE) SELECT * FROM pgbench_accounts WHERE abalance > 0"
)
rows = cur.fetchall()
plan = "\n".join(r[0] for r in rows)
log.debug(plan)
assert "Seq Scan on pgbench_accounts" in plan
assert "Buffers: shared hit" in plan
assert "File cache: hits=" in plan
log.info("running explain analyze WITH LFC values to verify json output")
cur.execute(
"EXPLAIN (ANALYZE, BUFFERS,FILECACHE, FORMAT JSON) SELECT * FROM pgbench_accounts WHERE abalance > 0"
)
jsonplan = cur.fetchall()[0][0]
log.debug(jsonplan)
# Directly access the 'Plan' part of the first element of the JSON array
plan_details = jsonplan[0]["Plan"]
# Extract "File Cache Hits" and "File Cache Misses"
file_cache_hits = plan_details.get("File Cache Hits")
file_cache_misses = plan_details.get("File Cache Misses")
# Now you can assert the values
assert file_cache_hits >= 5000, f"Expected File Cache Hits to be > 5000, got {file_cache_hits}"
assert file_cache_misses == 0, f"Expected File Cache Misses to be 0, got {file_cache_misses}"

View File

@@ -184,10 +184,13 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder):
# NB: the layer file is unlinked index part now, but, because we made the delete
# operation stuck, the layer file itself is still in the remote_storage
def delete_at_pause_point():
assert env.pageserver.log_contains(f".*{tenant_id}.*at failpoint.*{failpoint_name}")
wait_until(10, 0.5, delete_at_pause_point)
wait_until(
10,
0.5,
lambda: env.pageserver.assert_log_contains(
f".*{tenant_id}.*at failpoint.*{failpoint_name}"
),
)
future_layer_path = env.pageserver_remote_storage.remote_layer_path(
tenant_id, timeline_id, future_layer.to_str(), generation=generation_before_detach
)

View File

@@ -0,0 +1,74 @@
from pathlib import Path
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnv
from fixtures.utils import query_scalar
def test_lfc_working_set_approximation(neon_simple_env: NeonEnv):
env = neon_simple_env
cache_dir = Path(env.repo_dir) / "file_cache"
cache_dir.mkdir(exist_ok=True)
branchname = "test_approximate_working_set_size"
env.neon_cli.create_branch(branchname, "empty")
log.info(f"Creating endopint with 1MB shared_buffers and 64 MB LFC for branch {branchname}")
endpoint = env.endpoints.create_start(
branchname,
config_lines=[
"shared_buffers='1MB'",
f"neon.file_cache_path='{cache_dir}/file.cache'",
"neon.max_file_cache_size='128MB'",
"neon.file_cache_size_limit='64MB'",
],
)
cur = endpoint.connect().cursor()
cur.execute("create extension neon")
log.info(f"preparing some data in {endpoint.connstr()}")
ddl = """
CREATE TABLE pgbench_accounts (
aid bigint NOT NULL,
bid integer,
abalance integer,
filler character(84),
-- more web-app like columns
text_column_plain TEXT DEFAULT repeat('NeonIsCool', 5),
jsonb_column_extended JSONB DEFAULT ('{ "tell everyone": [' || repeat('{"Neon": "IsCool"},',9) || ' {"Neon": "IsCool"}]}')::jsonb
)
WITH (fillfactor='100');
"""
cur.execute(ddl)
# prepare index access below
cur.execute(
"ALTER TABLE ONLY pgbench_accounts ADD CONSTRAINT pgbench_accounts_pkey PRIMARY KEY (aid)"
)
cur.execute(
"insert into pgbench_accounts(aid,bid,abalance,filler) select aid, (aid - 1) / 100000 + 1, 0, '' from generate_series(1, 100000) as aid;"
)
# ensure correct query plans and stats
cur.execute("vacuum ANALYZE pgbench_accounts")
# determine table size - working set should approximate table size after sequential scan
pages = query_scalar(cur, "SELECT relpages FROM pg_class WHERE relname = 'pgbench_accounts'")
log.info(f"pgbench_accounts has {pages} pages, resetting working set to zero")
cur.execute("select approximate_working_set_size(true)")
cur.execute(
'SELECT count(*) FROM pgbench_accounts WHERE abalance > 0 or jsonb_column_extended @> \'{"tell everyone": [{"Neon": "IsCool"}]}\'::jsonb'
)
# verify working set size after sequential scan matches table size and reset working set for next test
blocks = query_scalar(cur, "select approximate_working_set_size(true)")
log.info(f"working set size after sequential scan on pgbench_accounts {blocks}")
assert pages * 0.8 < blocks < pages * 1.2
# run a few point queries with index lookup
cur.execute("SELECT abalance FROM pgbench_accounts WHERE aid = 4242")
cur.execute("SELECT abalance FROM pgbench_accounts WHERE aid = 54242")
cur.execute("SELECT abalance FROM pgbench_accounts WHERE aid = 104242")
cur.execute("SELECT abalance FROM pgbench_accounts WHERE aid = 204242")
# verify working set size after some index access of a few select pages only
blocks = query_scalar(cur, "select approximate_working_set_size(true)")
log.info(f"working set size after some index access of a few select pages only {blocks}")
assert blocks < 10

View File

@@ -34,7 +34,7 @@ def test_logging_event_count(neon_env_builder: NeonEnvBuilder, level: str):
def assert_logged():
if not log_expected:
return
assert env.pageserver.log_contains(f".*{msg_id}.*")
env.pageserver.assert_log_contains(f".*{msg_id}.*")
wait_until(10, 0.5, assert_logged)

View File

@@ -23,7 +23,7 @@ def test_neon_extension(neon_env_builder: NeonEnvBuilder):
# IMPORTANT:
# If the version has changed, the test should be updated.
# Ensure that the default version is also updated in the neon.control file
assert cur.fetchone() == ("1.2",)
assert cur.fetchone() == ("1.3",)
cur.execute("SELECT * from neon.NEON_STAT_FILE_CACHE")
res = cur.fetchall()
log.info(res)

View File

@@ -1,9 +1,12 @@
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnv
from fixtures.pg_version import PgVersion
from fixtures.pg_version import PgVersion, skip_on_postgres
from fixtures.utils import wait_until
@skip_on_postgres(
PgVersion.V15, reason="skip on pg15 due to https://github.com/neondatabase/neon/issues/6969"
)
def test_neon_superuser(neon_simple_env: NeonEnv, pg_version: PgVersion):
env = neon_simple_env
env.neon_cli.create_branch("test_neon_superuser_publisher", "empty")

View File

@@ -432,7 +432,7 @@ def test_deletion_queue_recovery(
main_pageserver.start()
def assert_deletions_submitted(n: int):
def assert_deletions_submitted(n: int) -> None:
assert ps_http.get_metric_value("pageserver_deletion_queue_submitted_total") == n
# After restart, issue a flush to kick the deletion frontend to do recovery.

View File

@@ -0,0 +1,110 @@
import asyncio
import time
from typing import Tuple
import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
NeonEnv,
NeonEnvBuilder,
tenant_get_shards,
)
from fixtures.pageserver.http import PageserverHttpClient
from fixtures.pageserver.utils import wait_for_last_record_lsn
from fixtures.types import Lsn, TenantId, TimelineId
from fixtures.utils import wait_until
TIMELINE_COUNT = 10
ENTRIES_PER_TIMELINE = 10_000
CHECKPOINT_TIMEOUT_SECONDS = 60
TENANT_CONF = {
# Large `checkpoint_distance` effectively disables size
# based checkpointing.
"checkpoint_distance": f"{2 * 1024 ** 3}",
"checkpoint_timeout": f"{CHECKPOINT_TIMEOUT_SECONDS}s",
}
async def run_worker(env: NeonEnv, entries: int) -> Tuple[TenantId, TimelineId, Lsn]:
tenant, timeline = env.neon_cli.create_tenant(conf=TENANT_CONF)
with env.endpoints.create_start("main", tenant_id=tenant) as ep:
conn = await ep.connect_async()
try:
await conn.execute("CREATE TABLE IF NOT EXISTS t(key serial primary key, value text)")
await conn.execute(
f"INSERT INTO t SELECT i, CONCAT('payload_', i) FROM generate_series(0,{entries}) as i"
)
finally:
await conn.close(timeout=10)
last_flush_lsn = Lsn(ep.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0])
return tenant, timeline, last_flush_lsn
async def workload(
env: NeonEnv, timelines: int, entries: int
) -> list[Tuple[TenantId, TimelineId, Lsn]]:
workers = [asyncio.create_task(run_worker(env, entries)) for _ in range(timelines)]
return await asyncio.gather(*workers)
def wait_until_pageserver_is_caught_up(
env: NeonEnv, last_flush_lsns: list[Tuple[TenantId, TimelineId, Lsn]]
):
for tenant, timeline, last_flush_lsn in last_flush_lsns:
shards = tenant_get_shards(env, tenant)
for tenant_shard_id, pageserver in shards:
waited = wait_for_last_record_lsn(
pageserver.http_client(), tenant_shard_id, timeline, last_flush_lsn
)
assert waited >= last_flush_lsn
def wait_for_wal_ingest_metric(pageserver_http: PageserverHttpClient) -> float:
def query():
value = pageserver_http.get_metric_value("pageserver_wal_ingest_records_received_total")
assert value is not None
return value
# The metric gets initialised on the first update.
# Retry a few times, but return 0 if it's stable.
try:
return float(wait_until(3, 0.5, query))
except Exception:
return 0
@pytest.mark.parametrize("immediate_shutdown", [True, False])
def test_pageserver_small_inmemory_layers(
neon_env_builder: NeonEnvBuilder, immediate_shutdown: bool
):
"""
Test that open layers get flushed after the `checkpoint_timeout` config
and do not require WAL reingest upon restart.
The workload creates a number of timelines and writes some data to each,
but not enough to trigger flushes via the `checkpoint_distance` config.
"""
env = neon_env_builder.init_configs()
env.start()
last_flush_lsns = asyncio.run(workload(env, TIMELINE_COUNT, ENTRIES_PER_TIMELINE))
wait_until_pageserver_is_caught_up(env, last_flush_lsns)
ps_http_client = env.pageserver.http_client()
total_wal_ingested_before_restart = wait_for_wal_ingest_metric(ps_http_client)
log.info("Sleeping for checkpoint timeout ...")
time.sleep(CHECKPOINT_TIMEOUT_SECONDS + 5)
env.pageserver.restart(immediate=immediate_shutdown)
wait_until_pageserver_is_caught_up(env, last_flush_lsns)
total_wal_ingested_after_restart = wait_for_wal_ingest_metric(ps_http_client)
log.info(f"WAL ingested before restart: {total_wal_ingested_before_restart}")
log.info(f"WAL ingested after restart: {total_wal_ingested_after_restart}")
leeway = total_wal_ingested_before_restart * 5 / 100
assert total_wal_ingested_after_restart <= leeway

View File

@@ -28,7 +28,14 @@ from fixtures.remote_storage import (
available_remote_storages,
)
from fixtures.types import Lsn, TenantId, TimelineId
from fixtures.utils import print_gc_result, query_scalar, wait_until
from fixtures.utils import (
assert_eq,
assert_ge,
assert_gt,
print_gc_result,
query_scalar,
wait_until,
)
from requests import ReadTimeout
@@ -120,10 +127,10 @@ def test_remote_storage_backup_and_restore(
log.info(f"upload of checkpoint {checkpoint_number} is done")
# Check that we had to retry the uploads
assert env.pageserver.log_contains(
env.pageserver.assert_log_contains(
".*failed to perform remote task UploadLayer.*, will retry.*"
)
assert env.pageserver.log_contains(
env.pageserver.assert_log_contains(
".*failed to perform remote task UploadMetadata.*, will retry.*"
)
@@ -292,9 +299,9 @@ def test_remote_storage_upload_queue_retries(
print_gc_result(gc_result)
assert gc_result["layers_removed"] > 0
wait_until(2, 1, lambda: get_queued_count(file_kind="layer", op_kind="upload") == 0)
wait_until(2, 1, lambda: get_queued_count(file_kind="index", op_kind="upload") == 0)
wait_until(2, 1, lambda: get_queued_count(file_kind="layer", op_kind="delete") == 0)
wait_until(2, 1, lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="upload"), 0))
wait_until(2, 1, lambda: assert_eq(get_queued_count(file_kind="index", op_kind="upload"), 0))
wait_until(2, 1, lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="delete"), 0))
# let all future operations queue up
configure_storage_sync_failpoints("return")
@@ -322,17 +329,17 @@ def test_remote_storage_upload_queue_retries(
churn_while_failpoints_active_thread.start()
# wait for churn thread's data to get stuck in the upload queue
wait_until(10, 0.1, lambda: get_queued_count(file_kind="layer", op_kind="upload") > 0)
wait_until(10, 0.1, lambda: get_queued_count(file_kind="index", op_kind="upload") >= 2)
wait_until(10, 0.1, lambda: get_queued_count(file_kind="layer", op_kind="delete") > 0)
wait_until(10, 0.5, lambda: assert_gt(get_queued_count(file_kind="layer", op_kind="upload"), 0))
wait_until(10, 0.5, lambda: assert_ge(get_queued_count(file_kind="index", op_kind="upload"), 2))
wait_until(10, 0.5, lambda: assert_gt(get_queued_count(file_kind="layer", op_kind="delete"), 0))
# unblock churn operations
configure_storage_sync_failpoints("off")
# ... and wait for them to finish. Exponential back-off in upload queue, so, gracious timeouts.
wait_until(30, 1, lambda: get_queued_count(file_kind="layer", op_kind="upload") == 0)
wait_until(30, 1, lambda: get_queued_count(file_kind="index", op_kind="upload") == 0)
wait_until(30, 1, lambda: get_queued_count(file_kind="layer", op_kind="delete") == 0)
wait_until(30, 1, lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="upload"), 0))
wait_until(30, 1, lambda: assert_eq(get_queued_count(file_kind="index", op_kind="upload"), 0))
wait_until(30, 1, lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="delete"), 0))
# The churn thread doesn't make progress once it blocks on the first wait_completion() call,
# so, give it some time to wrap up.
@@ -884,26 +891,23 @@ def wait_upload_queue_empty(
wait_until(
2,
1,
lambda: get_queued_count(
client, tenant_id, timeline_id, file_kind="layer", op_kind="upload"
)
== 0,
lambda: assert_eq(
get_queued_count(client, tenant_id, timeline_id, file_kind="layer", op_kind="upload"), 0
),
)
wait_until(
2,
1,
lambda: get_queued_count(
client, tenant_id, timeline_id, file_kind="index", op_kind="upload"
)
== 0,
lambda: assert_eq(
get_queued_count(client, tenant_id, timeline_id, file_kind="index", op_kind="upload"), 0
),
)
wait_until(
2,
1,
lambda: get_queued_count(
client, tenant_id, timeline_id, file_kind="layer", op_kind="delete"
)
== 0,
lambda: assert_eq(
get_queued_count(client, tenant_id, timeline_id, file_kind="layer", op_kind="delete"), 0
),
)

View File

@@ -116,7 +116,7 @@ def test_sharding_service_smoke(
# Marking a pageserver offline should migrate tenants away from it.
env.attachment_service.node_configure(env.pageservers[0].id, {"availability": "Offline"})
def node_evacuated(node_id: int):
def node_evacuated(node_id: int) -> None:
counts = get_node_shard_counts(env, tenant_ids)
assert counts[node_id] == 0
@@ -146,6 +146,8 @@ def test_sharding_service_smoke(
for tid in tenant_ids:
tenant_delete_wait_completed(env.attachment_service.pageserver_api(), tid, 10)
env.attachment_service.consistency_check()
# Set a scheduling policy on one node, create all the tenants, observe
# that the scheduling policy is respected.
env.attachment_service.node_configure(env.pageservers[1].id, {"scheduling": "Draining"})
@@ -256,9 +258,8 @@ def test_sharding_service_restart(neon_env_builder: NeonEnvBuilder):
env.attachment_service.consistency_check()
def test_sharding_service_onboarding(
neon_env_builder: NeonEnvBuilder,
):
@pytest.mark.parametrize("warm_up", [True, False])
def test_sharding_service_onboarding(neon_env_builder: NeonEnvBuilder, warm_up: bool):
"""
We onboard tenants to the sharding service by treating it as a 'virtual pageserver'
which provides the /location_config API. This is similar to creating a tenant,
@@ -306,6 +307,23 @@ def test_sharding_service_onboarding(
},
)
if warm_up:
origin_ps.http_client().tenant_heatmap_upload(tenant_id)
# We expect to be called via live migration code, which may try to configure the tenant into secondary
# mode before attaching it.
virtual_ps_http.tenant_location_conf(
tenant_id,
{
"mode": "Secondary",
"secondary_conf": {"warm": True},
"tenant_conf": {},
"generation": None,
},
)
virtual_ps_http.tenant_secondary_download(tenant_id)
# Call into attachment service to onboard the tenant
generation += 1
virtual_ps_http.tenant_location_conf(
@@ -351,7 +369,9 @@ def test_sharding_service_onboarding(
assert len(dest_tenants) == 1
assert TenantId(dest_tenants[0]["id"]) == tenant_id
# sharding service advances generation by 1 when it first attaches
# sharding service advances generation by 1 when it first attaches. We started
# with a nonzero generation so this equality also proves that the generation
# was properly carried over during onboarding.
assert dest_tenants[0]["generation"] == generation + 1
# The onboarded tenant should survive a restart of sharding service
@@ -362,6 +382,31 @@ def test_sharding_service_onboarding(
dest_ps.stop()
dest_ps.start()
# Having onboarded via /location_config, we should also be able to update the
# TenantConf part of LocationConf, without inadvertently resetting the generation
modified_tenant_conf = {"max_lsn_wal_lag": 1024 * 1024 * 1024 * 100}
dest_tenant_before_conf_change = dest_ps.http_client().tenant_status(tenant_id)
# The generation has moved on since we onboarded
assert generation != dest_tenant_before_conf_change["generation"]
virtual_ps_http.tenant_location_conf(
tenant_id,
{
"mode": "AttachedSingle",
"secondary_conf": None,
"tenant_conf": modified_tenant_conf,
# This is intentionally a stale generation
"generation": generation,
},
)
dest_tenant_after_conf_change = dest_ps.http_client().tenant_status(tenant_id)
assert (
dest_tenant_after_conf_change["generation"] == dest_tenant_before_conf_change["generation"]
)
dest_tenant_conf_after = dest_ps.http_client().tenant_config(tenant_id)
assert dest_tenant_conf_after.tenant_specific_overrides == modified_tenant_conf
env.attachment_service.consistency_check()
@@ -405,7 +450,7 @@ def test_sharding_service_compute_hook(
env.attachment_service.node_configure(env.pageservers[0].id, {"availability": "Offline"})
def node_evacuated(node_id: int):
def node_evacuated(node_id: int) -> None:
counts = get_node_shard_counts(env, [env.initial_tenant])
assert counts[node_id] == 0
@@ -667,3 +712,41 @@ def test_sharding_service_auth(neon_env_builder: NeonEnvBuilder):
svc.request(
"POST", f"{api}/upcall/v1/re-attach", headers=svc.headers(TokenScope.PAGE_SERVER_API)
)
def test_sharding_service_tenant_conf(neon_env_builder: NeonEnvBuilder):
"""
Validate the pageserver-compatible API endpoints for setting and getting tenant conf, without
supplying the whole LocationConf.
"""
env = neon_env_builder.init_start()
tenant_id = env.initial_tenant
http = env.attachment_service.pageserver_api()
default_value = "7days"
new_value = "1h"
http.set_tenant_config(tenant_id, {"pitr_interval": new_value})
# Ensure the change landed on the storage controller
readback_controller = http.tenant_config(tenant_id)
assert readback_controller.effective_config["pitr_interval"] == new_value
assert readback_controller.tenant_specific_overrides["pitr_interval"] == new_value
# Ensure the change made it down to the pageserver
readback_ps = env.pageservers[0].http_client().tenant_config(tenant_id)
assert readback_ps.effective_config["pitr_interval"] == new_value
assert readback_ps.tenant_specific_overrides["pitr_interval"] == new_value
# Omitting a value clears it. This looks different in storage controller
# vs. pageserver API calls, because pageserver has defaults.
http.set_tenant_config(tenant_id, {})
readback_controller = http.tenant_config(tenant_id)
assert readback_controller.effective_config["pitr_interval"] is None
assert readback_controller.tenant_specific_overrides["pitr_interval"] is None
readback_ps = env.pageservers[0].http_client().tenant_config(tenant_id)
assert readback_ps.effective_config["pitr_interval"] == default_value
assert "pitr_interval" not in readback_ps.tenant_specific_overrides
env.attachment_service.consistency_check()

View File

@@ -270,7 +270,7 @@ eviction_policy = { "kind" = "LayerAccessThreshold", period = "20s", threshold =
"period": "20s",
"threshold": "23h",
}
assert final_effective_config["max_lsn_wal_lag"] == 10 * 1024 * 1024
assert final_effective_config["max_lsn_wal_lag"] == 1024 * 1024 * 1024
# restart the pageserver and ensure that the config is still correct
env.pageserver.stop()

View File

@@ -505,10 +505,10 @@ def test_tenant_delete_concurrent(
return ps_http.tenant_delete(tenant_id)
def hit_remove_failpoint():
assert env.pageserver.log_contains(f"at failpoint {BEFORE_REMOVE_FAILPOINT}")
env.pageserver.assert_log_contains(f"at failpoint {BEFORE_REMOVE_FAILPOINT}")
def hit_run_failpoint():
assert env.pageserver.log_contains(f"at failpoint {BEFORE_RUN_FAILPOINT}")
env.pageserver.assert_log_contains(f"at failpoint {BEFORE_RUN_FAILPOINT}")
with concurrent.futures.ThreadPoolExecutor() as executor:
background_200_req = executor.submit(delete_tenant)
@@ -612,12 +612,12 @@ def test_tenant_delete_races_timeline_creation(
Thread(target=timeline_create).start()
def hit_initdb_upload_failpoint():
assert env.pageserver.log_contains(f"at failpoint {BEFORE_INITDB_UPLOAD_FAILPOINT}")
env.pageserver.assert_log_contains(f"at failpoint {BEFORE_INITDB_UPLOAD_FAILPOINT}")
wait_until(100, 0.1, hit_initdb_upload_failpoint)
def creation_connection_timed_out():
assert env.pageserver.log_contains(
env.pageserver.assert_log_contains(
"POST.*/timeline.* request was dropped before completing"
)
@@ -636,7 +636,7 @@ def test_tenant_delete_races_timeline_creation(
Thread(target=tenant_delete).start()
def deletion_arrived():
assert env.pageserver.log_contains(
env.pageserver.assert_log_contains(
f"cfg failpoint: {DELETE_BEFORE_CLEANUP_FAILPOINT} pause"
)
@@ -663,7 +663,7 @@ def test_tenant_delete_races_timeline_creation(
)
# Ensure that creation cancelled and deletion didn't end up in broken state or encountered the leftover temp file
assert env.pageserver.log_contains(CANCELLED_ERROR)
env.pageserver.assert_log_contains(CANCELLED_ERROR)
assert not env.pageserver.log_contains(
".*ERROR.*delete_tenant.*Timelines directory is not empty after all timelines deletion"
)

View File

@@ -92,10 +92,10 @@ def test_tenant_reattach(neon_env_builder: NeonEnvBuilder, mode: str):
wait_for_upload(pageserver_http, tenant_id, timeline_id, current_lsn)
# Check that we had to retry the uploads
assert env.pageserver.log_contains(
env.pageserver.assert_log_contains(
".*failed to perform remote task UploadLayer.*, will retry.*"
)
assert env.pageserver.log_contains(
env.pageserver.assert_log_contains(
".*failed to perform remote task UploadMetadata.*, will retry.*"
)

View File

@@ -495,7 +495,7 @@ def test_emergency_relocate_with_branches_slow_replay(
assert cur.fetchall() == [("before pause",), ("after pause",)]
# Sanity check that the failpoint was reached
assert env.pageserver.log_contains('failpoint "wal-ingest-logical-message-sleep": sleep done')
env.pageserver.assert_log_contains('failpoint "wal-ingest-logical-message-sleep": sleep done')
assert time.time() - before_attach_time > 5
# Clean up
@@ -632,7 +632,7 @@ def test_emergency_relocate_with_branches_createdb(
assert query_scalar(cur, "SELECT count(*) FROM test_migrate_one") == 200
# Sanity check that the failpoint was reached
assert env.pageserver.log_contains('failpoint "wal-ingest-logical-message-sleep": sleep done')
env.pageserver.assert_log_contains('failpoint "wal-ingest-logical-message-sleep": sleep done')
assert time.time() - before_attach_time > 5
# Clean up

View File

@@ -147,10 +147,10 @@ def test_tenants_attached_after_download(neon_env_builder: NeonEnvBuilder):
log.info(f"upload of checkpoint {checkpoint_number} is done")
# Check that we had to retry the uploads
assert env.pageserver.log_contains(
env.pageserver.assert_log_contains(
".*failed to perform remote task UploadLayer.*, will retry.*"
)
assert env.pageserver.log_contains(
env.pageserver.assert_log_contains(
".*failed to perform remote task UploadMetadata.*, will retry.*"
)

View File

@@ -179,6 +179,6 @@ def test_threshold_based_eviction(
assert len(post.remote_layers) > 0, "some layers should be evicted once it's stabilized"
assert len(post.local_layers) > 0, "the imitate accesses should keep some layers resident"
assert env.pageserver.log_contains(
metrics_refused_log_line
assert (
env.pageserver.log_contains(metrics_refused_log_line) is not None
), "ensure the metrics collection worker ran"

View File

@@ -89,6 +89,7 @@ def test_timeline_delete(neon_simple_env: NeonEnv):
assert timeline_path.exists()
# retry deletes when compaction or gc is running in pageserver
# TODO: review whether this wait_until is actually necessary, we do an await() internally
wait_until(
number_of_iterations=3,
interval=0.2,
@@ -531,7 +532,7 @@ def test_concurrent_timeline_delete_stuck_on(
try:
def first_call_hit_failpoint():
assert env.pageserver.log_contains(
env.pageserver.assert_log_contains(
f".*{child_timeline_id}.*at failpoint {stuck_failpoint}"
)
@@ -602,7 +603,7 @@ def test_delete_timeline_client_hangup(neon_env_builder: NeonEnvBuilder):
at_failpoint_log_message = f".*{child_timeline_id}.*at failpoint {failpoint_name}.*"
def hit_failpoint():
assert env.pageserver.log_contains(at_failpoint_log_message)
env.pageserver.assert_log_contains(at_failpoint_log_message)
wait_until(50, 0.1, hit_failpoint)
@@ -612,7 +613,7 @@ def test_delete_timeline_client_hangup(neon_env_builder: NeonEnvBuilder):
env.pageserver.allowed_errors.append(hangup_log_message)
def got_hangup_log_message():
assert env.pageserver.log_contains(hangup_log_message)
env.pageserver.assert_log_contains(hangup_log_message)
wait_until(50, 0.1, got_hangup_log_message)
@@ -624,7 +625,7 @@ def test_delete_timeline_client_hangup(neon_env_builder: NeonEnvBuilder):
def first_request_finished():
message = f".*DELETE.*{child_timeline_id}.*Cancelled request finished"
assert env.pageserver.log_contains(message)
env.pageserver.assert_log_contains(message)
wait_until(50, 0.1, first_request_finished)
@@ -759,7 +760,7 @@ def test_delete_orphaned_objects(
for orphan in orphans:
assert not orphan.exists()
assert env.pageserver.log_contains(
env.pageserver.assert_log_contains(
f"deleting a file not referenced from index_part.json name={orphan.stem}"
)

View File

@@ -2,6 +2,7 @@ import concurrent.futures
import math
import random
import time
from collections import defaultdict
from contextlib import closing
from pathlib import Path
from typing import Optional
@@ -14,6 +15,7 @@ from fixtures.neon_fixtures import (
Endpoint,
NeonEnv,
NeonEnvBuilder,
NeonPageserver,
PgBin,
VanillaPostgres,
wait_for_last_flush_lsn,
@@ -839,22 +841,40 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder):
)
# Deleting a stuck tenant should prompt it to go active
# in some cases, it has already been activated because it's behind the detach
delete_lazy_activating(delete_tenant_id, env.pageserver, expect_attaching=False)
tenant_ids.remove(delete_tenant_id)
# Check that all the stuck tenants proceed to active (apart from the one that deletes, and the one
# we detached)
wait_until(10, 1, all_active)
assert len(get_tenant_states()) == n_tenants - 2
def delete_lazy_activating(
delete_tenant_id: TenantId, pageserver: NeonPageserver, expect_attaching: bool
):
pageserver_http = pageserver.http_client()
# Deletion itself won't complete due to our failpoint: Tenant::shutdown can't complete while calculating
# logical size is paused in a failpoint. So instead we will use a log observation to check that
# on-demand activation was triggered by the tenant deletion
log_match = f".*attach{{tenant_id={delete_tenant_id} shard_id=0000 gen=[0-9a-f]+}}: Activating tenant \\(on-demand\\).*"
if expect_attaching:
assert pageserver_http.tenant_status(delete_tenant_id)["state"]["slug"] == "Attaching"
with concurrent.futures.ThreadPoolExecutor() as executor:
log.info("Starting background delete")
def activated_on_demand():
assert pageserver.log_contains(log_match) is not None
def delete_tenant():
env.pageserver.http_client().tenant_delete(delete_tenant_id)
pageserver_http.tenant_delete(delete_tenant_id)
background_delete = executor.submit(delete_tenant)
# Deletion itself won't complete due to our failpoint: Tenant::shutdown can't complete while calculating
# logical size is paused in a failpoint. So instead we will use a log observation to check that
# on-demand activation was triggered by the tenant deletion
log_match = f".*attach{{tenant_id={delete_tenant_id} shard_id=0000 gen=[0-9a-f]+}}: Activating tenant \\(on-demand\\).*"
def activated_on_demand():
assert env.pageserver.log_contains(log_match) is not None
log.info(f"Waiting for activation message '{log_match}'")
try:
wait_until(10, 1, activated_on_demand)
@@ -868,12 +888,6 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder):
# Poll for deletion to complete
wait_tenant_status_404(pageserver_http, tenant_id=delete_tenant_id, iterations=40)
tenant_ids.remove(delete_tenant_id)
# Check that all the stuck tenants proceed to active (apart from the one that deletes, and the one
# we detached)
wait_until(10, 1, all_active)
assert len(get_tenant_states()) == n_tenants - 2
def test_timeline_logical_size_task_priority(neon_env_builder: NeonEnvBuilder):
@@ -939,3 +953,159 @@ def test_timeline_logical_size_task_priority(neon_env_builder: NeonEnvBuilder):
client.configure_failpoints(
[("initial-size-calculation-permit-pause", "off"), ("walreceiver-after-ingest", "off")]
)
def test_eager_attach_does_not_queue_up(neon_env_builder: NeonEnvBuilder):
neon_env_builder.pageserver_config_override = "concurrent_tenant_warmup = '1'"
env = neon_env_builder.init_start()
# the supporting_second does nothing except queue behind env.initial_tenant
# for purposes of showing that eager_tenant breezes past the queue
supporting_second, _ = env.neon_cli.create_tenant()
eager_tenant, _ = env.neon_cli.create_tenant()
client = env.pageserver.http_client()
client.tenant_location_conf(
eager_tenant,
{
"mode": "Detached",
"secondary_conf": None,
"tenant_conf": {},
"generation": None,
},
)
env.pageserver.stop()
# pause at logical size calculation, also pause before walreceiver can give feedback so it will give priority to logical size calculation
env.pageserver.start(
extra_env_vars={
"FAILPOINTS": "timeline-calculate-logical-size-pause=pause;walreceiver-after-ingest=pause"
}
)
tenant_ids = [env.initial_tenant, supporting_second]
def get_tenant_states() -> dict[str, list[TenantId]]:
states = defaultdict(list)
for id in tenant_ids:
state = client.tenant_status(id)["state"]["slug"]
states[state].append(id)
return dict(states)
def one_is_active():
states = get_tenant_states()
log.info(f"{states}")
assert len(states["Active"]) == 1
wait_until(10, 1, one_is_active)
def other_is_attaching():
states = get_tenant_states()
assert len(states["Attaching"]) == 1
wait_until(10, 1, other_is_attaching)
def eager_tenant_is_active():
resp = client.tenant_status(eager_tenant)
assert resp["state"]["slug"] == "Active"
gen = env.attachment_service.attach_hook_issue(eager_tenant, env.pageserver.id)
client.tenant_location_conf(
eager_tenant,
{
"mode": "AttachedSingle",
"secondary_conf": None,
"tenant_conf": {},
"generation": gen,
},
lazy=False,
)
wait_until(10, 1, eager_tenant_is_active)
other_is_attaching()
client.configure_failpoints(
[("timeline-calculate-logical-size-pause", "off"), ("walreceiver-after-ingest", "off")]
)
@pytest.mark.parametrize("activation_method", ["endpoint", "branch", "delete"])
def test_lazy_attach_activation(neon_env_builder: NeonEnvBuilder, activation_method: str):
# env.initial_tenant will take up this permit when attaching with lazy because of a failpoint activated after restart
neon_env_builder.pageserver_config_override = "concurrent_tenant_warmup = '1'"
env = neon_env_builder.init_start()
# because this returns (also elsewhere in this file), we know that SpawnMode::Create skips the queue
lazy_tenant, _ = env.neon_cli.create_tenant()
client = env.pageserver.http_client()
client.tenant_location_conf(
lazy_tenant,
{
"mode": "Detached",
"secondary_conf": None,
"tenant_conf": {},
"generation": None,
},
)
env.pageserver.stop()
# pause at logical size calculation, also pause before walreceiver can give feedback so it will give priority to logical size calculation
env.pageserver.start(
extra_env_vars={
"FAILPOINTS": "timeline-calculate-logical-size-pause=pause;walreceiver-after-ingest=pause"
}
)
def initial_tenant_is_active():
resp = client.tenant_status(env.initial_tenant)
assert resp["state"]["slug"] == "Active"
wait_until(10, 1, initial_tenant_is_active)
# even though the initial tenant is now active, because it was startup time
# attach, it will consume the only permit because logical size calculation
# is paused.
gen = env.attachment_service.attach_hook_issue(lazy_tenant, env.pageserver.id)
client.tenant_location_conf(
lazy_tenant,
{
"mode": "AttachedSingle",
"secondary_conf": None,
"tenant_conf": {},
"generation": gen,
},
lazy=True,
)
def lazy_tenant_is_attaching():
resp = client.tenant_status(lazy_tenant)
assert resp["state"]["slug"] == "Attaching"
# paused logical size calculation of env.initial_tenant is keeping it attaching
wait_until(10, 1, lazy_tenant_is_attaching)
for _ in range(5):
lazy_tenant_is_attaching()
time.sleep(0.5)
def lazy_tenant_is_active():
resp = client.tenant_status(lazy_tenant)
assert resp["state"]["slug"] == "Active"
if activation_method == "endpoint":
with env.endpoints.create_start("main", tenant_id=lazy_tenant):
# starting up the endpoint should make it jump the queue
wait_until(10, 1, lazy_tenant_is_active)
elif activation_method == "branch":
env.neon_cli.create_timeline("second_branch", lazy_tenant)
wait_until(10, 1, lazy_tenant_is_active)
elif activation_method == "delete":
delete_lazy_activating(lazy_tenant, env.pageserver, expect_attaching=True)
else:
raise RuntimeError(activation_method)