Compare commits


6 Commits

Author SHA1 Message Date
Arthur Petukhovsky
1df0f69664 Merge pull request #6973 from neondatabase/rc/2024-02-29-manual
Release 2024-02-29
2024-02-29 17:26:33 +00:00
Vlad Lazar
970066a914 libs: fix expired token in auth decode test (#6963)
The test token expired earlier today (1709200879). I regenerated the
token, but without an expiration date this time.
2024-02-29 17:23:25 +00:00
Arthur Petukhovsky
1ebd3897c0 Merge pull request #6956 from neondatabase/rc/2024-02-28
Release 2024-02-28
2024-02-29 16:39:52 +00:00
Arthur Petukhovsky
6460beffcd Merge pull request #6901 from neondatabase/rc/2024-02-26
Release 2024-02-26
2024-02-26 17:08:19 +00:00
John Spray
6f7f8958db pageserver: only write out legacy tenant config if no generation (#6891)
## Problem

Previously we always wrote out both legacy and modern tenant config
files. The legacy write enabled rollbacks, but we are long past the
point where that is needed.

We still need the legacy format for situations where someone is running
tenants without generations (that will be yanked as well eventually),
but we can avoid writing it out at all if we do have a generation number
set. We implicitly also avoid writing the legacy config if our mode is
Secondary (secondary mode is newer than generations).

## Summary of changes

- Make writing legacy tenant config conditional on there being no
generation number set.
2024-02-26 10:25:25 +00:00
Christian Schwarz
936a00e077 pageserver: remove two obsolete/unused per-timeline metrics (#6893)
over-compensating for the addition of a new per-timeline metric in
https://github.com/neondatabase/neon/pull/6834

part of https://github.com/neondatabase/neon/issues/6737
2024-02-26 09:16:24 +00:00
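
As a rough illustration of the behaviour described in #6891 above (write the legacy tenant config only when no generation number is set, and skip it for Secondary mode), here is a minimal, self-contained Rust sketch. The names (`LocationMode`, `TenantConf`, `persist_tenant_config`) and the file names `config` / `config-v1` are illustrative assumptions rather than the pageserver's actual API.

```rust
use std::fs;
use std::io;
use std::path::Path;

/// Hypothetical stand-in for the pageserver's location mode.
#[derive(Clone, Copy, PartialEq, Eq)]
enum LocationMode {
    Attached,
    Secondary,
}

/// Hypothetical, simplified tenant configuration.
struct TenantConf {
    mode: LocationMode,
    generation: Option<u32>,
    body: String,
}

/// Always write the modern config; write the legacy file only for tenants
/// running without a generation number. Secondary locations postdate
/// generations, so they never get the legacy file either.
fn persist_tenant_config(dir: &Path, conf: &TenantConf) -> io::Result<()> {
    fs::write(dir.join("config-v1"), conf.body.as_bytes())?;

    let needs_legacy = conf.generation.is_none() && conf.mode != LocationMode::Secondary;
    if needs_legacy {
        fs::write(dir.join("config"), conf.body.as_bytes())?;
    }
    Ok(())
}

fn main() -> io::Result<()> {
    let conf = TenantConf {
        mode: LocationMode::Attached,
        generation: Some(7), // generation is set, so the legacy file is skipped
        body: "example tenant config".to_string(),
    };
    persist_tenant_config(&std::env::temp_dir(), &conf)
}
```

In the sketch the Secondary check is written out explicitly; the PR notes that in the real change that case is avoided implicitly.
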
106 changed files with 745 additions and 2312 deletions

Cargo.lock (generated): 8 lines changed
View File

@@ -25,9 +25,9 @@ checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"
[[package]]
name = "ahash"
version = "0.8.9"
version = "0.8.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d713b3834d76b85304d4d525563c1276e2e30dc97cc67bfb4585a4a29fc2c89f"
checksum = "cd7d5a2cecb58716e47d67d5703a249964b14c7be1ec3cad3affc295b2d1c35d"
dependencies = [
"cfg-if",
"const-random",
@@ -1389,9 +1389,9 @@ dependencies = [
[[package]]
name = "crc32c"
version = "0.6.5"
version = "0.6.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "89254598aa9b9fa608de44b3ae54c810f0f06d755e24c50177f1f8f31ff50ce2"
checksum = "3dfea2db42e9927a3845fb268a10a72faed6d416065f77873f05e411457c363e"
dependencies = [
"rustc_version",
]

View File

@@ -230,8 +230,6 @@ postgres=# select * from t;
> cargo neon stop
```
More advanced usages can be found at [Control Plane and Neon Local](./control_plane/README.md).
#### Handling build failures
If you encounter errors while setting up the initial tenant, it's best to stop everything (`cargo neon stop`) and remove the `.neon` directory. Then fix the problems and start the setup again.

View File

@@ -18,6 +18,8 @@ use futures::future::join_all;
use futures::stream::FuturesUnordered;
use futures::StreamExt;
use postgres::{Client, NoTls};
use tokio;
use tokio_postgres;
use tracing::{debug, error, info, instrument, warn};
use utils::id::{TenantId, TimelineId};
use utils::lsn::Lsn;

View File

@@ -71,7 +71,7 @@ More specifically, here is an example ext_index.json
}
}
*/
use anyhow::Result;
use anyhow::{self, Result};
use anyhow::{bail, Context};
use bytes::Bytes;
use compute_api::spec::RemoteExtSpec;

View File

@@ -13,6 +13,8 @@ use compute_api::responses::{ComputeStatus, ComputeStatusResponse, GenericAPIErr
use anyhow::Result;
use hyper::service::{make_service_fn, service_fn};
use hyper::{Body, Method, Request, Response, Server, StatusCode};
use num_cpus;
use serde_json;
use tokio::task;
use tracing::{error, info, warn};
use tracing_utils::http::OtelName;

View File

@@ -1,26 +0,0 @@
# Control Plane and Neon Local
This crate contains tools to start a Neon development environment locally. This utility can be used with the `cargo neon` command.
## Example: Start with Postgres 16
To create and start a local development environment with Postgres 16, you will need to provide `--pg-version` flag to 3 of the start-up commands.
```shell
cargo neon init --pg-version 16
cargo neon start
cargo neon tenant create --set-default --pg-version 16
cargo neon endpoint create main --pg-version 16
cargo neon endpoint start main
```
## Example: Create Test User and Database
By default, `cargo neon` starts an endpoint with the `cloud_admin` role and the `postgres` database. If you want a role and a database similar to what we have on the cloud service, you can create them with the following commands when starting an endpoint.
```shell
cargo neon endpoint create main --pg-version 16 --update-catalog true
cargo neon endpoint start main --create-test-user true
```
The first command creates `neon_superuser` and necessary roles. The second command creates `test` user and `neondb` database. You will see a connection string that connects you to the test user after running the second command.

View File

@@ -1,2 +0,0 @@
ALTER TABLE tenant_shards ALTER generation SET NOT NULL;
ALTER TABLE tenant_shards ALTER generation_pageserver SET NOT NULL;

View File

@@ -1,4 +0,0 @@
ALTER TABLE tenant_shards ALTER generation DROP NOT NULL;
ALTER TABLE tenant_shards ALTER generation_pageserver DROP NOT NULL;

View File

@@ -1,10 +1,9 @@
use crate::reconciler::ReconcileError;
use crate::service::{Service, STARTUP_RECONCILE_TIMEOUT};
use crate::PlacementPolicy;
use hyper::{Body, Request, Response};
use hyper::{StatusCode, Uri};
use pageserver_api::models::{
TenantConfigRequest, TenantCreateRequest, TenantLocationConfigRequest, TenantShardSplitRequest,
TenantCreateRequest, TenantLocationConfigRequest, TenantShardSplitRequest,
TenantTimeTravelRequest, TimelineCreateRequest,
};
use pageserver_api::shard::TenantShardId;
@@ -118,14 +117,9 @@ async fn handle_tenant_create(
check_permissions(&req, Scope::PageServerApi)?;
let create_req = json_request::<TenantCreateRequest>(&mut req).await?;
// TODO: enable specifying this. Using Single as a default helps legacy tests to work (they
// have no expectation of HA).
let placement_policy = PlacementPolicy::Single;
json_response(
StatusCode::CREATED,
service.tenant_create(create_req, placement_policy).await?,
service.tenant_create(create_req).await?,
)
}
@@ -191,27 +185,6 @@ async fn handle_tenant_location_config(
)
}
async fn handle_tenant_config_set(
service: Arc<Service>,
mut req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::PageServerApi)?;
let config_req = json_request::<TenantConfigRequest>(&mut req).await?;
json_response(StatusCode::OK, service.tenant_config_set(config_req).await?)
}
async fn handle_tenant_config_get(
service: Arc<Service>,
req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
check_permissions(&req, Scope::PageServerApi)?;
json_response(StatusCode::OK, service.tenant_config_get(tenant_id)?)
}
async fn handle_tenant_time_travel_remote_storage(
service: Arc<Service>,
mut req: Request<Body>,
@@ -243,15 +216,7 @@ async fn handle_tenant_time_travel_remote_storage(
done_if_after_raw,
)
.await?;
json_response(StatusCode::OK, ())
}
async fn handle_tenant_secondary_download(
service: Arc<Service>,
req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
service.tenant_secondary_download(tenant_id).await?;
json_response(StatusCode::OK, ())
}
@@ -586,21 +551,12 @@ pub fn make_router(
.delete("/v1/tenant/:tenant_id", |r| {
tenant_service_handler(r, handle_tenant_delete)
})
.put("/v1/tenant/config", |r| {
tenant_service_handler(r, handle_tenant_config_set)
})
.get("/v1/tenant/:tenant_id/config", |r| {
tenant_service_handler(r, handle_tenant_config_get)
})
.put("/v1/tenant/:tenant_id/location_config", |r| {
tenant_service_handler(r, handle_tenant_location_config)
})
.put("/v1/tenant/:tenant_id/time_travel_remote_storage", |r| {
tenant_service_handler(r, handle_tenant_time_travel_remote_storage)
})
.post("/v1/tenant/:tenant_id/secondary/download", |r| {
tenant_service_handler(r, handle_tenant_secondary_download)
})
// Timeline operations
.delete("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
tenant_service_handler(r, handle_tenant_timeline_delete)

View File

@@ -13,20 +13,14 @@ mod schema;
pub mod service;
mod tenant_state;
#[derive(Clone, Serialize, Deserialize, Debug, PartialEq, Eq)]
#[derive(Clone, Serialize, Deserialize, Debug)]
enum PlacementPolicy {
/// Cheapest way to attach a tenant: just one pageserver, no secondary
Single,
/// Production-ready way to attach a tenant: one attached pageserver and
/// some number of secondaries.
Double(usize),
/// Create one secondary mode location. This is useful when onboarding
/// a tenant, or for an idle tenant that we might want to bring online quickly.
Secondary,
/// Do not attach to any pageservers. This is appropriate for tenants that
/// have been idle for a long time, where we do not mind some delay in making
/// them available in future.
/// Do not attach to any pageservers
Detached,
}

View File

@@ -9,7 +9,7 @@ use attachment_service::http::make_router;
use attachment_service::metrics::preinitialize_metrics;
use attachment_service::persistence::Persistence;
use attachment_service::service::{Config, Service};
use aws_config::{BehaviorVersion, Region};
use aws_config::{self, BehaviorVersion, Region};
use camino::Utf8PathBuf;
use clap::Parser;
use diesel::Connection;
@@ -79,38 +79,13 @@ impl Secrets {
"neon-storage-controller-control-plane-jwt-token";
const PUBLIC_KEY_SECRET: &'static str = "neon-storage-controller-public-key";
const DATABASE_URL_ENV: &'static str = "DATABASE_URL";
const PAGESERVER_JWT_TOKEN_ENV: &'static str = "PAGESERVER_JWT_TOKEN";
const CONTROL_PLANE_JWT_TOKEN_ENV: &'static str = "CONTROL_PLANE_JWT_TOKEN";
const PUBLIC_KEY_ENV: &'static str = "PUBLIC_KEY";
/// Load secrets from, in order of preference:
/// - CLI args if database URL is provided on the CLI
/// - Environment variables if DATABASE_URL is set.
/// - AWS Secrets Manager secrets
async fn load(args: &Cli) -> anyhow::Result<Self> {
match &args.database_url {
Some(url) => Self::load_cli(url, args),
None => match std::env::var(Self::DATABASE_URL_ENV) {
Ok(database_url) => Self::load_env(database_url),
Err(_) => Self::load_aws_sm().await,
},
None => Self::load_aws_sm().await,
}
}
fn load_env(database_url: String) -> anyhow::Result<Self> {
let public_key = match std::env::var(Self::PUBLIC_KEY_ENV) {
Ok(public_key) => Some(JwtAuth::from_key(public_key).context("Loading public key")?),
Err(_) => None,
};
Ok(Self {
database_url,
public_key,
jwt_token: std::env::var(Self::PAGESERVER_JWT_TOKEN_ENV).ok(),
control_plane_jwt_token: std::env::var(Self::CONTROL_PLANE_JWT_TOKEN_ENV).ok(),
})
}
async fn load_aws_sm() -> anyhow::Result<Self> {
let Ok(region) = std::env::var("AWS_REGION") else {
anyhow::bail!("AWS_REGION is not set, cannot load secrets automatically: either set this, or use CLI args to supply secrets");
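
The hunks above concern the storage controller's secrets loading, whose doc comment describes a precedence of CLI-provided database URL, then the `DATABASE_URL` environment variable, then AWS Secrets Manager. A minimal sketch of that kind of fallback chain, with hypothetical names and a stubbed-out Secrets Manager path, might look like this:

```rust
use std::env;

/// Hypothetical, trimmed-down secrets container.
#[derive(Debug)]
struct Secrets {
    database_url: String,
    jwt_token: Option<String>,
}

/// Stub for the AWS Secrets Manager path; a real service would call the AWS SDK here.
fn load_from_aws_sm() -> Result<Secrets, String> {
    Err("AWS Secrets Manager lookup is not implemented in this sketch".to_string())
}

/// Resolve secrets in order of preference: CLI argument, then environment, then the remote store.
fn load_secrets(cli_database_url: Option<&str>) -> Result<Secrets, String> {
    if let Some(url) = cli_database_url {
        return Ok(Secrets {
            database_url: url.to_string(),
            jwt_token: env::var("PAGESERVER_JWT_TOKEN").ok(),
        });
    }
    if let Ok(url) = env::var("DATABASE_URL") {
        return Ok(Secrets {
            database_url: url,
            jwt_token: env::var("PAGESERVER_JWT_TOKEN").ok(),
        });
    }
    load_from_aws_sm()
}

fn main() {
    match load_secrets(None) {
        Ok(secrets) => println!("loaded secrets: {secrets:?}"),
        Err(err) => eprintln!("failed to load secrets: {err}"),
    }
}
```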

View File

@@ -7,10 +7,8 @@ use self::split_state::SplitState;
use camino::Utf8Path;
use camino::Utf8PathBuf;
use diesel::pg::PgConnection;
use diesel::{
Connection, ExpressionMethods, Insertable, QueryDsl, QueryResult, Queryable, RunQueryDsl,
Selectable, SelectableHelper,
};
use diesel::prelude::*;
use diesel::Connection;
use pageserver_api::controller_api::NodeSchedulingPolicy;
use pageserver_api::models::TenantConfig;
use pageserver_api::shard::{ShardCount, ShardNumber, TenantShardId};
@@ -333,15 +331,7 @@ impl Persistence {
shard_number: ShardNumber(tsp.shard_number as u8),
shard_count: ShardCount::new(tsp.shard_count as u8),
};
let Some(g) = tsp.generation else {
// If the generation_pageserver column was non-NULL, then the generation column should also be non-NULL:
// we only set generation_pageserver when setting generation.
return Err(DatabaseError::Logical(
"Generation should always be set after incrementing".to_string(),
));
};
result.insert(tenant_shard_id, Generation::new(g as u32));
result.insert(tenant_shard_id, Generation::new(tsp.generation as u32));
}
Ok(result)
@@ -374,85 +364,7 @@ impl Persistence {
})
.await?;
// Generation is always non-null in the result: if the generation column had been NULL, then we
// should have experienced an SQL Conflict error while executing a query that tries to increment it.
debug_assert!(updated.generation.is_some());
let Some(g) = updated.generation else {
return Err(DatabaseError::Logical(
"Generation should always be set after incrementing".to_string(),
)
.into());
};
Ok(Generation::new(g as u32))
}
/// For use when updating a persistent property of a tenant, such as its config or placement_policy.
///
/// Do not use this for setting generation, unless in the special onboarding code path (/location_config)
/// API: use [`Self::increment_generation`] instead. Setting the generation via this route is a one-time thing
/// that we only do the first time a tenant is set to an attached policy via /location_config.
pub(crate) async fn update_tenant_shard(
&self,
tenant_shard_id: TenantShardId,
input_placement_policy: PlacementPolicy,
input_config: TenantConfig,
input_generation: Option<Generation>,
) -> DatabaseResult<()> {
use crate::schema::tenant_shards::dsl::*;
self.with_conn(move |conn| {
let query = diesel::update(tenant_shards)
.filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
.filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
.filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32));
if let Some(input_generation) = input_generation {
// Update includes generation column
query
.set((
generation.eq(Some(input_generation.into().unwrap() as i32)),
placement_policy
.eq(serde_json::to_string(&input_placement_policy).unwrap()),
config.eq(serde_json::to_string(&input_config).unwrap()),
))
.execute(conn)?;
} else {
// Update does not include generation column
query
.set((
placement_policy
.eq(serde_json::to_string(&input_placement_policy).unwrap()),
config.eq(serde_json::to_string(&input_config).unwrap()),
))
.execute(conn)?;
}
Ok(())
})
.await?;
Ok(())
}
pub(crate) async fn update_tenant_config(
&self,
input_tenant_id: TenantId,
input_config: TenantConfig,
) -> DatabaseResult<()> {
use crate::schema::tenant_shards::dsl::*;
self.with_conn(move |conn| {
diesel::update(tenant_shards)
.filter(tenant_id.eq(input_tenant_id.to_string()))
.set((config.eq(serde_json::to_string(&input_config).unwrap()),))
.execute(conn)?;
Ok(())
})
.await?;
Ok(())
Ok(Generation::new(updated.generation as u32))
}
pub(crate) async fn detach(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> {
@@ -463,7 +375,7 @@ impl Persistence {
.filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
.filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32))
.set((
generation_pageserver.eq(Option::<i64>::None),
generation_pageserver.eq(i64::MAX),
placement_policy.eq(serde_json::to_string(&PlacementPolicy::Detached).unwrap()),
))
.execute(conn)?;
@@ -589,15 +501,12 @@ pub(crate) struct TenantShardPersistence {
pub(crate) shard_stripe_size: i32,
// Latest generation number: next time we attach, increment this
// and use the incremented number when attaching.
//
// Generation is only None when first onboarding a tenant, where it may
// be in PlacementPolicy::Secondary and therefore have no valid generation state.
pub(crate) generation: Option<i32>,
// and use the incremented number when attaching
pub(crate) generation: i32,
// Currently attached pageserver
#[serde(rename = "pageserver")]
pub(crate) generation_pageserver: Option<i64>,
pub(crate) generation_pageserver: i64,
#[serde(default)]
pub(crate) placement_policy: String,

View File

@@ -26,7 +26,7 @@ pub(super) struct Reconciler {
/// of a tenant's state from when we spawned a reconcile task.
pub(super) tenant_shard_id: TenantShardId,
pub(crate) shard: ShardIdentity,
pub(crate) generation: Option<Generation>,
pub(crate) generation: Generation,
pub(crate) intent: TargetState,
pub(crate) config: TenantConfig,
pub(crate) observed: ObservedState,
@@ -312,7 +312,7 @@ impl Reconciler {
&self.shard,
&self.config,
LocationConfigMode::AttachedStale,
self.generation,
Some(self.generation),
None,
);
self.location_config(origin_ps_id, stale_conf, Some(Duration::from_secs(10)))
@@ -335,17 +335,16 @@ impl Reconciler {
}
// Increment generation before attaching to new pageserver
self.generation = Some(
self.persistence
.increment_generation(self.tenant_shard_id, dest_ps_id)
.await?,
);
self.generation = self
.persistence
.increment_generation(self.tenant_shard_id, dest_ps_id)
.await?;
let dest_conf = build_location_config(
&self.shard,
&self.config,
LocationConfigMode::AttachedMulti,
self.generation,
Some(self.generation),
None,
);
@@ -402,7 +401,7 @@ impl Reconciler {
&self.shard,
&self.config,
LocationConfigMode::AttachedSingle,
self.generation,
Some(self.generation),
None,
);
self.location_config(dest_ps_id, dest_final_conf.clone(), None)
@@ -434,62 +433,22 @@ impl Reconciler {
// If the attached pageserver is not attached, do so now.
if let Some(node_id) = self.intent.attached {
// If we are in an attached policy, then generation must have been set (null generations
// are only present when a tenant is initially loaded with a secondary policy)
debug_assert!(self.generation.is_some());
let Some(generation) = self.generation else {
return Err(ReconcileError::Other(anyhow::anyhow!(
"Attempted to attach with NULL generation"
)));
};
let mut wanted_conf = attached_location_conf(generation, &self.shard, &self.config);
let mut wanted_conf =
attached_location_conf(self.generation, &self.shard, &self.config);
match self.observed.locations.get(&node_id) {
Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {
// Nothing to do
tracing::info!(%node_id, "Observed configuration already correct.")
}
observed => {
_ => {
// In all cases other than a matching observed configuration, we will
// reconcile this location. This includes locations with different configurations, as well
// as locations with unknown (None) observed state.
// The general case is to increment the generation. However, there are cases
// where this is not necessary:
// - if we are only updating the TenantConf part of the location
// - if we are only changing the attachment mode (e.g. going to attachedmulti or attachedstale)
// and the location was already in the correct generation
let increment_generation = match observed {
None => true,
Some(ObservedStateLocation { conf: None }) => true,
Some(ObservedStateLocation {
conf: Some(observed),
}) => {
let generations_match = observed.generation == wanted_conf.generation;
use LocationConfigMode::*;
let mode_transition_requires_gen_inc =
match (observed.mode, wanted_conf.mode) {
// Usually the short-lived attachment modes (multi and stale) are only used
// in the case of [`Self::live_migrate`], but it is simple to handle them correctly
// here too. Locations are allowed to go Single->Stale and Multi->Single within the same generation.
(AttachedSingle, AttachedStale) => false,
(AttachedMulti, AttachedSingle) => false,
(lhs, rhs) => lhs != rhs,
};
!generations_match || mode_transition_requires_gen_inc
}
};
if increment_generation {
let generation = self
.persistence
.increment_generation(self.tenant_shard_id, node_id)
.await?;
self.generation = Some(generation);
wanted_conf.generation = generation.into();
}
self.generation = self
.persistence
.increment_generation(self.tenant_shard_id, node_id)
.await?;
wanted_conf.generation = self.generation.into();
tracing::info!(%node_id, "Observed configuration requires update.");
self.location_config(node_id, wanted_conf, None).await?;
self.compute_notify().await?;

View File

@@ -284,6 +284,7 @@ pub(crate) mod test_utils {
#[cfg(test)]
mod tests {
use super::*;
use utils::id::NodeId;
use crate::tenant_state::IntentState;
#[test]

View File

@@ -17,8 +17,8 @@ diesel::table! {
shard_number -> Int4,
shard_count -> Int4,
shard_stripe_size -> Int4,
generation -> Nullable<Int4>,
generation_pageserver -> Nullable<Int8>,
generation -> Int4,
generation_pageserver -> Int8,
placement_policy -> Varchar,
splitting -> Int2,
config -> Text,

View File

@@ -14,13 +14,10 @@ use control_plane::attachment_service::{
use diesel::result::DatabaseErrorKind;
use futures::{stream::FuturesUnordered, StreamExt};
use hyper::StatusCode;
use pageserver_api::{
controller_api::{
NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, NodeSchedulingPolicy,
TenantCreateResponse, TenantCreateResponseShard, TenantLocateResponse,
TenantLocateResponseShard, TenantShardMigrateRequest, TenantShardMigrateResponse,
},
models::TenantConfigRequest,
use pageserver_api::controller_api::{
NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, NodeSchedulingPolicy,
TenantCreateResponse, TenantCreateResponseShard, TenantLocateResponse,
TenantLocateResponseShard, TenantShardMigrateRequest, TenantShardMigrateResponse,
};
use pageserver_api::{
models::{
@@ -68,11 +65,6 @@ const SHORT_RECONCILE_TIMEOUT: Duration = Duration::from_secs(5);
// some data in it.
const RECONCILE_TIMEOUT: Duration = Duration::from_secs(30);
// If we receive a call using Secondary mode initially, it will omit generation. We will initialize
// tenant shards into this generation, and as long as it remains in this generation, we will accept
// input generation from future requests as authoritative.
const INITIAL_GENERATION: Generation = Generation::new(0);
/// How long [`Service::startup_reconcile`] is allowed to take before it should give
/// up on unresponsive pageservers and proceed.
pub(crate) const STARTUP_RECONCILE_TIMEOUT: Duration = Duration::from_secs(30);
@@ -175,21 +167,6 @@ impl From<ReconcileWaitError> for ApiError {
}
}
#[allow(clippy::large_enum_variant)]
enum TenantCreateOrUpdate {
Create((TenantCreateRequest, PlacementPolicy)),
Update(Vec<ShardUpdate>),
}
struct ShardUpdate {
tenant_shard_id: TenantShardId,
placement_policy: PlacementPolicy,
tenant_config: TenantConfig,
/// If this is None, generation is not updated.
generation: Option<Generation>,
}
impl Service {
pub fn get_config(&self) -> &Config {
&self.config
@@ -594,9 +571,6 @@ impl Service {
// the shard so that a future [`TenantState::maybe_reconcile`] will try again.
tenant.pending_compute_notification = result.pending_compute_notification;
// Let the TenantState know it is idle.
tenant.reconcile_complete(result.sequence);
match result.result {
Ok(()) => {
for (node_id, loc) in &result.observed.locations {
@@ -687,8 +661,8 @@ impl Service {
// after when pageservers start up and register.
let mut node_ids = HashSet::new();
for tsp in &tenant_shard_persistence {
if let Some(node_id) = tsp.generation_pageserver {
node_ids.insert(node_id);
if tsp.generation_pageserver != i64::MAX {
node_ids.insert(tsp.generation_pageserver);
}
}
for node_id in node_ids {
@@ -725,15 +699,18 @@ impl Service {
// We will populate intent properly later in [`Self::startup_reconcile`], initially populate
// it with what we can infer: the node for which a generation was most recently issued.
let mut intent = IntentState::new();
if let Some(generation_pageserver) = tsp.generation_pageserver {
intent.set_attached(&mut scheduler, Some(NodeId(generation_pageserver as u64)));
if tsp.generation_pageserver != i64::MAX {
intent.set_attached(
&mut scheduler,
Some(NodeId(tsp.generation_pageserver as u64)),
);
}
let new_tenant = TenantState {
tenant_shard_id,
shard: shard_identity,
sequence: Sequence::initial(),
generation: tsp.generation.map(|g| Generation::new(g as u32)),
generation: Generation::new(tsp.generation as u32),
policy: serde_json::from_str(&tsp.placement_policy).unwrap(),
intent,
observed: ObservedState::new(),
@@ -813,8 +790,8 @@ impl Service {
shard_number: attach_req.tenant_shard_id.shard_number.0 as i32,
shard_count: attach_req.tenant_shard_id.shard_count.literal() as i32,
shard_stripe_size: 0,
generation: Some(0),
generation_pageserver: None,
generation: 0,
generation_pageserver: i64::MAX,
placement_policy: serde_json::to_string(&PlacementPolicy::default()).unwrap(),
config: serde_json::to_string(&TenantConfig::default()).unwrap(),
splitting: SplitState::default(),
@@ -869,7 +846,7 @@ impl Service {
.expect("Checked for existence above");
if let Some(new_generation) = new_generation {
tenant_state.generation = Some(new_generation);
tenant_state.generation = new_generation;
} else {
// This is a detach notification. We must update placement policy to avoid re-attaching
// during background scheduling/reconciliation, or during attachment service restart.
@@ -919,7 +896,7 @@ impl Service {
node_id,
ObservedStateLocation {
conf: Some(attached_location_conf(
tenant_state.generation.unwrap(),
tenant_state.generation,
&tenant_state.shard,
&tenant_state.config,
)),
@@ -933,7 +910,7 @@ impl Service {
Ok(AttachHookResponse {
gen: attach_req
.node_id
.map(|_| tenant_state.generation.expect("Test hook, not used on tenants that are mid-onboarding with a NULL generation").into().unwrap()),
.map(|_| tenant_state.generation.into().unwrap()),
})
}
@@ -946,7 +923,7 @@ impl Service {
attachment: tenant_state.and_then(|s| {
s.intent
.get_attached()
.map(|ps| (s.generation.expect("Test hook, not used on tenants that are mid-onboarding with a NULL generation").into().unwrap(), ps))
.map(|ps| (s.generation.into().unwrap(), ps))
}),
}
}
@@ -996,17 +973,7 @@ impl Service {
continue;
};
// If [`Persistence::re_attach`] selected this shard, it must have already
// had a generation set.
debug_assert!(shard_state.generation.is_some());
let Some(old_gen) = shard_state.generation else {
// Should never happen: would only return incremented generation
// for a tenant that already had a non-null generation.
return Err(ApiError::InternalServerError(anyhow::anyhow!(
"Generation must be set while re-attaching"
)));
};
shard_state.generation = Some(std::cmp::max(old_gen, new_gen));
shard_state.generation = std::cmp::max(shard_state.generation, new_gen);
if let Some(observed) = shard_state
.observed
.locations
@@ -1036,7 +1003,7 @@ impl Service {
for req_tenant in validate_req.tenants {
if let Some(tenant_state) = locked.tenants.get(&req_tenant.id) {
let valid = tenant_state.generation == Some(Generation::new(req_tenant.gen));
let valid = tenant_state.generation == Generation::new(req_tenant.gen);
tracing::info!(
"handle_validate: {}(gen {}): valid={valid} (latest {:?})",
req_tenant.id,
@@ -1063,9 +1030,8 @@ impl Service {
pub(crate) async fn tenant_create(
&self,
create_req: TenantCreateRequest,
placement_policy: PlacementPolicy,
) -> Result<TenantCreateResponse, ApiError> {
let (response, waiters) = self.do_tenant_create(create_req, placement_policy).await?;
let (response, waiters) = self.do_tenant_create(create_req).await?;
self.await_waiters(waiters, SHORT_RECONCILE_TIMEOUT).await?;
Ok(response)
@@ -1074,7 +1040,6 @@ impl Service {
pub(crate) async fn do_tenant_create(
&self,
create_req: TenantCreateRequest,
placement_policy: PlacementPolicy,
) -> Result<(TenantCreateResponse, Vec<ReconcilerWaiter>), ApiError> {
// This service expects to handle sharding itself: it is an error to try and directly create
// a particular shard here.
@@ -1100,27 +1065,9 @@ impl Service {
})
.collect::<Vec<_>>();
// If the caller specifies a None generation, it means "start from default". This is different
// to [`Self::tenant_location_config`], where a None generation is used to represent
// an incompletely-onboarded tenant.
let initial_generation = if matches!(placement_policy, PlacementPolicy::Secondary) {
tracing::info!(
"tenant_create: secondary mode, generation is_some={}",
create_req.generation.is_some()
);
create_req.generation.map(Generation::new)
} else {
tracing::info!(
"tenant_create: not secondary mode, generation is_some={}",
create_req.generation.is_some()
);
Some(
create_req
.generation
.map(Generation::new)
.unwrap_or(INITIAL_GENERATION),
)
};
// TODO: enable specifying this. Using Single as a default helps legacy tests to work (they
// have no expectation of HA).
let placement_policy: PlacementPolicy = PlacementPolicy::Single;
// Ordering: we persist tenant shards before creating them on the pageserver. This enables a caller
// to clean up after themselves by issuing a tenant deletion if something goes wrong and we restart
@@ -1132,10 +1079,8 @@ impl Service {
shard_number: tenant_shard_id.shard_number.0 as i32,
shard_count: tenant_shard_id.shard_count.literal() as i32,
shard_stripe_size: create_req.shard_parameters.stripe_size.0 as i32,
generation: initial_generation.map(|g| g.into().unwrap() as i32),
// The pageserver is not known until scheduling happens: we will set this column when
// incrementing the generation the first time we attach to a pageserver.
generation_pageserver: None,
generation: create_req.generation.map(|g| g as i32).unwrap_or(0),
generation_pageserver: i64::MAX,
placement_policy: serde_json::to_string(&placement_policy).unwrap(),
config: serde_json::to_string(&create_req.config).unwrap(),
splitting: SplitState::default(),
@@ -1175,17 +1120,15 @@ impl Service {
))
})?;
if let Some(node_id) = entry.get().intent.get_attached() {
let generation = entry
response_shards.push(TenantCreateResponseShard {
shard_id: tenant_shard_id,
node_id: entry
.get()
.generation
.expect("Generation is set when in attached mode");
response_shards.push(TenantCreateResponseShard {
shard_id: tenant_shard_id,
node_id: *node_id,
generation: generation.into().unwrap(),
});
}
.intent
.get_attached()
.expect("We just set pageserver if it was None"),
generation: entry.get().generation.into().unwrap(),
});
continue;
}
@@ -1199,7 +1142,9 @@ impl Service {
placement_policy.clone(),
);
state.generation = initial_generation;
if let Some(create_gen) = create_req.generation {
state.generation = Generation::new(create_gen);
}
state.config = create_req.config.clone();
state.schedule(scheduler).map_err(|e| {
@@ -1208,18 +1153,14 @@ impl Service {
))
})?;
// Only include shards in result if we are attaching: the purpose
// of the response is to tell the caller where the shards are attached.
if let Some(node_id) = state.intent.get_attached() {
let generation = state
.generation
.expect("Generation is set when in attached mode");
response_shards.push(TenantCreateResponseShard {
shard_id: tenant_shard_id,
node_id: *node_id,
generation: generation.into().unwrap(),
});
}
response_shards.push(TenantCreateResponseShard {
shard_id: tenant_shard_id,
node_id: state
.intent
.get_attached()
.expect("We just set pageserver if it was None"),
generation: state.generation.into().unwrap(),
});
entry.insert(state)
}
};
@@ -1273,114 +1214,12 @@ impl Service {
Ok(())
}
/// Part of [`Self::tenant_location_config`]: dissect an incoming location config request,
/// and transform it into either a tenant creation of a series of shard updates.
fn tenant_location_config_prepare(
&self,
tenant_id: TenantId,
req: TenantLocationConfigRequest,
) -> TenantCreateOrUpdate {
let mut updates = Vec::new();
let mut locked = self.inner.write().unwrap();
let (nodes, tenants, _scheduler) = locked.parts_mut();
// Use location config mode as an indicator of policy.
let placement_policy = match req.config.mode {
LocationConfigMode::Detached => PlacementPolicy::Detached,
LocationConfigMode::Secondary => PlacementPolicy::Secondary,
LocationConfigMode::AttachedMulti
| LocationConfigMode::AttachedSingle
| LocationConfigMode::AttachedStale => {
if nodes.len() > 1 {
PlacementPolicy::Double(1)
} else {
// Convenience for dev/test: if we just have one pageserver, import
// tenants into Single mode so that scheduling will succeed.
PlacementPolicy::Single
}
}
};
let mut create = true;
for (shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) {
// Saw an existing shard: this is not a creation
create = false;
// Shards may have initially been created by a Secondary request, where we
// would have left generation as None.
//
// We only update generation the first time we see an attached-mode request,
// and if there is no existing generation set. The caller is responsible for
// ensuring that no non-storage-controller pageserver ever uses a higher
// generation than they passed in here.
use LocationConfigMode::*;
let set_generation = match req.config.mode {
AttachedMulti | AttachedSingle | AttachedStale if shard.generation.is_none() => {
req.config.generation.map(Generation::new)
}
_ => None,
};
if shard.policy != placement_policy
|| shard.config != req.config.tenant_conf
|| set_generation.is_some()
{
updates.push(ShardUpdate {
tenant_shard_id: *shard_id,
placement_policy: placement_policy.clone(),
tenant_config: req.config.tenant_conf.clone(),
generation: set_generation,
});
}
}
if create {
use LocationConfigMode::*;
let generation = match req.config.mode {
AttachedMulti | AttachedSingle | AttachedStale => req.config.generation,
// If a caller provided a generation in a non-attached request, ignore it
// and leave our generation as None: this enables a subsequent update to set
// the generation when setting an attached mode for the first time.
_ => None,
};
TenantCreateOrUpdate::Create(
// Synthesize a creation request
(
TenantCreateRequest {
new_tenant_id: TenantShardId::unsharded(tenant_id),
generation,
shard_parameters: ShardParameters {
// Must preserve the incoming shard_count to distinguish unsharded (0)
// from single-sharded (1): this distinction appears in the S3 keys of the tenant.
count: req.tenant_id.shard_count,
// We only import un-sharded or single-sharded tenants, so stripe
// size can be made up arbitrarily here.
stripe_size: ShardParameters::DEFAULT_STRIPE_SIZE,
},
config: req.config.tenant_conf,
},
placement_policy,
),
)
} else {
TenantCreateOrUpdate::Update(updates)
}
}
/// This API is used by the cloud control plane to migrate unsharded tenants that it created
/// directly with pageservers into this service.
///
/// Cloud control plane MUST NOT continue issuing GENERATION NUMBERS for this tenant once it
/// has attempted to call this API. Failure to oblige to this rule may lead to S3 corruption.
/// Think of the first attempt to call this API as a transfer of absolute authority over the
/// tenant's source of generation numbers.
///
/// The mode in this request provides coarse-grained control of tenants:
/// This API is used by the cloud control plane to do coarse-grained control of tenants:
/// - Call with mode Attached* to upsert the tenant.
/// - Call with mode Secondary to either onboard a tenant without attaching it, or
/// to set an existing tenant to PolicyMode::Secondary
/// - Call with mode Detached to switch to PolicyMode::Detached
///
/// In future, calling with mode Secondary may switch to a detach-lite mode in which a tenant only has
/// secondary locations.
pub(crate) async fn tenant_location_config(
&self,
tenant_id: TenantId,
@@ -1392,96 +1231,131 @@ impl Service {
)));
}
// First check if this is a creation or an update
let create_or_update = self.tenant_location_config_prepare(tenant_id, req);
let mut waiters = Vec::new();
let mut result = TenantLocationConfigResponse { shards: Vec::new() };
let waiters = match create_or_update {
TenantCreateOrUpdate::Create((create_req, placement_policy)) => {
let (create_resp, waiters) =
self.do_tenant_create(create_req, placement_policy).await?;
result.shards = create_resp
.shards
.into_iter()
.map(|s| TenantShardLocation {
node_id: s.node_id,
shard_id: s.shard_id,
})
.collect();
waiters
}
TenantCreateOrUpdate::Update(updates) => {
// Persist updates
// Ordering: write to the database before applying changes in-memory, so that
// we will not appear time-travel backwards on a restart.
for ShardUpdate {
tenant_shard_id,
placement_policy,
tenant_config,
generation,
} in &updates
{
self.persistence
.update_tenant_shard(
*tenant_shard_id,
placement_policy.clone(),
tenant_config.clone(),
*generation,
)
.await?;
}
let maybe_create = {
let mut locked = self.inner.write().unwrap();
let result_tx = locked.result_tx.clone();
let compute_hook = locked.compute_hook.clone();
let (nodes, tenants, scheduler) = locked.parts_mut();
// Apply updates in-memory
let mut waiters = Vec::new();
{
let mut locked = self.inner.write().unwrap();
let result_tx = locked.result_tx.clone();
let compute_hook = locked.compute_hook.clone();
let (nodes, tenants, scheduler) = locked.parts_mut();
// Maybe we have existing shards
let mut create = true;
for (shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) {
// Saw an existing shard: this is not a creation
create = false;
for ShardUpdate {
tenant_shard_id,
placement_policy,
tenant_config,
generation: update_generation,
} in updates
{
let Some(shard) = tenants.get_mut(&tenant_shard_id) else {
tracing::warn!("Shard {tenant_shard_id} removed while updating");
continue;
};
// Note that for existing tenants we do _not_ respect the generation in the request: this is likely
// to be stale. Once a tenant is created in this service, our view of generation is authoritative, and
// callers' generations may be ignored. This represents a one-way migration of tenants from the outer
// cloud control plane into this service.
shard.policy = placement_policy;
shard.config = tenant_config;
if let Some(generation) = update_generation {
shard.generation = Some(generation);
}
shard.schedule(scheduler)?;
let maybe_waiter = shard.maybe_reconcile(
result_tx.clone(),
nodes,
&compute_hook,
&self.config,
&self.persistence,
&self.gate,
&self.cancel,
);
if let Some(waiter) = maybe_waiter {
waiters.push(waiter);
}
if let Some(node_id) = shard.intent.get_attached() {
result.shards.push(TenantShardLocation {
shard_id: tenant_shard_id,
node_id: *node_id,
})
// Use location config mode as an indicator of policy: if they ask for
// attached we go to default HA attached mode. If they ask for secondary
// we go to secondary-only mode. If they ask for detached we detach.
match req.config.mode {
LocationConfigMode::Detached => {
shard.policy = PlacementPolicy::Detached;
}
LocationConfigMode::Secondary => {
// TODO: implement secondary-only mode.
todo!();
}
LocationConfigMode::AttachedMulti
| LocationConfigMode::AttachedSingle
| LocationConfigMode::AttachedStale => {
// TODO: persistence for changes in policy
if nodes.len() > 1 {
shard.policy = PlacementPolicy::Double(1)
} else {
// Convenience for dev/test: if we just have one pageserver, import
// tenants into Single mode so that scheduling will succeed.
shard.policy = PlacementPolicy::Single
}
}
}
waiters
shard.schedule(scheduler)?;
let maybe_waiter = shard.maybe_reconcile(
result_tx.clone(),
nodes,
&compute_hook,
&self.config,
&self.persistence,
&self.gate,
&self.cancel,
);
if let Some(waiter) = maybe_waiter {
waiters.push(waiter);
}
if let Some(node_id) = shard.intent.get_attached() {
result.shards.push(TenantShardLocation {
shard_id: *shard_id,
node_id: *node_id,
})
}
}
if create {
// Validate request mode
match req.config.mode {
LocationConfigMode::Detached | LocationConfigMode::Secondary => {
// When using this API to onboard an existing tenant to this service, it must start in
// an attached state, because we need the request to come with a generation
return Err(ApiError::BadRequest(anyhow::anyhow!(
"Imported tenant must be in attached mode"
)));
}
LocationConfigMode::AttachedMulti
| LocationConfigMode::AttachedSingle
| LocationConfigMode::AttachedStale => {
// Pass
}
}
// Validate request generation
let Some(generation) = req.config.generation else {
// We can only import attached tenants, because we need the request to come with a generation
return Err(ApiError::BadRequest(anyhow::anyhow!(
"Generation is mandatory when importing tenant"
)));
};
// Synthesize a creation request
Some(TenantCreateRequest {
new_tenant_id: TenantShardId::unsharded(tenant_id),
generation: Some(generation),
shard_parameters: ShardParameters {
// Must preserve the incoming shard_count to distinguish unsharded (0)
// from single-sharded (1): this distinction appears in the S3 keys of the tenant.
count: req.tenant_id.shard_count,
// We only import un-sharded or single-sharded tenants, so stripe
// size can be made up arbitrarily here.
stripe_size: ShardParameters::DEFAULT_STRIPE_SIZE,
},
config: req.config.tenant_conf,
})
} else {
None
}
};
let waiters = if let Some(create_req) = maybe_create {
let (create_resp, waiters) = self.do_tenant_create(create_req).await?;
result.shards = create_resp
.shards
.into_iter()
.map(|s| TenantShardLocation {
node_id: s.node_id,
shard_id: s.shard_id,
})
.collect();
waiters
} else {
waiters
};
if let Err(e) = self.await_waiters(waiters, SHORT_RECONCILE_TIMEOUT).await {
@@ -1501,91 +1375,6 @@ impl Service {
Ok(result)
}
pub(crate) async fn tenant_config_set(&self, req: TenantConfigRequest) -> Result<(), ApiError> {
let tenant_id = req.tenant_id;
let config = req.config;
self.persistence
.update_tenant_config(req.tenant_id, config.clone())
.await?;
let waiters = {
let mut waiters = Vec::new();
let mut locked = self.inner.write().unwrap();
let result_tx = locked.result_tx.clone();
let compute_hook = locked.compute_hook.clone();
let (nodes, tenants, _scheduler) = locked.parts_mut();
for (_shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) {
shard.config = config.clone();
if let Some(waiter) = shard.maybe_reconcile(
result_tx.clone(),
nodes,
&compute_hook,
&self.config,
&self.persistence,
&self.gate,
&self.cancel,
) {
waiters.push(waiter);
}
}
waiters
};
if let Err(e) = self.await_waiters(waiters, SHORT_RECONCILE_TIMEOUT).await {
// Treat this as success because we have stored the configuration. If e.g.
// a node was unavailable at this time, it should not stop us accepting a
// configuration change.
tracing::warn!(%tenant_id, "Accepted configuration update but reconciliation failed: {e}");
}
Ok(())
}
pub(crate) fn tenant_config_get(
&self,
tenant_id: TenantId,
) -> Result<HashMap<&str, serde_json::Value>, ApiError> {
let config = {
let locked = self.inner.read().unwrap();
match locked
.tenants
.range(TenantShardId::tenant_range(tenant_id))
.next()
{
Some((_tenant_shard_id, shard)) => shard.config.clone(),
None => {
return Err(ApiError::NotFound(
anyhow::anyhow!("Tenant not found").into(),
))
}
}
};
// Unlike the pageserver, we do not have a set of global defaults: the config is
// entirely per-tenant. Therefore the distinction between `tenant_specific_overrides`
// and `effective_config` in the response is meaningless, but we retain that syntax
// in order to remain compatible with the pageserver API.
let response = HashMap::from([
(
"tenant_specific_overrides",
serde_json::to_value(&config)
.context("serializing tenant specific overrides")
.map_err(ApiError::InternalServerError)?,
),
(
"effective_config",
serde_json::to_value(&config)
.context("serializing effective config")
.map_err(ApiError::InternalServerError)?,
),
]);
Ok(response)
}
pub(crate) async fn tenant_time_travel_remote_storage(
&self,
time_travel_req: &TenantTimeTravelRequest,
@@ -1671,60 +1460,6 @@ impl Service {
})?;
}
}
Ok(())
}
pub(crate) async fn tenant_secondary_download(
&self,
tenant_id: TenantId,
) -> Result<(), ApiError> {
// Acquire lock and yield the collection of shard-node tuples which we will send requests onward to
let targets = {
let locked = self.inner.read().unwrap();
let mut targets = Vec::new();
for (tenant_shard_id, shard) in
locked.tenants.range(TenantShardId::tenant_range(tenant_id))
{
for node_id in shard.intent.get_secondary() {
let node = locked
.nodes
.get(node_id)
.expect("Pageservers may not be deleted while referenced");
targets.push((*tenant_shard_id, node.clone()));
}
}
targets
};
// TODO: this API, and the underlying pageserver API, should take a timeout argument so that for long running
// downloads, they can return a clean 202 response instead of the HTTP client timing out.
// Issue concurrent requests to all shards' locations
let mut futs = FuturesUnordered::new();
for (tenant_shard_id, node) in targets {
let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref());
futs.push(async move {
let result = client.tenant_secondary_download(tenant_shard_id).await;
(result, node)
})
}
// Handle any errors returned by pageservers. This includes cases like this request racing with
// a scheduling operation, such that the tenant shard we're calling doesn't exist on that pageserver any more, as
// well as more general cases like 503s, 500s, or timeouts.
while let Some((result, node)) = futs.next().await {
let Err(e) = result else { continue };
// Secondary downloads are always advisory: if something fails, we nevertheless report success, so that whoever
// is calling us will proceed with whatever migration they're doing, albeit with a slightly less warm cache
// than they had hoped for.
tracing::warn!(
"Ignoring tenant secondary download error from pageserver {}: {e}",
node.id,
);
}
Ok(())
}
@@ -2304,8 +2039,8 @@ impl Service {
// Note: this generation is a placeholder, [`Persistence::begin_shard_split`] will
// populate the correct generation as part of its transaction, to protect us
// against racing with changes in the state of the parent.
generation: None,
generation_pageserver: Some(target.node.id.0 as i64),
generation: 0,
generation_pageserver: target.node.id.0 as i64,
placement_policy: serde_json::to_string(&policy).unwrap(),
// TODO: get the config out of the map
config: serde_json::to_string(&TenantConfig::default()).unwrap(),
@@ -2426,8 +2161,7 @@ impl Service {
.expect("It was present, we just split it");
let old_attached = old_state.intent.get_attached().unwrap();
old_state.intent.clear(scheduler);
let generation = old_state.generation.expect("Shard must have been attached");
(old_attached, generation, old_state.config.clone())
(old_attached, old_state.generation, old_state.config.clone())
};
for child in child_ids {
@@ -2448,7 +2182,7 @@ impl Service {
child_state.observed = ObservedState {
locations: child_observed,
};
child_state.generation = Some(generation);
child_state.generation = generation;
child_state.config = config.clone();
// The child's TenantState::splitting is intentionally left at the default value of Idle,
@@ -2513,7 +2247,6 @@ impl Service {
match shard.policy {
PlacementPolicy::Single => {
shard.intent.clear_secondary(scheduler);
shard.intent.set_attached(scheduler, Some(migrate_req.node_id));
}
PlacementPolicy::Double(_n) => {
// If our new attached node was a secondary, it no longer should be.
@@ -2523,12 +2256,6 @@ impl Service {
if let Some(old_attached) = old_attached {
shard.intent.push_secondary(scheduler, old_attached);
}
shard.intent.set_attached(scheduler, Some(migrate_req.node_id));
}
PlacementPolicy::Secondary => {
shard.intent.clear(scheduler);
shard.intent.push_secondary(scheduler, migrate_req.node_id);
}
PlacementPolicy::Detached => {
return Err(ApiError::BadRequest(anyhow::anyhow!(
@@ -2536,6 +2263,9 @@ impl Service {
)))
}
}
shard
.intent
.set_attached(scheduler, Some(migrate_req.node_id));
tracing::info!("Migrating: new intent {:?}", shard.intent);
shard.sequence = shard.sequence.next();
@@ -2863,7 +2593,7 @@ impl Service {
observed_loc.conf = None;
}
if tenant_state.intent.demote_attached(config_req.node_id) {
if tenant_state.intent.notify_offline(config_req.node_id) {
tenant_state.sequence = tenant_state.sequence.next();
match tenant_state.schedule(scheduler) {
Err(e) => {
@@ -2930,9 +2660,6 @@ impl Service {
/// Helper for methods that will try and call pageserver APIs for
/// a tenant, such as timeline CRUD: they cannot proceed unless the tenant
/// is attached somewhere.
///
/// TODO: this doesn't actually ensure attached unless the PlacementPolicy is
/// an attached policy. We should error out if it isn't.
fn ensure_attached_schedule(
&self,
mut locked: std::sync::RwLockWriteGuard<'_, ServiceState>,

View File

@@ -53,11 +53,8 @@ pub(crate) struct TenantState {
pub(crate) sequence: Sequence,
// Latest generation number: next time we attach, increment this
// and use the incremented number when attaching.
//
// None represents an incompletely onboarded tenant via the [`Service::location_config`]
// API, where this tenant may only run in PlacementPolicy::Secondary.
pub(crate) generation: Option<Generation>,
// and use the incremented number when attaching
pub(crate) generation: Generation,
// High level description of how the tenant should be set up. Provided
// externally.
@@ -184,13 +181,6 @@ impl IntentState {
}
}
/// Remove the last secondary node from the list of secondaries
pub(crate) fn pop_secondary(&mut self, scheduler: &mut Scheduler) {
if let Some(node_id) = self.secondary.pop() {
scheduler.node_dec_ref(node_id);
}
}
pub(crate) fn clear(&mut self, scheduler: &mut Scheduler) {
if let Some(old_attached) = self.attached.take() {
scheduler.node_dec_ref(old_attached);
@@ -218,13 +208,11 @@ impl IntentState {
&self.secondary
}
/// If the node is in use as the attached location, demote it into
/// the list of secondary locations. This is used when a node goes offline,
/// and we want to use a different node for attachment, but not permanently
/// forget the location on the offline node.
/// When a node goes offline, we update intents to avoid using it
/// as their attached pageserver.
///
/// Returns true if a change was made
pub(crate) fn demote_attached(&mut self, node_id: NodeId) -> bool {
pub(crate) fn notify_offline(&mut self, node_id: NodeId) -> bool {
if self.attached == Some(node_id) {
// TODO: when scheduler starts tracking attached + secondary counts separately, we will
// need to call into it here.
@@ -327,7 +315,7 @@ pub(crate) struct ReconcileResult {
pub(crate) result: Result<(), ReconcileError>,
pub(crate) tenant_shard_id: TenantShardId,
pub(crate) generation: Option<Generation>,
pub(crate) generation: Generation,
pub(crate) observed: ObservedState,
/// Set [`TenantState::pending_compute_notification`] from this flag
@@ -352,7 +340,7 @@ impl TenantState {
tenant_shard_id,
policy,
intent: IntentState::default(),
generation: Some(Generation::new(0)),
generation: Generation::new(0),
shard,
observed: ObservedState::default(),
config: TenantConfig::default(),
@@ -450,16 +438,10 @@ impl TenantState {
// more work on the same pageservers we're already using.
let mut modified = false;
// Add/remove nodes to fulfil policy
use PlacementPolicy::*;
match self.policy {
Single => {
// Should have exactly one attached, and zero secondaries
if !self.intent.secondary.is_empty() {
self.intent.clear_secondary(scheduler);
modified = true;
}
let (modified_attached, _attached_node_id) = self.schedule_attached(scheduler)?;
modified |= modified_attached;
@@ -469,23 +451,6 @@ impl TenantState {
}
}
Double(secondary_count) => {
let retain_secondaries = if self.intent.attached.is_none()
&& scheduler.node_preferred(&self.intent.secondary).is_some()
{
// If we have no attached, and one of the secondaries is eligible to be promoted, retain
// one more secondary than we usually would, as one of them will become attached further down this function.
secondary_count + 1
} else {
secondary_count
};
while self.intent.secondary.len() > retain_secondaries {
// We have no particular preference for one secondary location over another: just
// arbitrarily drop from the end
self.intent.pop_secondary(scheduler);
modified = true;
}
// Should have exactly one attached, and N secondaries
let (modified_attached, attached_node_id) = self.schedule_attached(scheduler)?;
modified |= modified_attached;
@@ -498,28 +463,15 @@ impl TenantState {
modified = true;
}
}
Secondary => {
if let Some(node_id) = self.intent.get_attached() {
// Populate secondary by demoting the attached node
self.intent.demote_attached(*node_id);
modified = true;
} else if self.intent.secondary.is_empty() {
// Populate secondary by scheduling a fresh node
let node_id = scheduler.schedule_shard(&[])?;
self.intent.push_secondary(scheduler, node_id);
modified = true;
}
while self.intent.secondary.len() > 1 {
// We have no particular preference for one secondary location over another: just
// arbitrarily drop from the end
self.intent.pop_secondary(scheduler);
modified = true;
}
}
Detached => {
// Never add locations in this mode
if self.intent.get_attached().is_some() || !self.intent.get_secondary().is_empty() {
self.intent.clear(scheduler);
// Should have no attached or secondary pageservers
if self.intent.attached.is_some() {
self.intent.set_attached(scheduler, None);
modified = true;
}
if !self.intent.secondary.is_empty() {
self.intent.clear_secondary(scheduler);
modified = true;
}
}
@@ -566,12 +518,7 @@ impl TenantState {
fn dirty(&self) -> bool {
if let Some(node_id) = self.intent.attached {
// Maybe panic: it is a severe bug if we try to attach while generation is null.
let generation = self
.generation
.expect("Attempted to enter attached state without a generation");
let wanted_conf = attached_location_conf(generation, &self.shard, &self.config);
let wanted_conf = attached_location_conf(self.generation, &self.shard, &self.config);
match self.observed.locations.get(&node_id) {
Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {}
Some(_) | None => {
@@ -649,10 +596,6 @@ impl TenantState {
// Reconcile already in flight for the current sequence?
if let Some(handle) = &self.reconciler {
if handle.sequence == self.sequence {
tracing::info!(
"Reconciliation already in progress for sequence {:?}",
self.sequence,
);
return Some(ReconcilerWaiter {
tenant_shard_id: self.tenant_shard_id,
seq_wait: self.waiter.clone(),
@@ -672,10 +615,6 @@ impl TenantState {
return None;
};
// Advance the sequence before spawning a reconciler, so that sequence waiters
// can distinguish between before+after the reconcile completes.
self.sequence = self.sequence.next();
let reconciler_cancel = cancel.child_token();
let mut reconciler = Reconciler {
tenant_shard_id: self.tenant_shard_id,
@@ -777,17 +716,6 @@ impl TenantState {
})
}
/// Called when a ReconcileResult has been emitted and the service is updating
/// our state: if the result is from a sequence >= my ReconcileHandle, then drop
/// the handle to indicate there is no longer a reconciliation in progress.
pub(crate) fn reconcile_complete(&mut self, sequence: Sequence) {
if let Some(reconcile_handle) = &self.reconciler {
if reconcile_handle.sequence <= sequence {
self.reconciler = None;
}
}
}
// If we had any state at all referring to this node ID, drop it. Does not
// attempt to reschedule.
pub(crate) fn deref_node(&mut self, node_id: NodeId) {
@@ -808,8 +736,13 @@ impl TenantState {
shard_number: self.tenant_shard_id.shard_number.0 as i32,
shard_count: self.tenant_shard_id.shard_count.literal() as i32,
shard_stripe_size: self.shard.stripe_size.0 as i32,
generation: self.generation.map(|g| g.into().unwrap_or(0) as i32),
generation_pageserver: self.intent.get_attached().map(|n| n.0 as i64),
generation: self.generation.into().unwrap_or(0) as i32,
generation_pageserver: self
.intent
.get_attached()
.map(|n| n.0 as i64)
.unwrap_or(i64::MAX),
placement_policy: serde_json::to_string(&self.policy).unwrap(),
config: serde_json::to_string(&self.config).unwrap(),
splitting: SplitState::default(),
@@ -872,10 +805,8 @@ pub(crate) mod tests {
assert_ne!(attached_node_id, secondary_node_id);
// Notifying the attached node is offline should demote it to a secondary
let changed = tenant_state.intent.demote_attached(attached_node_id);
let changed = tenant_state.intent.notify_offline(attached_node_id);
assert!(changed);
assert!(tenant_state.intent.attached.is_none());
assert_eq!(tenant_state.intent.secondary.len(), 2);
// Update the scheduler state to indicate the node is offline
nodes.get_mut(&attached_node_id).unwrap().availability = NodeAvailability::Offline;

View File

@@ -200,7 +200,7 @@ impl AttachmentService {
"localhost",
"-p",
&format!("{}", self.postgres_port),
DB_NAME,
&DB_NAME,
])
.output()
.await

View File

@@ -605,7 +605,7 @@ impl Endpoint {
let conn_str = self.connstr("cloud_admin", "postgres");
println!("Starting postgres node at '{}'", conn_str);
if create_test_user {
let conn_str = self.connstr("test", "neondb");
let conn_str = self.connstr("user", "neondb");
println!("Also at '{}'", conn_str);
}
let mut cmd = Command::new(self.env.neon_distrib_dir.join("compute_ctl"));

View File

@@ -14,6 +14,7 @@ use byteorder::{BigEndian, ReadBytesExt};
use postgres_ffi::BLCKSZ;
use serde::{Deserialize, Serialize};
use serde_with::serde_as;
use strum_macros;
use utils::{
completion,
history_buffer::HistoryBufferWithDropCounter,
@@ -1076,6 +1077,7 @@ impl PagestreamBeMessage {
#[cfg(test)]
mod tests {
use bytes::Buf;
use serde_json::json;
use super::*;

View File

@@ -6,6 +6,7 @@ use crate::{
};
use hex::FromHex;
use serde::{Deserialize, Serialize};
use thiserror;
use utils::id::TenantId;
#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
@@ -655,7 +656,10 @@ fn key_to_shard_number(count: ShardCount, stripe_size: ShardStripeSize, key: &Ke
#[cfg(test)]
mod tests {
use utils::Hex;
use std::str::FromStr;
use bincode;
use utils::{id::TenantId, Hex};
use super::*;

View File

@@ -623,7 +623,9 @@ fn file_exists(file_path: &Utf8Path) -> anyhow::Result<bool> {
mod fs_tests {
use super::*;
use bytes::Bytes;
use camino_tempfile::tempdir;
use futures_util::Stream;
use std::{collections::HashMap, io::Write};
async fn read_and_check_metadata(

View File

@@ -1040,7 +1040,7 @@ mod tests {
Some("test/prefix/"),
Some("/test/prefix/"),
];
let expected_outputs = [
let expected_outputs = vec![
vec!["", "some/path", "some/path"],
vec!["/", "/some/path", "/some/path"],
vec![

View File

@@ -1,6 +1,7 @@
// For details about authentication see docs/authentication.md
use arc_swap::ArcSwap;
use serde;
use std::{borrow::Cow, fmt::Display, fs, sync::Arc};
use anyhow::Result;

View File

@@ -4,9 +4,7 @@ use tokio_util::task::{task_tracker::TaskTrackerToken, TaskTracker};
///
/// Can be cloned, moved and kept around in futures as "guard objects".
#[derive(Clone)]
pub struct Completion {
_token: TaskTrackerToken,
}
pub struct Completion(TaskTrackerToken);
/// Barrier will wait until all clones of [`Completion`] have been dropped.
#[derive(Clone)]
@@ -51,5 +49,5 @@ pub fn channel() -> (Completion, Barrier) {
tracker.close();
let token = tracker.token();
(Completion { _token: token }, Barrier(tracker))
(Completion(token), Barrier(tracker))
}
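The completion/barrier pair above is a thin wrapper over tokio_util's TaskTracker. A minimal usage sketch, assuming a Barrier::wait built on TaskTracker::wait and a tokio runtime with the usual features enabled:

use tokio_util::task::{task_tracker::TaskTrackerToken, TaskTracker};

#[derive(Clone)]
pub struct Completion {
    _token: TaskTrackerToken,
}

#[derive(Clone)]
pub struct Barrier(TaskTracker);

impl Barrier {
    pub async fn wait(self) {
        // Resolves once the tracker is closed and every token has been dropped.
        self.0.wait().await;
    }
}

pub fn channel() -> (Completion, Barrier) {
    let tracker = TaskTracker::new();
    // Close immediately: wait() then only waits for outstanding tokens.
    tracker.close();
    let token = tracker.token();
    (Completion { _token: token }, Barrier(tracker))
}

#[tokio::main]
async fn main() {
    let (completion, barrier) = channel();
    for _ in 0..4 {
        let c = completion.clone();
        tokio::spawn(async move {
            // ... do work ...
            drop(c); // explicit for clarity; dropping at scope end is equivalent
        });
    }
    drop(completion);
    barrier.wait().await; // returns once all clones are gone
}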

View File

@@ -45,7 +45,7 @@ impl Generation {
Self::Broken
}
pub const fn new(v: u32) -> Self {
pub fn new(v: u32) -> Self {
Self::Valid(v)
}

View File

@@ -9,7 +9,7 @@ use metrics::{register_int_counter, Encoder, IntCounter, TextEncoder};
use once_cell::sync::Lazy;
use routerify::ext::RequestExt;
use routerify::{Middleware, RequestInfo, Router, RouterBuilder};
use tracing::{debug, info, info_span, warn, Instrument};
use tracing::{self, debug, info, info_span, warn, Instrument};
use std::future::Future;
use std::str::FromStr;
@@ -156,10 +156,6 @@ pub struct ChannelWriter {
buffer: BytesMut,
pub tx: mpsc::Sender<std::io::Result<Bytes>>,
written: usize,
/// Time spent waiting for the channel to make progress. It is not the same as time to upload a
/// buffer because we cannot know anything about that, but this should allow us to understand
/// the actual time taken without the time spent `std::thread::park`ed.
wait_time: std::time::Duration,
}
impl ChannelWriter {
@@ -172,7 +168,6 @@ impl ChannelWriter {
buffer: BytesMut::with_capacity(buf_len).split_off(buf_len / 2),
tx,
written: 0,
wait_time: std::time::Duration::ZERO,
}
}
@@ -185,8 +180,6 @@ impl ChannelWriter {
tracing::trace!(n, "flushing");
let ready = self.buffer.split().freeze();
let wait_started_at = std::time::Instant::now();
// not ideal to call block_on from blocking code, but we are sure that this
// operation does not spawn_blocking other tasks
let res: Result<(), ()> = tokio::runtime::Handle::current().block_on(async {
@@ -199,9 +192,6 @@ impl ChannelWriter {
// sending it to the client.
Ok(())
});
self.wait_time += wait_started_at.elapsed();
if res.is_err() {
return Err(std::io::ErrorKind::BrokenPipe.into());
}
@@ -212,10 +202,6 @@ impl ChannelWriter {
pub fn flushed_bytes(&self) -> usize {
self.written
}
pub fn wait_time(&self) -> std::time::Duration {
self.wait_time
}
}
impl std::io::Write for ChannelWriter {
@@ -266,52 +252,22 @@ async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<Body
let span = info_span!("blocking");
tokio::task::spawn_blocking(move || {
// there are situations where we lose scraped metrics under load, try to gather some clues
// since all nodes are queried for this, keep the message count low.
let spawned_at = std::time::Instant::now();
let _span = span.entered();
let metrics = metrics::gather();
let gathered_at = std::time::Instant::now();
let res = encoder
.encode(&metrics, &mut writer)
.and_then(|_| writer.flush().map_err(|e| e.into()));
// this instant is not when the full response has finally been sent; sending is done by hyper
// in another task.
let encoded_at = std::time::Instant::now();
let spawned_in = spawned_at - started_at;
let collected_in = gathered_at - spawned_at;
// remove the wait time here in case the tcp connection was clogged
let encoded_in = encoded_at - gathered_at - writer.wait_time();
let total = encoded_at - started_at;
match res {
Ok(()) => {
tracing::info!(
bytes = writer.flushed_bytes(),
total_ms = total.as_millis(),
spawning_ms = spawned_in.as_millis(),
collection_ms = collected_in.as_millis(),
encoding_ms = encoded_in.as_millis(),
elapsed_ms = started_at.elapsed().as_millis(),
"responded /metrics"
);
}
Err(e) => {
// there is a chance that this error is not the BrokenPipe we generate in the writer
// for "closed connection", but it is highly unlikely.
tracing::warn!(
after_bytes = writer.flushed_bytes(),
total_ms = total.as_millis(),
spawning_ms = spawned_in.as_millis(),
collection_ms = collected_in.as_millis(),
encoding_ms = encoded_in.as_millis(),
"failed to write out /metrics response: {e:?}"
);
tracing::warn!("failed to write out /metrics response: {e:#}");
// semantics of this error are quite... unclear. we want to error the stream out to
// abort the response to somehow notify the client that we failed.
//
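The handler above instruments the blocking /metrics path with per-phase timings. A rough sketch of that bookkeeping (the function shape and names are assumed, not the actual handler):

fn timed_encode<M>(
    started_at: std::time::Instant,
    gather: impl FnOnce() -> M,
    encode_and_send: impl FnOnce(M) -> std::time::Duration, // returns time spent waiting on the channel
) {
    let spawned_at = std::time::Instant::now();
    let metrics = gather();
    let gathered_at = std::time::Instant::now();
    let wait_time = encode_and_send(metrics);
    let encoded_at = std::time::Instant::now();

    tracing::info!(
        spawning_ms = (spawned_at - started_at).as_millis(),
        collection_ms = (gathered_at - spawned_at).as_millis(),
        // subtract the channel wait so this approximates pure encoding time
        encoding_ms = (encoded_at - gathered_at).saturating_sub(wait_time).as_millis(),
        total_ms = (encoded_at - started_at).as_millis(),
        "responded /metrics"
    );
}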

View File

@@ -415,6 +415,7 @@ mod tests {
use super::*;
use serde::ser::Serialize;
use serde_assert::{Deserializer, Serializer, Token, Tokens};
#[test]

View File

@@ -1,6 +1,6 @@
#![warn(missing_docs)]
use std::cmp::{Eq, Ordering};
use std::cmp::{Eq, Ordering, PartialOrd};
use std::collections::BinaryHeap;
use std::fmt::Debug;
use std::mem;
@@ -249,6 +249,7 @@ where
mod tests {
use super::*;
use std::sync::Arc;
use std::time::Duration;
impl MonotonicCounter<i32> for i32 {
fn cnt_advance(&mut self, val: i32) {

View File

@@ -221,7 +221,7 @@ impl RcuWaitList {
#[cfg(test)]
mod tests {
use super::*;
use std::sync::Mutex;
use std::sync::{Arc, Mutex};
use std::time::Duration;
#[tokio::test]

View File

@@ -239,6 +239,7 @@ mod tests {
use std::{
convert::Infallible,
pin::{pin, Pin},
sync::atomic::{AtomicUsize, Ordering},
time::Duration,
};

View File

@@ -6,6 +6,7 @@ use futures::future::BoxFuture;
use futures::{Stream, StreamExt};
use itertools::Itertools;
use pin_project_lite::pin_project;
use std::cmp::Ord;
use std::collections::BinaryHeap;
use std::collections::VecDeque;
use std::future::Future;

View File

@@ -20,6 +20,7 @@ use std::num::NonZeroUsize;
use std::str::FromStr;
use std::sync::Arc;
use std::time::Duration;
use toml_edit;
use toml_edit::{Document, Item};
use camino::{Utf8Path, Utf8PathBuf};
@@ -211,9 +212,9 @@ pub struct PageServerConf {
pub log_format: LogFormat,
/// Number of tenants which will be concurrently loaded from remote storage proactively on startup or attach.
///
/// A lower value implicitly deprioritizes loading such tenants, vs. other work in the system.
/// Number of tenants which will be concurrently loaded from remote storage proactively on startup,
/// does not limit tenants loaded in response to client I/O. A lower value implicitly deprioritizes
/// loading such tenants, vs. other work in the system.
pub concurrent_tenant_warmup: ConfigurableSemaphore,
/// Number of concurrent [`Tenant::gather_size_inputs`](crate::tenant::Tenant::gather_size_inputs) allowed.
@@ -1202,7 +1203,10 @@ impl ConfigurableSemaphore {
#[cfg(test)]
mod tests {
use std::{fs, num::NonZeroU32};
use std::{
fs,
num::{NonZeroU32, NonZeroUsize},
};
use camino_tempfile::{tempdir, Utf8TempDir};
use pageserver_api::models::EvictionPolicy;

View File

@@ -1,5 +1,7 @@
use super::*;
use std::collections::HashMap;
use std::time::SystemTime;
use utils::lsn::Lsn;
#[test]
fn startup_collected_timeline_metrics_before_advancing() {

View File

@@ -20,9 +20,10 @@ use remote_storage::{GenericRemoteStorage, RemotePath};
use serde::Deserialize;
use serde::Serialize;
use thiserror::Error;
use tokio;
use tokio_util::sync::CancellationToken;
use tracing::Instrument;
use tracing::{debug, error};
use tracing::{self, debug, error};
use utils::crashsafe::path_with_suffix_extension;
use utils::generation::Generation;
use utils::id::TimelineId;
@@ -725,7 +726,7 @@ mod test {
use camino::Utf8Path;
use hex_literal::hex;
use pageserver_api::shard::ShardIndex;
use std::io::ErrorKind;
use std::{io::ErrorKind, time::Duration};
use tracing::info;
use remote_storage::{RemoteStorageConfig, RemoteStorageKind};
@@ -734,7 +735,10 @@ mod test {
use crate::{
control_plane_client::RetryForeverError,
repository::Key,
tenant::{harness::TenantHarness, storage_layer::DeltaFileName},
tenant::{
harness::TenantHarness, remote_timeline_client::remote_timeline_path,
storage_layer::DeltaFileName,
},
};
use super::*;
@@ -1157,8 +1161,13 @@ mod test {
pub(crate) mod mock {
use tracing::info;
use crate::tenant::remote_timeline_client::remote_layer_path;
use super::*;
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::{
atomic::{AtomicUsize, Ordering},
Arc,
};
pub struct ConsumerState {
rx: tokio::sync::mpsc::UnboundedReceiver<ListWriterQueueMessage>,

View File

@@ -58,7 +58,6 @@ use utils::{completion, id::TimelineId};
use crate::{
config::PageServerConf,
metrics::disk_usage_based_eviction::METRICS,
task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
tenant::{
self,
@@ -66,6 +65,7 @@ use crate::{
remote_timeline_client::LayerFileMetadata,
secondary::SecondaryTenant,
storage_layer::{AsLayerDesc, EvictionError, Layer, LayerFileName},
Timeline,
},
};
@@ -409,23 +409,13 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
"running disk usage based eviction due to pressure"
);
let (candidates, collection_time) = {
let started_at = std::time::Instant::now();
let candidates =
match collect_eviction_candidates(tenant_manager, eviction_order, cancel).await? {
EvictionCandidates::Cancelled => {
return Ok(IterationOutcome::Cancelled);
}
EvictionCandidates::Finished(partitioned) => (partitioned, started_at.elapsed()),
}
};
METRICS.layers_collected.inc_by(candidates.len() as u64);
tracing::info!(
elapsed_ms = collection_time.as_millis(),
total_layers = candidates.len(),
"collection completed"
);
EvictionCandidates::Finished(partitioned) => partitioned,
};
// Debug-log the list of candidates
let now = SystemTime::now();
@@ -456,10 +446,9 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
// the tenant's min-resident-size threshold, print a warning, and memorize the disk
// usage at that point, in 'usage_planned_min_resident_size_respecting'.
let (evicted_amount, usage_planned) =
select_victims(&candidates, usage_pre).into_amount_and_planned();
let selection = select_victims(&candidates, usage_pre);
METRICS.layers_selected.inc_by(evicted_amount as u64);
let (evicted_amount, usage_planned) = selection.into_amount_and_planned();
// phase2: evict layers
@@ -488,15 +477,9 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
if let Some(next) = next {
match next {
Ok(Ok(file_size)) => {
METRICS.layers_evicted.inc();
usage_assumed.add_available_bytes(file_size);
}
Ok(Err((
file_size,
EvictionError::NotFound
| EvictionError::Downloaded
| EvictionError::Timeout,
))) => {
Ok(Err((file_size, EvictionError::NotFound | EvictionError::Downloaded))) => {
evictions_failed.file_sizes += file_size;
evictions_failed.count += 1;
}
@@ -512,10 +495,7 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
// calling again when consumed_all is fine as evicted is fused.
let Some((_partition, candidate)) = evicted.next() else {
if !consumed_all {
tracing::info!("all evictions started, waiting");
consumed_all = true;
}
consumed_all = true;
continue;
};
@@ -523,15 +503,11 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
EvictionLayer::Attached(layer) => {
let file_size = layer.layer_desc().file_size;
js.spawn(async move {
// have a low eviction waiting timeout because our LRU calculations go stale fast;
// also individual layer evictions could hang because of bugs and we do not want to
// pause disk_usage_based_eviction for such.
let timeout = std::time::Duration::from_secs(5);
match layer.evict_and_wait(timeout).await {
Ok(()) => Ok(file_size),
Err(e) => Err((file_size, e)),
}
layer
.evict_and_wait()
.await
.map(|()| file_size)
.map_err(|e| (file_size, e))
});
}
EvictionLayer::Secondary(layer) => {
@@ -553,30 +529,6 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
(usage_assumed, evictions_failed)
};
let started_at = std::time::Instant::now();
let evict_layers = async move {
let mut evict_layers = std::pin::pin!(evict_layers);
let maximum_expected = std::time::Duration::from_secs(10);
let res = tokio::time::timeout(maximum_expected, &mut evict_layers).await;
let tuple = if let Ok(tuple) = res {
tuple
} else {
let elapsed = started_at.elapsed();
tracing::info!(elapsed_ms = elapsed.as_millis(), "still ongoing");
evict_layers.await
};
let elapsed = started_at.elapsed();
tracing::info!(elapsed_ms = elapsed.as_millis(), "completed");
tuple
};
let evict_layers =
evict_layers.instrument(tracing::info_span!("evict_layers", layers=%evicted_amount));
let (usage_assumed, evictions_failed) = tokio::select! {
tuple = evict_layers => { tuple },
_ = cancel.cancelled() => {
@@ -811,8 +763,6 @@ async fn collect_eviction_candidates(
eviction_order: EvictionOrder,
cancel: &CancellationToken,
) -> anyhow::Result<EvictionCandidates> {
const LOG_DURATION_THRESHOLD: std::time::Duration = std::time::Duration::from_secs(10);
// get a snapshot of the list of tenants
let tenants = tenant::mgr::list_tenants()
.await
@@ -841,8 +791,6 @@ async fn collect_eviction_candidates(
continue;
}
let started_at = std::time::Instant::now();
// collect layers from all timelines in this tenant
//
// If one of the timelines becomes `!is_active()` during the iteration,
@@ -857,7 +805,6 @@ async fn collect_eviction_candidates(
}
let info = tl.get_local_layers_for_disk_usage_eviction().await;
debug!(tenant_id=%tl.tenant_shard_id.tenant_id, shard_id=%tl.tenant_shard_id.shard_slug(), timeline_id=%tl.timeline_id, "timeline resident layers count: {}", info.resident_layers.len());
tenant_candidates.extend(info.resident_layers.into_iter());
max_layer_size = max_layer_size.max(info.max_layer_size.unwrap_or(0));
@@ -923,25 +870,7 @@ async fn collect_eviction_candidates(
(partition, candidate)
});
METRICS
.tenant_layer_count
.observe(tenant_candidates.len() as f64);
candidates.extend(tenant_candidates);
let elapsed = started_at.elapsed();
METRICS
.tenant_collection_time
.observe(elapsed.as_secs_f64());
if elapsed > LOG_DURATION_THRESHOLD {
tracing::info!(
tenant_id=%tenant.tenant_shard_id().tenant_id,
shard_id=%tenant.tenant_shard_id().shard_slug(),
elapsed_ms = elapsed.as_millis(),
"collection took longer than threshold"
);
}
}
// Note: the same tenant ID might be hit twice, if it transitions from attached to
@@ -956,11 +885,11 @@ async fn collect_eviction_candidates(
},
);
for tenant in secondary_tenants {
for secondary_tenant in secondary_tenants {
// for secondary tenants we use a sum of on_disk layers and already evicted layers. this is
// to prevent repeated disk usage based evictions from completely draining less often
// updating secondaries.
let (mut layer_info, total_layers) = tenant.get_layers_for_eviction();
let (mut layer_info, total_layers) = secondary_tenant.get_layers_for_eviction();
debug_assert!(
total_layers >= layer_info.resident_layers.len(),
@@ -968,8 +897,6 @@ async fn collect_eviction_candidates(
layer_info.resident_layers.len()
);
let started_at = std::time::Instant::now();
layer_info
.resident_layers
.sort_unstable_by_key(|layer_info| std::cmp::Reverse(layer_info.last_activity_ts));
@@ -991,27 +918,9 @@ async fn collect_eviction_candidates(
)
});
METRICS
.tenant_layer_count
.observe(tenant_candidates.len() as f64);
candidates.extend(tenant_candidates);
tokio::task::yield_now().await;
let elapsed = started_at.elapsed();
METRICS
.tenant_collection_time
.observe(elapsed.as_secs_f64());
if elapsed > LOG_DURATION_THRESHOLD {
tracing::info!(
tenant_id=%tenant.tenant_shard_id().tenant_id,
shard_id=%tenant.tenant_shard_id().shard_slug(),
elapsed_ms = elapsed.as_millis(),
"collection took longer than threshold"
);
}
}
debug_assert!(MinResidentSizePartition::Above < MinResidentSizePartition::Below,
@@ -1088,6 +997,30 @@ impl<U: Usage> VictimSelection<U> {
}
}
struct TimelineKey(Arc<Timeline>);
impl PartialEq for TimelineKey {
fn eq(&self, other: &Self) -> bool {
Arc::ptr_eq(&self.0, &other.0)
}
}
impl Eq for TimelineKey {}
impl std::hash::Hash for TimelineKey {
fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
Arc::as_ptr(&self.0).hash(state);
}
}
impl std::ops::Deref for TimelineKey {
type Target = Timeline;
fn deref(&self) -> &Self::Target {
self.0.as_ref()
}
}
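TimelineKey above keys collections by Arc identity rather than by value. A generic sketch of the same pointer-identity pattern (ArcKey is a name invented here for illustration):

use std::collections::HashSet;
use std::hash::{Hash, Hasher};
use std::sync::Arc;

/// Keys a HashMap/HashSet by Arc identity rather than by the value's Eq/Hash.
struct ArcKey<T>(Arc<T>);

impl<T> PartialEq for ArcKey<T> {
    fn eq(&self, other: &Self) -> bool {
        Arc::ptr_eq(&self.0, &other.0)
    }
}
impl<T> Eq for ArcKey<T> {}
impl<T> Hash for ArcKey<T> {
    fn hash<H: Hasher>(&self, state: &mut H) {
        Arc::as_ptr(&self.0).hash(state);
    }
}

fn main() {
    let a = Arc::new(String::from("timeline"));
    let b = Arc::new(String::from("timeline")); // equal value, different allocation
    let mut set = HashSet::new();
    set.insert(ArcKey(a.clone()));
    assert!(set.contains(&ArcKey(a)));
    assert!(!set.contains(&ArcKey(b))); // identity, not value, decides membership
}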
/// A totally ordered f32 subset we can use with sorting functions.
pub(crate) mod finite_f32 {

View File

@@ -579,12 +579,6 @@ paths:
required: false
schema:
type: integer
- name: lazy
in: query
required: false
schema:
type: boolean
description: Set to true for attaches to queue up until activated by compute. Eager (false) is the default.
put:
description: |
Configures a _tenant location_, that is how a particular pageserver handles

View File

@@ -816,7 +816,13 @@ async fn tenant_attach_handler(
let tenant = state
.tenant_manager
.upsert_location(tenant_shard_id, location_conf, None, SpawnMode::Eager, &ctx)
.upsert_location(
tenant_shard_id,
location_conf,
None,
SpawnMode::Normal,
&ctx,
)
.await?;
let Some(tenant) = tenant else {
@@ -1412,7 +1418,6 @@ async fn put_tenant_location_config_handler(
let request_data: TenantLocationConfigRequest = json_request(&mut request).await?;
let flush = parse_query_param(&request, "flush_ms")?.map(Duration::from_millis);
let lazy = parse_query_param(&request, "lazy")?.unwrap_or(false);
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
@@ -1443,17 +1448,15 @@ async fn put_tenant_location_config_handler(
let location_conf =
LocationConf::try_from(&request_data.config).map_err(ApiError::BadRequest)?;
// lazy==true queues up for activation or jumps the queue like normal when a compute connects,
// similar to the at-startup ordering.
let spawn_mode = if lazy {
tenant::SpawnMode::Lazy
} else {
tenant::SpawnMode::Eager
};
let attached = state
.tenant_manager
.upsert_location(tenant_shard_id, location_conf, flush, spawn_mode, &ctx)
.upsert_location(
tenant_shard_id,
location_conf,
flush,
tenant::SpawnMode::Normal,
&ctx,
)
.await?
.is_some();

View File

@@ -1915,16 +1915,17 @@ impl Drop for TimelineMetrics {
let tenant_id = &self.tenant_id;
let timeline_id = &self.timeline_id;
let shard_id = &self.shard_id;
let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, shard_id, timeline_id]);
let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
{
RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get());
let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
let _ =
RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
}
let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
if let Some(metric) = Lazy::get(&DIRECTORY_ENTRIES_COUNT) {
let _ = metric.remove_label_values(&[tenant_id, shard_id, timeline_id]);
let _ = metric.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
}
let _ = EVICTIONS.remove_label_values(&[tenant_id, shard_id, timeline_id]);
let _ = EVICTIONS.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
self.evictions_with_low_residence_duration
.write()
@@ -2473,64 +2474,6 @@ pub(crate) mod tenant_throttling {
}
}
pub(crate) mod disk_usage_based_eviction {
use super::*;
pub(crate) struct Metrics {
pub(crate) tenant_collection_time: Histogram,
pub(crate) tenant_layer_count: Histogram,
pub(crate) layers_collected: IntCounter,
pub(crate) layers_selected: IntCounter,
pub(crate) layers_evicted: IntCounter,
}
impl Default for Metrics {
fn default() -> Self {
let tenant_collection_time = register_histogram!(
"pageserver_disk_usage_based_eviction_tenant_collection_seconds",
"Time spent collecting layers from a tenant -- not normalized by collected layer amount",
vec![0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0]
)
.unwrap();
let tenant_layer_count = register_histogram!(
"pageserver_disk_usage_based_eviction_tenant_collected_layers",
"Amount of layers gathered from a tenant",
vec![5.0, 50.0, 500.0, 5000.0, 50000.0]
)
.unwrap();
let layers_collected = register_int_counter!(
"pageserver_disk_usage_based_eviction_collected_layers_total",
"Amount of layers collected"
)
.unwrap();
let layers_selected = register_int_counter!(
"pageserver_disk_usage_based_eviction_select_layers_total",
"Amount of layers selected"
)
.unwrap();
let layers_evicted = register_int_counter!(
"pageserver_disk_usage_based_eviction_evicted_layers_total",
"Amount of layers successfully evicted"
)
.unwrap();
Self {
tenant_collection_time,
tenant_layer_count,
layers_collected,
layers_selected,
layers_evicted,
}
}
}
pub(crate) static METRICS: Lazy<Metrics> = Lazy::new(Metrics::default);
}
pub fn preinitialize_metrics() {
// Python tests need these and on some we do alerting.
//
@@ -2565,7 +2508,6 @@ pub fn preinitialize_metrics() {
Lazy::force(&TENANT_MANAGER);
Lazy::force(&crate::tenant::storage_layer::layer::LAYER_IMPL_METRICS);
Lazy::force(&disk_usage_based_eviction::METRICS);
// countervecs
[&BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT]

View File

@@ -73,6 +73,7 @@
use std::{
collections::{hash_map::Entry, HashMap},
convert::TryInto,
sync::{
atomic::{AtomicU64, AtomicU8, AtomicUsize, Ordering},
Arc, Weak,
@@ -261,9 +262,7 @@ pub struct PageCache {
size_metrics: &'static PageCacheSizeMetrics,
}
struct PinnedSlotsPermit {
_permit: tokio::sync::OwnedSemaphorePermit,
}
struct PinnedSlotsPermit(tokio::sync::OwnedSemaphorePermit);
///
/// PageReadGuard is a "lease" on a buffer, for reading. The page is kept locked
@@ -559,9 +558,9 @@ impl PageCache {
)
.await
{
Ok(res) => Ok(PinnedSlotsPermit {
_permit: res.expect("this semaphore is never closed"),
}),
Ok(res) => Ok(PinnedSlotsPermit(
res.expect("this semaphore is never closed"),
)),
Err(_timeout) => {
crate::metrics::page_cache_errors_inc(
crate::metrics::PageCacheErrorKind::AcquirePinnedSlotTimeout,
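The page-cache hunk above wraps a semaphore permit and bounds the wait for it; the hunk bumps a page-cache error counter when the wait times out. A minimal sketch of that acquire-with-timeout shape, with the names and error type assumed:

use std::sync::Arc;
use tokio::sync::{OwnedSemaphorePermit, Semaphore};
use tokio::time::{timeout, Duration};

struct PinnedSlotsPermit(OwnedSemaphorePermit);

#[derive(Debug)]
struct AcquireTimeout;

async fn try_pin_slot(sem: Arc<Semaphore>) -> Result<PinnedSlotsPermit, AcquireTimeout> {
    match timeout(Duration::from_secs(10), sem.acquire_owned()).await {
        Ok(res) => Ok(PinnedSlotsPermit(res.expect("this semaphore is never closed"))),
        Err(_elapsed) => {
            // a real caller would increment its "pinned slot acquire timed out" metric here
            Err(AcquireTimeout)
        }
    }
}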

View File

@@ -27,7 +27,7 @@ use pageserver_api::models::{
};
use pageserver_api::shard::ShardIndex;
use pageserver_api::shard::ShardNumber;
use postgres_backend::{is_expected_io_error, AuthType, PostgresBackend, QueryError};
use postgres_backend::{self, is_expected_io_error, AuthType, PostgresBackend, QueryError};
use pq_proto::framed::ConnectionError;
use pq_proto::FeStartupPacket;
use pq_proto::{BeMessage, FeMessage, RowDescriptor};
@@ -44,6 +44,7 @@ use tokio::io::AsyncWriteExt;
use tokio::io::{AsyncRead, AsyncWrite};
use tokio_util::io::StreamReader;
use tokio_util::sync::CancellationToken;
use tracing::field;
use tracing::*;
use utils::id::ConnectionId;
use utils::sync::gate::GateGuard;
@@ -1114,10 +1115,7 @@ impl PageServerHandler {
ctx: &RequestContext,
) -> Result<PagestreamBeMessage, PageStreamError> {
let timeline = match self.get_cached_timeline_for_page(req) {
Ok(tl) => {
set_tracing_field_shard_id(tl);
tl
}
Ok(tl) => tl,
Err(key) => {
match self
.load_timeline_for_page(tenant_id, timeline_id, key)
@@ -1142,6 +1140,9 @@ impl PageServerHandler {
}
};
// load_timeline_for_page sets shard_id, but get_cached_timeline_for_page doesn't
set_tracing_field_shard_id(timeline);
let _timer = timeline
.query_metrics
.start_timer(metrics::SmgrQueryType::GetPageAtLsn);

View File

@@ -37,6 +37,7 @@ impl Value {
mod test {
use super::*;
use bytes::Bytes;
use utils::bin_ser::BeSer;
macro_rules! roundtrip {

View File

@@ -109,6 +109,7 @@ pub use pageserver_api::models::TenantState;
use tokio::sync::Semaphore;
static INIT_DB_SEMAPHORE: Lazy<Semaphore> = Lazy::new(|| Semaphore::new(8));
use toml_edit;
use utils::{
crashsafe,
generation::Generation,
@@ -226,11 +227,7 @@ pub(crate) struct TenantPreload {
/// When we spawn a tenant, there is a special mode for tenant creation that
/// avoids trying to read anything from remote storage.
pub(crate) enum SpawnMode {
/// Activate as soon as possible
Eager,
/// Lazy activation in the background, with the option to skip the queue if the need comes up
Lazy,
/// Tenant has been created during the lifetime of this process
Normal,
Create,
}
@@ -703,37 +700,41 @@ impl Tenant {
.and_then(|x| x.initial_tenant_load_remote.take());
enum AttachType<'a> {
/// We are attaching this tenant lazily in the background.
Warmup {
_permit: tokio::sync::SemaphorePermit<'a>,
during_startup: bool
},
/// We are attaching this tenant as soon as we can, because for example an
/// endpoint tried to access it.
// During pageserver startup, we are attaching this tenant lazily in the background
Warmup(tokio::sync::SemaphorePermit<'a>),
// During pageserver startup, we are attaching this tenant as soon as we can,
// because a client tried to access it.
OnDemand,
/// During normal operations after startup, we are attaching a tenant, and
/// eager attach was requested.
// During normal operations after startup, we are attaching a tenant.
Normal,
}
let attach_type = if matches!(mode, SpawnMode::Lazy) {
// Before doing any I/O, wait for at least one of:
// - A client attempting to access this tenant (on-demand loading)
// - A permit becoming available in the warmup semaphore (background warmup)
// Before doing any I/O, wait for either of:
// - A client to attempt to access this tenant (on-demand loading)
// - A permit to become available in the warmup semaphore (background warmup)
//
// Some-ness of init_order is how we know if we're attaching during startup or later
// in process lifetime.
let attach_type = if init_order.is_some() {
tokio::select!(
permit = tenant_clone.activate_now_sem.acquire() => {
let _ = permit.expect("activate_now_sem is never closed");
_ = tenant_clone.activate_now_sem.acquire() => {
tracing::info!("Activating tenant (on-demand)");
AttachType::OnDemand
},
permit = conf.concurrent_tenant_warmup.inner().acquire() => {
let _permit = permit.expect("concurrent_tenant_warmup semaphore is never closed");
tracing::info!("Activating tenant (warmup)");
AttachType::Warmup {
_permit,
during_startup: init_order.is_some()
permit_result = conf.concurrent_tenant_warmup.inner().acquire() => {
match permit_result {
Ok(p) => {
tracing::info!("Activating tenant (warmup)");
AttachType::Warmup(p)
}
Err(_) => {
// This is unexpected: the warmup semaphore should stay alive
// for the lifetime of init_order. Log a warning and proceed.
tracing::warn!("warmup_limit semaphore unexpectedly closed");
AttachType::Normal
}
}
}
_ = tenant_clone.cancel.cancelled() => {
// This is safe, but should be pretty rare: it is interesting if a tenant
@@ -748,8 +749,6 @@ impl Tenant {
},
)
} else {
// SpawnMode::{Create,Eager} always cause jumping ahead of the
// concurrent_tenant_warmup queue
AttachType::Normal
};
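Both variants above race on-demand activation against a bounded warmup semaphore. A minimal sketch of that select, with the semaphore roles assumed from the surrounding comments:

use tokio::sync::{Semaphore, SemaphorePermit};

enum AttachType<'a> {
    OnDemand,
    Warmup { _permit: SemaphorePermit<'a> },
}

async fn wait_for_activation<'a>(
    activate_now: &'a Semaphore, // a permit is assumed to be added when a client needs the tenant
    warmup_limit: &'a Semaphore, // bounds how many tenants warm up concurrently
) -> AttachType<'a> {
    tokio::select! {
        permit = activate_now.acquire() => {
            let _ = permit.expect("never closed");
            AttachType::OnDemand
        }
        permit = warmup_limit.acquire() => {
            // Hold the permit for the duration of the warmup so the limit is enforced.
            AttachType::Warmup { _permit: permit.expect("never closed") }
        }
    }
}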
@@ -757,7 +756,7 @@ impl Tenant {
(SpawnMode::Create, _) => {
None
},
(SpawnMode::Eager | SpawnMode::Lazy, Some(remote_storage)) => {
(SpawnMode::Normal, Some(remote_storage)) => {
let _preload_timer = TENANT.preload.start_timer();
let res = tenant_clone
.preload(remote_storage, task_mgr::shutdown_token())
@@ -770,7 +769,7 @@ impl Tenant {
}
}
}
(_, None) => {
(SpawnMode::Normal, None) => {
let _preload_timer = TENANT.preload.start_timer();
None
}
@@ -829,7 +828,7 @@ impl Tenant {
let attached = {
let _attach_timer = match mode {
SpawnMode::Create => None,
SpawnMode::Eager | SpawnMode::Lazy => Some(TENANT.attach.start_timer()),
SpawnMode::Normal => {Some(TENANT.attach.start_timer())}
};
tenant_clone.attach(preload, mode, &ctx).await
};
@@ -851,7 +850,7 @@ impl Tenant {
// It also prevents the warmup process from competing with the concurrency limit on
// logical size calculations: if logical size calculation semaphore is saturated,
// then warmup will wait for that before proceeding to the next tenant.
if matches!(attach_type, AttachType::Warmup { during_startup: true, .. }) {
if let AttachType::Warmup(_permit) = attach_type {
let mut futs: FuturesUnordered<_> = tenant_clone.timelines.lock().unwrap().values().cloned().map(|t| t.await_initial_logical_size()).collect();
tracing::info!("Waiting for initial logical sizes while warming up...");
while futs.next().await.is_some() {}
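The warmup path above drains a FuturesUnordered so the permit is held until every timeline has reported an initial logical size. The general shape, as a small sketch:

use futures::stream::{FuturesUnordered, StreamExt};

async fn wait_all<F: std::future::Future<Output = ()>>(futs: impl IntoIterator<Item = F>) {
    let mut futs: FuturesUnordered<F> = futs.into_iter().collect();
    // Polls all futures concurrently; the loop ends once every one has completed.
    while futs.next().await.is_some() {}
}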
@@ -924,7 +923,7 @@ impl Tenant {
deleting: false,
timelines: HashMap::new(),
},
(None, _) => {
(None, SpawnMode::Normal) => {
anyhow::bail!("local-only deployment is no longer supported, https://github.com/neondatabase/neon/issues/5624");
}
};
@@ -2383,7 +2382,7 @@ impl Tenant {
self.tenant_shard_id,
self.generation,
self.shard_identity,
self.walredo_mgr.clone(),
self.walredo_mgr.as_ref().map(Arc::clone),
resources,
pg_version,
state,
@@ -3592,18 +3591,25 @@ pub async fn dump_layerfile_from_path(
#[cfg(test)]
pub(crate) mod harness {
use bytes::{Bytes, BytesMut};
use camino::Utf8PathBuf;
use once_cell::sync::OnceCell;
use pageserver_api::models::ShardParameters;
use pageserver_api::shard::ShardIndex;
use std::fs;
use std::sync::Arc;
use utils::logging;
use utils::lsn::Lsn;
use crate::deletion_queue::mock::MockDeletionQueue;
use crate::walredo::apply_neon;
use crate::{repository::Key, walrecord::NeonWalRecord};
use crate::{
config::PageServerConf, repository::Key, tenant::Tenant, walrecord::NeonWalRecord,
};
use super::*;
use crate::tenant::config::{TenantConf, TenantConfOpt};
use hex_literal::hex;
use utils::id::TenantId;
use utils::id::{TenantId, TimelineId};
pub const TIMELINE_ID: TimelineId =
TimelineId::from_array(hex!("11223344556677881122334455667788"));
@@ -3763,7 +3769,7 @@ pub(crate) mod harness {
let preload = tenant
.preload(&self.remote_storage, CancellationToken::new())
.await?;
tenant.attach(Some(preload), SpawnMode::Eager, ctx).await?;
tenant.attach(Some(preload), SpawnMode::Normal, ctx).await?;
tenant.state.send_replace(TenantState::Active);
for timeline in tenant.timelines.lock().unwrap().values() {
@@ -3832,8 +3838,10 @@ mod tests {
use crate::DEFAULT_PG_VERSION;
use bytes::BytesMut;
use hex_literal::hex;
use once_cell::sync::Lazy;
use pageserver_api::keyspace::KeySpace;
use rand::{thread_rng, Rng};
use tokio_util::sync::CancellationToken;
static TEST_KEY: Lazy<Key> =
Lazy::new(|| Key::from_slice(&hex!("010000000033333333444444445500000001")));

View File

@@ -52,10 +52,7 @@ pub mod defaults {
pub const DEFAULT_PITR_INTERVAL: &str = "7 days";
pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "10 seconds";
pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "10 seconds";
// The default limit on WAL lag should be set to avoid causing disconnects under high throughput
// scenarios: since the broker stats are updated ~1/s, a value of 1GiB should be sufficient for
// throughputs up to 1GiB/s per timeline.
pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 1024 * 1024 * 1024;
pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 10 * 1024 * 1024;
pub const DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD: &str = "24 hour";
pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;
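The two values of the lag default above differ by roughly two orders of magnitude; the larger one is sized from the broker update rate. A back-of-envelope check, using the throughput figure assumed in the comment:

// Figures taken from the comment above; both are assumptions of the sizing argument.
const BROKER_STATS_INTERVAL_SECS: u64 = 1; // broker stats refresh roughly once per second
const ASSUMED_PEAK_WAL_THROUGHPUT: u64 = 1024 * 1024 * 1024; // 1 GiB/s per timeline
const REQUIRED_LAG_HEADROOM: u64 = ASSUMED_PEAK_WAL_THROUGHPUT * BROKER_STATS_INTERVAL_SECS;
// REQUIRED_LAG_HEADROOM == 1 GiB, matching the larger DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG above.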

View File

@@ -420,7 +420,7 @@ impl DeleteTenantFlow {
.expect("cant be stopping or broken");
tenant
.attach(preload, super::SpawnMode::Eager, ctx)
.attach(preload, super::SpawnMode::Normal, ctx)
.await
.context("attach")?;

View File

@@ -21,6 +21,7 @@
use byteorder::{ReadBytesExt, BE};
use bytes::{BufMut, Bytes, BytesMut};
use either::Either;
use hex;
use std::{cmp::Ordering, io, result};
use thiserror::Error;
use tracing::error;
@@ -699,6 +700,8 @@ impl<const L: usize> BuildNode<L> {
#[cfg(test)]
pub(crate) mod tests {
use super::*;
use crate::context::DownloadBehavior;
use crate::task_mgr::TaskKind;
use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReaderRef};
use rand::Rng;
use std::collections::BTreeMap;

View File

@@ -300,7 +300,7 @@ mod tests {
use super::*;
use crate::context::DownloadBehavior;
use crate::task_mgr::TaskKind;
use crate::tenant::block_io::BlockReaderRef;
use crate::tenant::block_io::{BlockCursor, BlockReaderRef};
use rand::{thread_rng, RngCore};
use std::fs;
use std::str::FromStr;

View File

@@ -595,7 +595,7 @@ pub async fn init_tenant_mgr(
shard_identity,
Some(init_order.clone()),
&TENANTS,
SpawnMode::Lazy,
SpawnMode::Normal,
&ctx,
) {
Ok(tenant) => {
@@ -1106,9 +1106,9 @@ impl TenantManager {
// Edge case: if we were called with SpawnMode::Create, but a Tenant already existed, then
// the caller thinks they're creating but the tenant already existed. We must switch to
// Eager mode so that when starting this Tenant we properly probe remote storage for timelines,
// Normal mode so that when starting this Tenant we properly probe remote storage for timelines,
// rather than assuming it to be empty.
spawn_mode = SpawnMode::Eager;
spawn_mode = SpawnMode::Normal;
}
Some(TenantSlot::Secondary(state)) => {
info!("Shutting down secondary tenant");
@@ -1300,7 +1300,7 @@ impl TenantManager {
shard_identity,
None,
self.tenants,
SpawnMode::Eager,
SpawnMode::Normal,
ctx,
)?;
@@ -1521,7 +1521,7 @@ impl TenantManager {
*child_shard,
child_location_conf,
None,
SpawnMode::Eager,
SpawnMode::Normal,
ctx,
)
.await?;
@@ -2064,7 +2064,7 @@ pub(crate) async fn load_tenant(
shard_identity,
None,
&TENANTS,
SpawnMode::Eager,
SpawnMode::Normal,
ctx,
)
.with_context(|| format!("Failed to schedule tenant processing in path {tenant_path:?}"))?;
@@ -2648,7 +2648,7 @@ pub(crate) async fn immediate_gc(
let tenant = guard
.get(&tenant_shard_id)
.cloned()
.map(Arc::clone)
.with_context(|| format!("tenant {tenant_shard_id}"))
.map_err(|e| ApiError::NotFound(e.into()))?;

View File

@@ -1791,12 +1791,14 @@ mod tests {
context::RequestContext,
tenant::{
harness::{TenantHarness, TIMELINE_ID},
Tenant, Timeline,
storage_layer::Layer,
Generation, Tenant, Timeline,
},
DEFAULT_PG_VERSION,
};
use std::collections::HashSet;
use utils::lsn::Lsn;
pub(super) fn dummy_contents(name: &str) -> Vec<u8> {
format!("contents for {name}").into()

View File

@@ -161,7 +161,7 @@ pub async fn download_layer_file<'a>(
const TEMP_DOWNLOAD_EXTENSION: &str = "temp_download";
pub(crate) fn is_temp_download_file(path: &Utf8Path) -> bool {
pub fn is_temp_download_file(path: &Utf8Path) -> bool {
let extension = path.extension();
match extension {
Some(TEMP_DOWNLOAD_EXTENSION) => true,

View File

@@ -32,7 +32,7 @@ use remote_storage::GenericRemoteStorage;
use tokio_util::sync::CancellationToken;
use tracing::instrument;
use utils::{completion::Barrier, id::TimelineId, sync::gate::Gate};
use utils::{completion::Barrier, fs_ext, id::TimelineId, sync::gate::Gate};
enum DownloadCommand {
Download(TenantShardId),
@@ -121,10 +121,6 @@ impl SecondaryTenant {
})
}
pub(crate) fn tenant_shard_id(&self) -> TenantShardId {
self.tenant_shard_id
}
pub(crate) async fn shutdown(&self) {
self.cancel.cancel();
@@ -168,17 +164,16 @@ impl SecondaryTenant {
self.detail.lock().unwrap().get_layers_for_eviction(self)
}
/// Cancellation safe, but on cancellation the eviction will go through
#[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%timeline_id, name=%name))]
pub(crate) async fn evict_layer(
self: &Arc<Self>,
&self,
conf: &PageServerConf,
timeline_id: TimelineId,
name: LayerFileName,
) {
debug_assert_current_span_has_tenant_id();
let guard = match self.gate.enter() {
let _guard = match self.gate.enter() {
Ok(g) => g,
Err(_) => {
tracing::debug!("Dropping layer evictions, secondary tenant shutting down",);
@@ -192,57 +187,35 @@ impl SecondaryTenant {
.timeline_path(&self.tenant_shard_id, &timeline_id)
.join(name.file_name());
let this = self.clone();
// We tolerate ENOENT, because between planning eviction and executing
// it, the secondary downloader could have seen an updated heatmap that
// resulted in a layer being deleted.
// Other local I/O errors are process-fatal: these should never happen.
tokio::fs::remove_file(path)
.await
.or_else(fs_ext::ignore_not_found)
.fatal_err("Deleting layer during eviction");
// spawn it to be cancellation safe
tokio::task::spawn_blocking(move || {
let _guard = guard;
// We tolerate ENOENT, because between planning eviction and executing
// it, the secondary downloader could have seen an updated heatmap that
// resulted in a layer being deleted.
// Other local I/O errors are process-fatal: these should never happen.
let deleted = std::fs::remove_file(path);
let not_found = deleted
.as_ref()
.is_err_and(|x| x.kind() == std::io::ErrorKind::NotFound);
let deleted = if not_found {
false
} else {
deleted
.map(|()| true)
.fatal_err("Deleting layer during eviction")
};
if !deleted {
// skip updating accounting and putting perhaps later timestamp
return;
}
// Update the timeline's state. This does not have to be synchronized with
// the download process, because:
// - If downloader is racing with us to remove a file (e.g. because it is
// removed from heatmap), then our mutual .remove() operations will both
// succeed.
// - If downloader is racing with us to download the object (this would require
// multiple eviction iterations to race with multiple download iterations), then
// if we remove it from the state, the worst that happens is the downloader
// downloads it again before re-inserting, or we delete the file but it remains
// in the state map (in which case it will be downloaded if this secondary
// tenant transitions to attached and tries to access it)
//
// The important assumption here is that the secondary timeline state does not
// have to 100% match what is on disk, because it's a best-effort warming
// of the cache.
let mut detail = this.detail.lock().unwrap();
if let Some(timeline_detail) = detail.timelines.get_mut(&timeline_id) {
timeline_detail.on_disk_layers.remove(&name);
timeline_detail.evicted_at.insert(name, now);
}
})
.await
.expect("secondary eviction should not have panicked");
// Update the timeline's state. This does not have to be synchronized with
// the download process, because:
// - If downloader is racing with us to remove a file (e.g. because it is
// removed from heatmap), then our mutual .remove() operations will both
// succeed.
// - If downloader is racing with us to download the object (this would require
// multiple eviction iterations to race with multiple download iterations), then
// if we remove it from the state, the worst that happens is the downloader
// downloads it again before re-inserting, or we delete the file but it remains
// in the state map (in which case it will be downloaded if this secondary
// tenant transitions to attached and tries to access it)
//
// The important assumption here is that the secondary timeline state does not
// have to 100% match what is on disk, because it's a best-effort warming
// of the cache.
let mut detail = self.detail.lock().unwrap();
if let Some(timeline_detail) = detail.timelines.get_mut(&timeline_id) {
timeline_detail.on_disk_layers.remove(&name);
timeline_detail.evicted_at.insert(name, now);
}
}
}
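Both versions of evict_layer above treat a missing file as acceptable, since a racing downloader may already have removed it. A minimal sketch of that idiom (ignore_not_found mirrors the fs_ext helper referenced in the imports; tokio's fs feature is assumed):

use std::io;

fn ignore_not_found(e: io::Error) -> io::Result<()> {
    if e.kind() == io::ErrorKind::NotFound {
        Ok(())
    } else {
        Err(e)
    }
}

async fn evict_file(path: &std::path::Path) -> io::Result<()> {
    // A racing downloader may have already removed (or replaced) the file;
    // a missing file is not an error for best-effort cache eviction.
    tokio::fs::remove_file(path).await.or_else(ignore_not_found)
}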

View File

@@ -16,8 +16,7 @@ use crate::{
config::SecondaryLocationConfig,
debug_assert_current_span_has_tenant_and_timeline_id,
remote_timeline_client::{
index::LayerFileMetadata, is_temp_download_file, FAILED_DOWNLOAD_WARN_THRESHOLD,
FAILED_REMOTE_OP_RETRIES,
index::LayerFileMetadata, FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES,
},
span::debug_assert_current_span_has_tenant_id,
storage_layer::LayerFileName,
@@ -789,7 +788,7 @@ async fn init_timeline_state(
// Secondary mode doesn't use local metadata files, but they might have been left behind by an attached tenant.
warn!(path=?dentry.path(), "found legacy metadata file, these should have been removed in load_tenant_config");
continue;
} else if crate::is_temporary(&file_path) || is_temp_download_file(&file_path) {
} else if crate::is_temporary(&file_path) {
// Temporary files are frequently left behind from restarting during downloads
tracing::info!("Cleaning up temporary file {file_path}");
if let Err(e) = tokio::fs::remove_file(&file_path)

View File

@@ -18,6 +18,7 @@ use crate::{
};
use futures::Future;
use md5;
use pageserver_api::shard::TenantShardId;
use rand::Rng;
use remote_storage::{GenericRemoteStorage, TimeoutOrCancel};

View File

@@ -72,7 +72,7 @@ where
/// the same ValueReconstructState struct in the next 'get_value_reconstruct_data'
/// call, to collect more records.
///
#[derive(Debug, Default)]
#[derive(Debug)]
pub struct ValueReconstructState {
pub records: Vec<(Lsn, NeonWalRecord)>,
pub img: Option<(Lsn, Bytes)>,

View File

@@ -43,6 +43,7 @@ use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX};
use anyhow::{anyhow, bail, ensure, Context, Result};
use bytes::{Bytes, BytesMut};
use camino::{Utf8Path, Utf8PathBuf};
use hex;
use pageserver_api::keyspace::KeySpace;
use pageserver_api::models::LayerAccessKind;
use pageserver_api::shard::TenantShardId;

View File

@@ -8,7 +8,7 @@ use pageserver_api::shard::ShardIndex;
use std::ops::Range;
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
use std::sync::{Arc, Weak};
use std::time::{Duration, SystemTime};
use std::time::SystemTime;
use tracing::Instrument;
use utils::lsn::Lsn;
use utils::sync::heavier_once_cell;
@@ -208,15 +208,10 @@ impl Layer {
/// If, due to bad luck or a blocked executor, we miss the actual eviction and the layer is
/// re-downloaded, [`EvictionError::Downloaded`] is returned.
///
/// Timeout is mandatory, because waiting for eviction is only needed for our tests; eviction
/// will happen regardless of whether the future returned by this method completes, unless there is a
/// read access (currently including [`Layer::keep_resident`]) before eviction gets to
/// complete.
///
/// Technically cancellation safe, but cancelling might shift which generation of the
/// download-evict cycle is observed on retry.
pub(crate) async fn evict_and_wait(&self, timeout: Duration) -> Result<(), EvictionError> {
self.0.evict_and_wait(timeout).await
pub(crate) async fn evict_and_wait(&self) -> Result<(), EvictionError> {
self.0.evict_and_wait().await
}
/// Delete the layer file when the `self` gets dropped, also try to schedule a remote index upload
@@ -368,7 +363,7 @@ impl Layer {
///
/// Does not start local deletion, use [`Self::delete_on_drop`] for that
/// separately.
#[cfg(any(feature = "testing", test))]
#[cfg(feature = "testing")]
pub(crate) fn wait_drop(&self) -> impl std::future::Future<Output = ()> + 'static {
let mut rx = self.0.status.subscribe();
@@ -637,7 +632,7 @@ impl LayerInner {
/// Cancellation safe, however dropping the future and calling this method again might result
/// in a new attempt to evict OR join the previously started attempt.
pub(crate) async fn evict_and_wait(&self, timeout: Duration) -> Result<(), EvictionError> {
pub(crate) async fn evict_and_wait(&self) -> Result<(), EvictionError> {
use tokio::sync::broadcast::error::RecvError;
assert!(self.have_remote_client);
@@ -657,22 +652,16 @@ impl LayerInner {
if strong.is_some() {
// drop the DownloadedLayer outside of the holding the guard
drop(strong);
// idea here is that only one evicter should ever get to witness a strong reference,
// which means whenever get_or_maybe_download upgrades a weak, it must mark up a
// cancelled eviction and signal us, like it currently does.
//
// a second concurrent evict_and_wait will not see a strong reference.
LAYER_IMPL_METRICS.inc_started_evictions();
}
match tokio::time::timeout(timeout, rx.recv()).await {
Ok(Ok(Status::Evicted)) => Ok(()),
Ok(Ok(Status::Downloaded)) => Err(EvictionError::Downloaded),
Ok(Err(RecvError::Closed)) => {
match rx.recv().await {
Ok(Status::Evicted) => Ok(()),
Ok(Status::Downloaded) => Err(EvictionError::Downloaded),
Err(RecvError::Closed) => {
unreachable!("sender cannot be dropped while we are in &self method")
}
Ok(Err(RecvError::Lagged(_))) => {
Err(RecvError::Lagged(_)) => {
// this is quite unlikely, but we are blocking a lot in the async context, so
// we might be missing this because we are stuck on a LIFO slot on a thread
// which is busy blocking for a 1TB database create_image_layers.
@@ -685,7 +674,6 @@ impl LayerInner {
None => Ok(()),
}
}
Err(_timeout) => Err(EvictionError::Timeout),
}
}
@@ -1207,9 +1195,6 @@ pub(crate) enum EvictionError {
/// Evictions must always lose to downloads in races, and this time it happened.
#[error("layer was downloaded instead")]
Downloaded,
#[error("eviction did not happen within timeout")]
Timeout,
}
/// Error internal to the [`LayerInner::get_or_maybe_download`]
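The evict_and_wait changes above bound the wait on the status channel and surface a dedicated timeout error. A condensed sketch of that shape, with the types redeclared here for illustration:

use tokio::sync::broadcast;
use tokio::time::{timeout, Duration};

#[derive(Clone)]
enum Status { Evicted, Downloaded }

enum EvictionError { NotFound, Downloaded, Timeout }

async fn wait_evicted(
    mut rx: broadcast::Receiver<Status>,
    deadline: Duration,
) -> Result<(), EvictionError> {
    match timeout(deadline, rx.recv()).await {
        Ok(Ok(Status::Evicted)) => Ok(()),
        Ok(Ok(Status::Downloaded)) => Err(EvictionError::Downloaded),
        // The hunk above treats Closed as unreachable and re-checks state on Lagged;
        // collapsed here to keep the sketch short.
        Ok(Err(_closed_or_lagged)) => Err(EvictionError::NotFound),
        Err(_elapsed) => Err(EvictionError::Timeout),
    }
}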

View File

@@ -1,173 +1,13 @@
use futures::StreamExt;
use pageserver_api::key::CONTROLFILE_KEY;
use tokio::task::JoinSet;
use tracing::Instrument;
use utils::{
completion::{self, Completion},
id::TimelineId,
};
use super::*;
use crate::{context::DownloadBehavior, task_mgr::BACKGROUND_RUNTIME};
use crate::{task_mgr::TaskKind, tenant::harness::TenantHarness};
/// Used in tests to advance a future to a wanted await point, and no further.
const ADVANCE: std::time::Duration = std::time::Duration::from_secs(3600);
/// Used in tests to indicate a forever-long timeout; has to be longer than the total amount of ADVANCE
/// that timeouts use to advance futures.
const FOREVER: std::time::Duration = std::time::Duration::from_secs(ADVANCE.as_secs() * 24 * 7);
/// Demonstrate the API and resident -> evicted -> resident -> deleted transitions.
#[tokio::test]
async fn smoke_test() {
let handle = BACKGROUND_RUNTIME.handle();
let h = TenantHarness::create("smoke_test").unwrap();
let span = h.span();
let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1));
let (tenant, _) = h.load().await;
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Download);
let timeline = tenant
.create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
.await
.unwrap();
let layer = {
let mut layers = {
let layers = timeline.layers.read().await;
layers.resident_layers().collect::<Vec<_>>().await
};
assert_eq!(layers.len(), 1);
layers.swap_remove(0)
};
// all layers created at pageserver are like `layer`, initialized with strong
// Arc<DownloadedLayer>.
let img_before = {
let mut data = ValueReconstructState::default();
layer
.get_value_reconstruct_data(CONTROLFILE_KEY, Lsn(0x10)..Lsn(0x11), &mut data, &ctx)
.await
.unwrap();
data.img
.take()
.expect("tenant harness writes the control file")
};
// important part is evicting the layer, which can be done when there are no more ResidentLayer
// instances -- there currently are none, only two `Layer` values, one in the layermap and one
// in scope.
layer.evict_and_wait(FOREVER).await.unwrap();
// double-evict returns an error, which is valid if both eviction_task and disk usage based
// eviction would both evict the same layer at the same time.
let e = layer.evict_and_wait(FOREVER).await.unwrap_err();
assert!(matches!(e, EvictionError::NotFound));
// on accesses when the layer is evicted, it will automatically be downloaded.
let img_after = {
let mut data = ValueReconstructState::default();
layer
.get_value_reconstruct_data(CONTROLFILE_KEY, Lsn(0x10)..Lsn(0x11), &mut data, &ctx)
.instrument(download_span.clone())
.await
.unwrap();
data.img.take().unwrap()
};
assert_eq!(img_before, img_after);
// evict_and_wait can timeout, but it doesn't cancel the evicting itself
//
// ZERO for timeout does not work reliably, so first take up all spawn_blocking slots to
// artificially slow it down.
let helper = SpawnBlockingPoolHelper::consume_all_spawn_blocking_threads(handle).await;
match layer
.evict_and_wait(std::time::Duration::ZERO)
.await
.unwrap_err()
{
EvictionError::Timeout => {
// expected, but note that the eviction is "still ongoing"
helper.release().await;
// exhaust spawn_blocking pool to ensure it is now complete
SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(handle)
.await;
}
other => unreachable!("{other:?}"),
}
// only way to query if a layer is resident is to acquire a ResidentLayer instance.
// Layer::keep_resident never downloads, but it might initialize if the layer file is found
// downloaded locally.
let none = layer.keep_resident().await.unwrap();
assert!(
none.is_none(),
"Expected none, because eviction removed the local file, found: {none:?}"
);
// plain downloading is rarely needed
layer
.download_and_keep_resident()
.instrument(download_span)
.await
.unwrap();
// last important part is deletion on drop: gc and compaction use it for compacted L0 layers
// or fully garbage collected layers. deletion means deleting the local file, and scheduling a
// deletion of the remote file, which has already been unlinked from index_part.json.
//
// marking a layer to be deleted on drop is irreversible; there is no technical reason against
// reversibility, but currently it is not needed so it is not provided.
layer.delete_on_drop();
let path = layer.local_path().to_owned();
// wait_drop produces a future, unconnected to the Layer, which will resolve when the
// LayerInner::drop has completed.
let mut wait_drop = std::pin::pin!(layer.wait_drop());
// paused time doesn't really work well with timeouts and evict_and_wait, so delay pausing
// until here
tokio::time::pause();
tokio::time::timeout(ADVANCE, &mut wait_drop)
.await
.expect_err("should had timed out because two strong references exist");
tokio::fs::metadata(&path)
.await
.expect("the local layer file still exists");
let rtc = timeline.remote_client.as_ref().unwrap();
{
let layers = &[layer];
let mut g = timeline.layers.write().await;
g.finish_gc_timeline(layers);
// this just updates the remote_physical_size for demonstration purposes
rtc.schedule_gc_update(layers).unwrap();
}
// when strong references are dropped, the file is deleted and remote deletion is scheduled
wait_drop.await;
let e = tokio::fs::metadata(&path)
.await
.expect_err("the local file is deleted");
assert_eq!(e.kind(), std::io::ErrorKind::NotFound);
rtc.wait_completion().await.unwrap();
assert_eq!(rtc.get_remote_physical_size(), 0);
}
use crate::task_mgr::BACKGROUND_RUNTIME;
use crate::tenant::harness::TenantHarness;
/// This test demonstrates a previous hang when an eviction and a deletion were requested at the same
/// time. Now both of them complete per Arc drop semantics.
@@ -201,10 +41,10 @@ async fn evict_and_wait_on_wanted_deleted() {
let resident = layer.keep_resident().await.unwrap();
{
let mut evict_and_wait = std::pin::pin!(layer.evict_and_wait(FOREVER));
let mut evict_and_wait = std::pin::pin!(layer.evict_and_wait());
// drive the future to await on the status channel
tokio::time::timeout(ADVANCE, &mut evict_and_wait)
tokio::time::timeout(std::time::Duration::from_secs(3600), &mut evict_and_wait)
.await
.expect_err("should had been a timeout since we are holding the layer resident");
@@ -275,10 +115,10 @@ async fn residency_check_while_evict_and_wait_on_clogged_spawn_blocking() {
let resident = layer.keep_resident().await.unwrap();
let mut evict_and_wait = std::pin::pin!(layer.evict_and_wait(FOREVER));
let mut evict_and_wait = std::pin::pin!(layer.evict_and_wait());
// drive the future to await on the status channel
tokio::time::timeout(ADVANCE, &mut evict_and_wait)
tokio::time::timeout(std::time::Duration::from_secs(3600), &mut evict_and_wait)
.await
.expect_err("should had been a timeout since we are holding the layer resident");
assert_eq!(1, LAYER_IMPL_METRICS.started_evictions.get());
@@ -298,7 +138,7 @@ async fn residency_check_while_evict_and_wait_on_clogged_spawn_blocking() {
// because the keep_resident check alters the wanted-evicted flag without sending a message, the wait
// will never complete
let e = tokio::time::timeout(ADVANCE, &mut evict_and_wait)
let e = tokio::time::timeout(std::time::Duration::from_secs(3600), &mut evict_and_wait)
.await
.expect("no timeout, because keep_resident re-initialized")
.expect_err("eviction should not have succeeded because re-initialized");
@@ -318,10 +158,9 @@ async fn residency_check_while_evict_and_wait_on_clogged_spawn_blocking() {
.sum::<u64>()
);
let mut second_eviction = std::pin::pin!(layer.evict_and_wait(FOREVER));
let mut second_eviction = std::pin::pin!(layer.evict_and_wait());
// advance to the wait on the queue
tokio::time::timeout(ADVANCE, &mut second_eviction)
tokio::time::timeout(std::time::Duration::from_secs(3600), &mut second_eviction)
.await
.expect_err("timeout because spawn_blocking is clogged");
@@ -332,12 +171,7 @@ async fn residency_check_while_evict_and_wait_on_clogged_spawn_blocking() {
helper.release().await;
// the second_eviction gets to run here
//
// synchronize to be *strictly* after the second_eviction spawn_blocking run
SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(handle).await;
tokio::time::timeout(ADVANCE, &mut second_eviction)
tokio::time::timeout(std::time::Duration::from_secs(3600), &mut second_eviction)
.await
.expect("eviction goes through now that spawn_blocking is unclogged")
.expect("eviction should succeed, because version matches");
@@ -427,49 +261,3 @@ impl SpawnBlockingPoolHelper {
.await
}
}
#[test]
fn spawn_blocking_pool_helper_actually_works() {
// create a custom runtime for which we know and control how many blocking threads it has
//
// because the amount is not configurable for our helper, expect the same amount as
// BACKGROUND_RUNTIME using the tokio defaults would have.
let rt = tokio::runtime::Builder::new_current_thread()
.max_blocking_threads(512)
.enable_all()
.build()
.unwrap();
let handle = rt.handle();
rt.block_on(async move {
// this will not return until all threads are spun up and actually executing the code
// waiting on `consumed` to be `SpawnBlockingPoolHelper::release`'d.
let consumed = SpawnBlockingPoolHelper::consume_all_spawn_blocking_threads(handle).await;
println!("consumed");
let mut jh = std::pin::pin!(tokio::task::spawn_blocking(move || {
// this will not get to run before we release
}));
println!("spawned");
tokio::time::timeout(std::time::Duration::from_secs(1), &mut jh)
.await
.expect_err("the task should not have gotten to run yet");
println!("tried to join");
consumed.release().await;
println!("released");
tokio::time::timeout(std::time::Duration::from_secs(1), jh)
.await
.expect("no timeout")
.expect("no join error");
println!("joined");
});
}

View File

@@ -10,7 +10,7 @@ mod walreceiver;
use anyhow::{anyhow, bail, ensure, Context, Result};
use bytes::Bytes;
use camino::Utf8Path;
use camino::{Utf8Path, Utf8PathBuf};
use enumset::EnumSet;
use fail::fail_point;
use futures::stream::StreamExt;
@@ -1512,14 +1512,10 @@ impl Timeline {
return Ok(None);
};
// curl has this by default
let timeout = std::time::Duration::from_secs(120);
match local_layer.evict_and_wait(timeout).await {
match local_layer.evict_and_wait().await {
Ok(()) => Ok(Some(true)),
Err(EvictionError::NotFound) => Ok(Some(false)),
Err(EvictionError::Downloaded) => Ok(Some(false)),
Err(EvictionError::Timeout) => Ok(Some(false)),
}
}
}
@@ -3422,10 +3418,26 @@ impl Timeline {
let _g = span.entered();
let new_delta =
Handle::current().block_on(frozen_layer.write_to_disk(&self_clone, &ctx))?;
let new_delta_path = new_delta.local_path().to_owned();
// The write_to_disk() above calls writer.finish() which already did the fsync of the inodes.
// We just need to fsync the directory in which these inodes are linked,
// which we know to be the timeline directory.
// Sync it to disk.
//
// We must also fsync the timeline dir to ensure the directory entries for
// new layer files are durable.
//
// NB: timeline dir must be synced _after_ the file contents are durable.
// So, two separate fsyncs are required, they mustn't be batched.
//
// TODO: If we're running inside 'flush_frozen_layers' and there are multiple
// files to flush, the fsync overhead can be reduced as follows:
// 1. write them all to temporary file names
// 2. fsync them
// 3. rename to the final name
// 4. fsync the parent directory.
// Note that (1),(2),(3) today happen inside write_to_disk().
//
// FIXME: the writer already fsyncs all data, only rename needs to be fsynced here
par_fsync::par_fsync(&[new_delta_path]).context("fsync of delta layer")?;
par_fsync::par_fsync(&[self_clone
.conf
.timeline_path(&self_clone.tenant_shard_id, &self_clone.timeline_id)])
@@ -3658,10 +3670,25 @@ impl Timeline {
}
}
// The writer.finish() above already did the fsync of the inodes.
// We just need to fsync the directory in which these inodes are linked,
// which we know to be the timeline directory.
if !image_layers.is_empty() {
// Sync the new layer to disk before adding it to the layer map, to make sure
// we don't garbage collect something based on the new layer, before it has
// reached the disk.
//
// We must also fsync the timeline dir to ensure the directory entries for
// new layer files are durable
//
// Compaction creates multiple image layers. It would be better to create them all
// and fsync them all in parallel.
let all_paths = image_layers
.iter()
.map(|layer| layer.local_path().to_owned())
.collect::<Vec<_>>();
par_fsync::par_fsync_async(&all_paths)
.await
.context("fsync of newly created layer files")?;
if !all_paths.is_empty() {
par_fsync::par_fsync_async(&[self
.conf
.timeline_path(&self.tenant_shard_id, &self.timeline_id)])
@@ -4248,12 +4275,22 @@ impl Timeline {
}
}
// The writer.finish() above already did the fsync of the inodes.
// We just need to fsync the directory in which these inodes are linked,
// which we know to be the timeline directory.
// FIXME: the writer already fsyncs all data, only rename needs to be fsynced here
let layer_paths: Vec<Utf8PathBuf> = new_layers
.iter()
.map(|l| l.local_path().to_owned())
.collect();
// Fsync all the layer files and directory using multiple threads to
// minimize latency.
par_fsync::par_fsync_async(&layer_paths)
.await
.context("fsync all new layers")?;
let timeline_dir = self
.conf
.timeline_path(&self.tenant_shard_id, &self.timeline_id);
par_fsync::par_fsync_async(&[timeline_dir])
.await
.context("fsync of timeline dir")?;
@@ -5120,7 +5157,8 @@ mod tests {
let harness =
TenantHarness::create("two_layer_eviction_attempts_at_the_same_time").unwrap();
let (tenant, ctx) = harness.load().await;
let ctx = any_context();
let tenant = harness.do_try_load(&ctx).await.unwrap();
let timeline = tenant
.create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
.await
@@ -5134,10 +5172,8 @@ mod tests {
.expect("should had been resident")
.drop_eviction_guard();
let forever = std::time::Duration::from_secs(120);
let first = layer.evict_and_wait(forever);
let second = layer.evict_and_wait(forever);
let first = async { layer.evict_and_wait().await };
let second = async { layer.evict_and_wait().await };
let (first, second) = tokio::join!(first, second);
@@ -5156,6 +5192,12 @@ mod tests {
}
}
fn any_context() -> crate::context::RequestContext {
use crate::context::*;
use crate::task_mgr::*;
RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error)
}
async fn find_some_layer(timeline: &Timeline) -> Layer {
let layers = timeline.layers.read().await;
let desc = layers

View File

@@ -75,13 +75,14 @@ impl Timeline {
let keyspace = self.collect_keyspace(end_lsn, ctx).await?;
let mut adaptor = TimelineAdaptor::new(self, (end_lsn, keyspace));
let ctx_adaptor = RequestContextAdaptor(ctx.clone());
pageserver_compaction::compact_tiered::compact_tiered(
&mut adaptor,
end_lsn,
target_file_size,
fanout,
ctx,
&ctx_adaptor,
)
.await?;
@@ -142,13 +143,13 @@ impl CompactionJobExecutor for TimelineAdaptor {
type DeltaLayer = ResidentDeltaLayer;
type ImageLayer = ResidentImageLayer;
type RequestContext = crate::context::RequestContext;
type RequestContext = RequestContextAdaptor;
async fn get_layers(
&mut self,
key_range: &Range<Key>,
lsn_range: &Range<Lsn>,
_ctx: &RequestContext,
_ctx: &RequestContextAdaptor,
) -> anyhow::Result<Vec<OwnArc<PersistentLayerDesc>>> {
self.flush_updates().await?;
@@ -169,7 +170,7 @@ impl CompactionJobExecutor for TimelineAdaptor {
&mut self,
key_range: &Range<Key>,
lsn: Lsn,
_ctx: &RequestContext,
_ctx: &RequestContextAdaptor,
) -> anyhow::Result<Vec<Range<Key>>> {
if lsn == self.keyspace.0 {
Ok(pageserver_compaction::helpers::intersect_keyspace(
@@ -205,7 +206,7 @@ impl CompactionJobExecutor for TimelineAdaptor {
&mut self,
lsn: Lsn,
key_range: &Range<Key>,
ctx: &RequestContext,
ctx: &RequestContextAdaptor,
) -> anyhow::Result<()> {
Ok(self.create_image_impl(lsn, key_range, ctx).await?)
}
@@ -215,7 +216,7 @@ impl CompactionJobExecutor for TimelineAdaptor {
lsn_range: &Range<Lsn>,
key_range: &Range<Key>,
input_layers: &[ResidentDeltaLayer],
ctx: &RequestContext,
ctx: &RequestContextAdaptor,
) -> anyhow::Result<()> {
debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end);
@@ -286,7 +287,7 @@ impl CompactionJobExecutor for TimelineAdaptor {
async fn delete_layer(
&mut self,
layer: &OwnArc<PersistentLayerDesc>,
_ctx: &RequestContext,
_ctx: &RequestContextAdaptor,
) -> anyhow::Result<()> {
self.layers_to_delete.push(layer.clone().0);
Ok(())
@@ -298,7 +299,7 @@ impl TimelineAdaptor {
&mut self,
lsn: Lsn,
key_range: &Range<Key>,
ctx: &RequestContext,
ctx: &RequestContextAdaptor,
) -> Result<(), PageReconstructError> {
let timer = self.timeline.metrics.create_images_time_histo.start_timer();
@@ -360,7 +361,17 @@ impl TimelineAdaptor {
}
}
impl CompactionRequestContext for crate::context::RequestContext {}
pub struct RequestContextAdaptor(pub RequestContext);
impl std::ops::Deref for RequestContextAdaptor {
type Target = RequestContext;
fn deref(&self) -> &Self::Target {
&self.0
}
}
impl CompactionRequestContext for RequestContextAdaptor {}
#[derive(Debug, Clone)]
pub struct OwnArc<T>(pub Arc<T>);
@@ -438,7 +449,10 @@ impl CompactionLayer<Key> for ResidentDeltaLayer {
impl CompactionDeltaLayer<TimelineAdaptor> for ResidentDeltaLayer {
type DeltaEntry<'a> = DeltaEntry<'a>;
async fn load_keys<'a>(&self, ctx: &RequestContext) -> anyhow::Result<Vec<DeltaEntry<'_>>> {
async fn load_keys<'a>(
&self,
ctx: &RequestContextAdaptor,
) -> anyhow::Result<Vec<DeltaEntry<'_>>> {
self.0.load_keys(ctx).await
}
}
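
The adaptor above is the common newtype-plus-Deref pattern: implement an extra trait on a thin wrapper while Deref keeps the wrapped type's fields and methods reachable. In miniature, with made-up types:

// Hedged sketch of the newtype + Deref pattern used by RequestContextAdaptor.
struct Inner {
    name: String,
}

struct Adaptor(pub Inner);

impl std::ops::Deref for Adaptor {
    type Target = Inner;
    fn deref(&self) -> &Self::Target {
        &self.0
    }
}

// an extra trait implemented on the wrapper only
trait Describe {
    fn describe(&self) -> String;
}

impl Describe for Adaptor {
    fn describe(&self) -> String {
        format!("adaptor around {}", self.name) // Deref lets us reach Inner's fields
    }
}

fn main() {
    let a = Adaptor(Inner { name: "ctx".into() });
    assert_eq!(a.describe(), "adaptor around ctx");
}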

View File

@@ -204,7 +204,6 @@ impl Timeline {
evicted: usize,
errors: usize,
not_evictable: usize,
timeouts: usize,
#[allow(dead_code)]
skipped_for_shutdown: usize,
}
@@ -268,11 +267,7 @@ impl Timeline {
let layer = guard.drop_eviction_guard();
if no_activity_for > p.threshold {
// this could cause a lot of allocations in some cases
js.spawn(async move {
layer
.evict_and_wait(std::time::Duration::from_secs(5))
.await
});
js.spawn(async move { layer.evict_and_wait().await });
stats.candidates += 1;
}
}
@@ -285,9 +280,6 @@ impl Timeline {
Ok(Err(EvictionError::NotFound | EvictionError::Downloaded)) => {
stats.not_evictable += 1;
}
Ok(Err(EvictionError::Timeout)) => {
stats.timeouts += 1;
}
Err(je) if je.is_cancelled() => unreachable!("not used"),
Err(je) if je.is_panic() => {
/* already logged */
@@ -303,8 +295,7 @@ impl Timeline {
stats = join_all => {
if stats.candidates == stats.not_evictable {
debug!(stats=?stats, "eviction iteration complete");
} else if stats.errors > 0 || stats.not_evictable > 0 || stats.timeouts > 0 {
// reminder: timeouts are not eviction cancellations
} else if stats.errors > 0 || stats.not_evictable > 0 {
warn!(stats=?stats, "eviction iteration complete");
} else {
info!(stats=?stats, "eviction iteration complete");
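
For reference, the spawn-then-tally shape of this loop in isolation; a hedged sketch using a stand-in error type rather than the pageserver's EvictionError:

use std::future::Future;
use tokio::task::JoinSet;

#[derive(Default, Debug)]
struct Stats {
    candidates: usize,
    evicted: usize,
    not_evictable: usize,
    errors: usize,
}

async fn tally<F>(evictions: Vec<F>) -> Stats
where
    F: Future<Output = Result<(), &'static str>> + Send + 'static,
{
    let mut stats = Stats::default();
    let mut js = JoinSet::new();
    for fut in evictions {
        js.spawn(fut);
        stats.candidates += 1;
    }
    while let Some(joined) = js.join_next().await {
        match joined {
            Ok(Ok(())) => stats.evicted += 1,
            Ok(Err(_not_evictable)) => stats.not_evictable += 1,
            Err(join_error) if join_error.is_panic() => stats.errors += 1,
            Err(_cancelled) => stats.errors += 1,
        }
    }
    stats
}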

View File

@@ -1667,6 +1667,8 @@ mod tests {
use super::*;
use crate::tenant::harness::*;
use crate::tenant::remote_timeline_client::{remote_initdb_archive_path, INITDB_PATH};
use crate::tenant::Timeline;
use postgres_ffi::v14::xlog_utils::SIZEOF_CHECKPOINT;
use postgres_ffi::RELSEG_SIZE;
use crate::DEFAULT_PG_VERSION;

View File

@@ -262,7 +262,7 @@ impl PostgresRedoManager {
// next request will launch a new one.
if let Err(e) = result.as_ref() {
error!(
"error applying {} WAL records {}..{} ({} bytes) to key {key}, from base image with LSN {} to reconstruct page image at LSN {} n_attempts={}: {:?}",
"error applying {} WAL records {}..{} ({} bytes) to base image with LSN {} to reconstruct page image at LSN {} n_attempts={}: {:?}",
records.len(),
records.first().map(|p| p.0).unwrap_or(Lsn(0)),
records.last().map(|p| p.0).unwrap_or(Lsn(0)),

View File

@@ -252,6 +252,8 @@ mod test {
use super::*;
use std::collections::HashMap;
use crate::{pgdatadir_mapping::AuxFilesDirectory, walrecord::NeonWalRecord};
/// Test [`apply_in_neon`]'s handling of NeonWalRecord::AuxFile
#[test]
fn apply_aux_file_deltas() -> anyhow::Result<()> {

View File

@@ -1,5 +1,7 @@
use tracing;
use tracing::error;
use tracing::info;
use tracing::instrument;
use tracing::{error, info};
use crate::metrics::WalRedoKillCause;
use crate::metrics::WAL_REDO_PROCESS_COUNTERS;

View File

@@ -21,7 +21,7 @@ SHLIB_LINK_INTERNAL = $(libpq)
SHLIB_LINK = -lcurl
EXTENSION = neon
DATA = neon--1.0.sql neon--1.0--1.1.sql neon--1.1--1.2.sql neon--1.2--1.3.sql
DATA = neon--1.0.sql neon--1.0--1.1.sql neon--1.1--1.2.sql
PGFILEDESC = "neon - cloud storage for PostgreSQL"
EXTRA_CLEAN = \

View File

@@ -25,8 +25,6 @@
#include "funcapi.h"
#include "miscadmin.h"
#include "pagestore_client.h"
#include "common/hashfn.h"
#include "lib/hyperloglog.h"
#include "pgstat.h"
#include "postmaster/bgworker.h"
#include RELFILEINFO_HDR
@@ -62,7 +60,6 @@
#define BLOCKS_PER_CHUNK 128 /* 1Mb chunk */
#define MB ((uint64)1024*1024)
#define HYPER_LOG_LOG_BIT_WIDTH 10
#define SIZE_MB_TO_CHUNKS(size) ((uint32)((size) * MB / BLCKSZ / BLOCKS_PER_CHUNK))
typedef struct FileCacheEntry
@@ -87,8 +84,6 @@ typedef struct FileCacheControl
uint64 writes;
dlist_head lru; /* double linked list for LRU replacement
* algorithm */
hyperLogLogState wss_estimation; /* estimation of working set size */
uint8_t hyperloglog_hashes[(1 << HYPER_LOG_LOG_BIT_WIDTH) + 1];
} FileCacheControl;
static HTAB *lfc_hash;
@@ -237,14 +232,6 @@ lfc_shmem_startup(void)
lfc_ctl->writes = 0;
dlist_init(&lfc_ctl->lru);
/* Initialize hyper-log-log structure for estimating working set size */
initHyperLogLog(&lfc_ctl->wss_estimation, HYPER_LOG_LOG_BIT_WIDTH);
/* We need hashes in shared memory */
pfree(lfc_ctl->wss_estimation.hashesArr);
memset(lfc_ctl->hyperloglog_hashes, 0, sizeof lfc_ctl->hyperloglog_hashes);
lfc_ctl->wss_estimation.hashesArr = lfc_ctl->hyperloglog_hashes;
/* Recreate file cache on restart */
fd = BasicOpenFile(lfc_path, O_RDWR | O_CREAT | O_TRUNC);
if (fd < 0)
@@ -542,11 +529,6 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
}
entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL);
/* Approximate working set */
tag.blockNum = blkno;
addHyperLogLog(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag)));
if (entry == NULL || (entry->bitmap[chunk_offs >> 5] & (1 << (chunk_offs & 31))) == 0)
{
/* Page is not cached */
@@ -985,21 +967,3 @@ local_cache_pages(PG_FUNCTION_ARGS)
else
SRF_RETURN_DONE(funcctx);
}
PG_FUNCTION_INFO_V1(approximate_working_set_size);
Datum
approximate_working_set_size(PG_FUNCTION_ARGS)
{
int32 dc = -1;
if (lfc_size_limit != 0)
{
bool reset = PG_GETARG_BOOL(0);
LWLockAcquire(lfc_lock, reset ? LW_EXCLUSIVE : LW_SHARED);
dc = (int32) estimateHyperLogLog(&lfc_ctl->wss_estimation);
if (reset)
memset(lfc_ctl->hyperloglog_hashes, 0, sizeof lfc_ctl->hyperloglog_hashes);
LWLockRelease(lfc_lock);
}
PG_RETURN_INT32(dc);
}
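
The block above feeds each page tag's hash into a HyperLogLog and reads the estimate back as the approximate working set size. As a rough illustration of the register-update step only (bucket chosen by the top bits, rank from the leading zeros of the rest; the bias-corrected estimate is omitted), using the same 2^10-register sizing:

// Hedged sketch of a HyperLogLog register update, not the hyperloglog.c implementation.
const BIT_WIDTH: u32 = 10; // matches HYPER_LOG_LOG_BIT_WIDTH above
const REGISTERS: usize = 1 << BIT_WIDTH;

fn hll_add(registers: &mut [u8; REGISTERS], hash: u64) {
    // top BIT_WIDTH bits choose the register
    let idx = (hash >> (64 - BIT_WIDTH)) as usize;
    // rank = leading zeros of the remaining bits, plus one, capped
    let rest = hash << BIT_WIDTH;
    let rank = (rest.leading_zeros() + 1).min(64 - BIT_WIDTH + 1) as u8;
    if rank > registers[idx] {
        registers[idx] = rank;
    }
}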

View File

@@ -1,9 +0,0 @@
\echo Use "ALTER EXTENSION neon UPDATE TO '1.3'" to load this file. \quit
CREATE FUNCTION approximate_working_set_size(reset bool)
RETURNS integer
AS 'MODULE_PATHNAME', 'approximate_working_set_size'
LANGUAGE C PARALLEL SAFE;
GRANT EXECUTE ON FUNCTION approximate_working_set_size(bool) TO pg_monitor;

View File

@@ -1,6 +1,6 @@
# neon extension
comment = 'cloud storage for PostgreSQL'
default_version = '1.3'
default_version = '1.2'
module_pathname = '$libdir/neon'
relocatable = true
trusted = true

View File

@@ -13,7 +13,7 @@ use proxy::proxy::run_until_cancelled;
use tokio::net::TcpListener;
use anyhow::{anyhow, bail, ensure, Context};
use clap::Arg;
use clap::{self, Arg};
use futures::TryFutureExt;
use proxy::console::messages::MetricsAuxInfo;
use proxy::stream::{PqStream, Stream};

View File

@@ -358,7 +358,8 @@ impl Cache for ProjectInfoCacheImpl {
#[cfg(test)]
mod tests {
use super::*;
use crate::scram::ServerSecret;
use crate::{console::AuthSecret, scram::ServerSecret};
use std::{sync::Arc, time::Duration};
#[tokio::test]
async fn test_project_info_cache_settings() {

View File

@@ -1,4 +1,4 @@
use serde::{Deserialize, Serialize};
use serde::Deserialize;
use std::fmt;
use crate::auth::IpPattern;
@@ -98,16 +98,7 @@ pub struct MetricsAuxInfo {
pub endpoint_id: EndpointId,
pub project_id: ProjectId,
pub branch_id: BranchId,
pub cold_start_info: Option<ColdStartInfo>,
}
#[derive(Debug, Serialize, Deserialize, Clone)]
#[serde(rename_all = "snake_case")]
pub enum ColdStartInfo {
Unknown = 0,
Warm = 1,
PoolHit = 2,
PoolMiss = 3,
pub is_cold_start: Option<bool>,
}
#[cfg(test)]
@@ -120,7 +111,6 @@ mod tests {
"endpoint_id": "endpoint",
"project_id": "project",
"branch_id": "branch",
"cold_start_info": "unknown",
})
}
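
The rename_all = "snake_case" attribute is what makes the JSON carry values like "unknown" for this field. A small self-contained sketch of the round-trip (assumes serde with the derive feature plus serde_json):

use serde::{Deserialize, Serialize};

#[derive(Debug, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
enum ColdStartInfo {
    Unknown,
    Warm,
    PoolHit,
    PoolMiss,
}

fn main() {
    assert_eq!(
        serde_json::to_string(&ColdStartInfo::PoolHit).unwrap(),
        "\"pool_hit\""
    );
    let parsed: ColdStartInfo = serde_json::from_str("\"unknown\"").unwrap();
    assert_eq!(parsed, ColdStartInfo::Unknown);
}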

View File

@@ -4,7 +4,7 @@ use crate::{
};
use anyhow::Context;
use once_cell::sync::Lazy;
use postgres_backend::{AuthType, PostgresBackend, PostgresBackendTCP, QueryError};
use postgres_backend::{self, AuthType, PostgresBackend, PostgresBackendTCP, QueryError};
use pq_proto::{BeMessage, SINGLE_COL_ROWDESC};
use std::{convert::Infallible, future};
use tokio::net::{TcpListener, TcpStream};

View File

@@ -9,7 +9,7 @@ use tracing::{field::display, info_span, Span};
use uuid::Uuid;
use crate::{
console::messages::{ColdStartInfo, MetricsAuxInfo},
console::messages::MetricsAuxInfo,
error::ErrorKind,
metrics::{LatencyTimer, ENDPOINT_ERRORS_BY_KIND, ERROR_BY_KIND},
BranchId, DbName, EndpointId, ProjectId, RoleName,
@@ -42,7 +42,7 @@ pub struct RequestMonitoring {
error_kind: Option<ErrorKind>,
pub(crate) auth_method: Option<AuthMethod>,
success: bool,
cold_start_info: Option<ColdStartInfo>,
is_cold_start: Option<bool>,
// extra
// This sender is here to keep the request monitoring channel open while requests are taking place.
@@ -91,7 +91,7 @@ impl RequestMonitoring {
error_kind: None,
auth_method: None,
success: false,
cold_start_info: None,
is_cold_start: None,
sender: LOG_CHAN.get().and_then(|tx| tx.upgrade()),
latency_timer: LatencyTimer::new(protocol),
@@ -115,7 +115,7 @@ impl RequestMonitoring {
self.set_endpoint_id(x.endpoint_id);
self.branch = Some(x.branch_id);
self.project = Some(x.project_id);
self.cold_start_info = x.cold_start_info;
self.is_cold_start = x.is_cold_start;
}
pub fn set_project_id(&mut self, project_id: ProjectId) {

View File

@@ -93,7 +93,7 @@ struct RequestData {
/// Or if we make it to proxy_pass
success: bool,
/// Indicates if the cplane started the new compute node for this request.
cold_start_info: Option<String>,
is_cold_start: Option<bool>,
/// Tracks time from session start (HTTP request/libpq TCP handshake)
/// Through to success/failure
duration_us: u64,
@@ -121,10 +121,7 @@ impl From<RequestMonitoring> for RequestData {
region: value.region,
error: value.error_kind.as_ref().map(|e| e.to_metric_label()),
success: value.success,
cold_start_info: value
.cold_start_info
.as_ref()
.map(|x| serde_json::to_string(x).unwrap_or_default()),
is_cold_start: value.is_cold_start,
duration_us: SystemTime::from(value.first_packet)
.elapsed()
.unwrap_or_default()
@@ -458,7 +455,7 @@ mod tests {
region: "us-east-1",
error: None,
success: rng.gen(),
cold_start_info: Some("no".into()),
is_cold_start: Some(true),
duration_us: rng.gen_range(0..30_000_000),
}
}
@@ -528,16 +525,16 @@ mod tests {
assert_eq!(
file_stats,
[
(1314406, 3, 6000),
(1314399, 3, 6000),
(1314459, 3, 6000),
(1314416, 3, 6000),
(1314546, 3, 6000),
(1314388, 3, 6000),
(1314180, 3, 6000),
(1314416, 3, 6000),
(438359, 1, 2000)
]
(1315032, 3, 6000),
(1315025, 3, 6000),
(1315085, 3, 6000),
(1315042, 3, 6000),
(1315172, 3, 6000),
(1315014, 3, 6000),
(1314806, 3, 6000),
(1315042, 3, 6000),
(438563, 1, 2000)
],
);
tmpdir.close().unwrap();
@@ -566,12 +563,12 @@ mod tests {
assert_eq!(
file_stats,
[
(1220668, 5, 10000),
(1226818, 5, 10000),
(1228612, 5, 10000),
(1227974, 5, 10000),
(1219252, 5, 10000)
]
(1220433, 5, 10000),
(1226583, 5, 10000),
(1228377, 5, 10000),
(1227739, 5, 10000),
(1219017, 5, 10000)
],
);
tmpdir.close().unwrap();
@@ -602,12 +599,12 @@ mod tests {
assert_eq!(
file_stats,
[
(1206315, 5, 10000),
(1206046, 5, 10000),
(1206339, 5, 10000),
(1206327, 5, 10000),
(1206582, 5, 10000)
]
(1206080, 5, 10000),
(1205811, 5, 10000),
(1206104, 5, 10000),
(1206092, 5, 10000),
(1206347, 5, 10000)
],
);
tmpdir.close().unwrap();
@@ -631,16 +628,16 @@ mod tests {
assert_eq!(
file_stats,
[
(1314406, 3, 6000),
(1314399, 3, 6000),
(1314459, 3, 6000),
(1314416, 3, 6000),
(1314546, 3, 6000),
(1314388, 3, 6000),
(1314180, 3, 6000),
(1314416, 3, 6000),
(438359, 1, 2000)
]
(1315032, 3, 6000),
(1315025, 3, 6000),
(1315085, 3, 6000),
(1315042, 3, 6000),
(1315172, 3, 6000),
(1315014, 3, 6000),
(1314806, 3, 6000),
(1315042, 3, 6000),
(438563, 1, 2000)
],
);
tmpdir.close().unwrap();
@@ -676,7 +673,7 @@ mod tests {
// files are smaller than the size threshold, but they took too long to fill so were flushed early
assert_eq!(
file_stats,
[(658837, 2, 3001), (658551, 2, 3000), (658347, 2, 2999)]
[(659129, 2, 3001), (658842, 2, 3000), (658638, 2, 2999)],
);
tmpdir.close().unwrap();

View File

@@ -16,7 +16,7 @@ use crate::console::provider::{CachedAllowedIps, CachedRoleSecret, ConsoleBacken
use crate::console::{self, CachedNodeInfo, NodeInfo};
use crate::error::ErrorKind;
use crate::proxy::retry::{retry_after, NUM_RETRIES_CONNECT};
use crate::{http, sasl, scram};
use crate::{auth, http, sasl, scram};
use anyhow::{bail, Context};
use async_trait::async_trait;
use rstest::rstest;

View File

@@ -11,6 +11,7 @@ use bytes::{Bytes, BytesMut};
use futures::{SinkExt, StreamExt};
use postgres_protocol::message::frontend;
use tokio::io::{AsyncReadExt, DuplexStream};
use tokio_postgres::config::SslMode;
use tokio_postgres::tls::TlsConnect;
use tokio_util::codec::{Decoder, Encoder};

View File

@@ -667,6 +667,7 @@ impl<C: ClientInnerExt> Drop for Client<C> {
#[cfg(test)]
mod tests {
use env_logger;
use std::{mem, sync::atomic::AtomicBool};
use super::*;

View File

@@ -19,6 +19,8 @@ use utils::{bin_ser::LeSer, id::TenantTimelineId};
use crate::SafeKeeperConf;
use std::convert::TryInto;
pub const SK_MAGIC: u32 = 0xcafeceefu32;
pub const SK_FORMAT_VERSION: u32 = 7;
@@ -217,9 +219,12 @@ impl Storage for FileStorage {
#[cfg(test)]
mod test {
use super::FileStorage;
use super::*;
use crate::SafeKeeperConf;
use anyhow::Result;
use tokio::fs;
use utils::lsn::Lsn;
use utils::{id::TenantTimelineId, lsn::Lsn};
fn stub_conf() -> SafeKeeperConf {
let workdir = camino_tempfile::tempdir().unwrap().into_path();

View File

@@ -2,7 +2,8 @@
//! protocol commands.
use anyhow::Context;
use std::str::{self, FromStr};
use std::str::FromStr;
use std::str::{self};
use std::sync::Arc;
use tokio::io::{AsyncRead, AsyncWrite};
use tracing::{debug, info, info_span, Instrument};
@@ -15,8 +16,8 @@ use crate::safekeeper::Term;
use crate::timeline::TimelineError;
use crate::wal_service::ConnectionId;
use crate::{GlobalTimelines, SafeKeeperConf};
use postgres_backend::PostgresBackend;
use postgres_backend::QueryError;
use postgres_backend::{self, PostgresBackend};
use postgres_ffi::PG_TLI;
use pq_proto::{BeMessage, FeStartupPacket, RowDescriptor, INT4_OID, TEXT_OID};
use regex::Regex;

View File

@@ -2180,11 +2180,6 @@ class NeonAttachmentService(MetricsGetter):
self.stop(immediate=True)
@dataclass
class LogCursor:
_line_no: int
class NeonPageserver(PgProtocol):
"""
An object representing a running pageserver.
@@ -2348,18 +2343,7 @@ class NeonPageserver(PgProtocol):
value = self.http_client().get_metric_value(metric)
assert value == 0, f"Nonzero {metric} == {value}"
def assert_log_contains(
self, pattern: str, offset: None | LogCursor = None
) -> Tuple[str, LogCursor]:
"""Convenient for use inside wait_until()"""
res = self.log_contains(pattern, offset=offset)
assert res is not None
return res
def log_contains(
self, pattern: str, offset: None | LogCursor = None
) -> Optional[Tuple[str, LogCursor]]:
def log_contains(self, pattern: str) -> Optional[str]:
"""Check that the pageserver log contains a line that matches the given regex"""
logfile = self.workdir / "pageserver.log"
if not logfile.exists():
@@ -2373,17 +2357,12 @@ class NeonPageserver(PgProtocol):
# no guarantee it is already present in the log file. This hasn't
# been a problem in practice; our python tests are not fast enough
# to hit that race condition.
skip_until_line_no = 0 if offset is None else offset._line_no
cur_line_no = 0
with logfile.open("r") as f:
for line in f:
if cur_line_no < skip_until_line_no:
cur_line_no += 1
continue
if contains_re.search(line):
# found it!
cur_line_no += 1
return (line, LogCursor(cur_line_no))
return line
return None
def tenant_attach(

View File

@@ -286,11 +286,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
self.verbose_error(res)
def tenant_location_conf(
self,
tenant_id: Union[TenantId, TenantShardId],
location_conf=dict[str, Any],
flush_ms=None,
lazy: Optional[bool] = None,
self, tenant_id: Union[TenantId, TenantShardId], location_conf=dict[str, Any], flush_ms=None
):
body = location_conf.copy()
body["tenant_id"] = str(tenant_id)
@@ -299,9 +295,6 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
if flush_ms is not None:
params["flush_ms"] = str(flush_ms)
if lazy is not None:
params["lazy"] = "true" if lazy else "false"
res = self.put(
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/location_config",
json=body,

View File

@@ -20,7 +20,7 @@ def assert_tenant_state(
tenant: TenantId,
expected_state: str,
message: Optional[str] = None,
) -> None:
):
tenant_status = pageserver_http.tenant_status(tenant)
log.info(f"tenant_status: {tenant_status}")
assert tenant_status["state"]["slug"] == expected_state, message or tenant_status
@@ -206,8 +206,8 @@ def wait_for_last_record_lsn(
return current_lsn
if i % 10 == 0:
log.info(
"{}/{} waiting for last_record_lsn to reach {}, now {}, iteration {}".format(
tenant, timeline, lsn, current_lsn, i + 1
"waiting for last_record_lsn to reach {}, now {}, iteration {}".format(
lsn, current_lsn, i + 1
)
)
time.sleep(0.1)
@@ -292,7 +292,7 @@ def timeline_delete_wait_completed(
iterations: int = 20,
interval: Optional[float] = None,
**delete_args,
) -> None:
):
pageserver_http.timeline_delete(tenant_id=tenant_id, timeline_id=timeline_id, **delete_args)
wait_timeline_detail_404(pageserver_http, tenant_id, timeline_id, iterations, interval)
@@ -302,7 +302,7 @@ def assert_prefix_empty(
remote_storage: Optional[RemoteStorage],
prefix: Optional[str] = None,
allowed_postfix: Optional[str] = None,
) -> None:
):
assert remote_storage is not None
response = list_prefix(remote_storage, prefix)
keys = response["KeyCount"]

View File

@@ -252,16 +252,6 @@ class S3Storage:
log.info(f"deleted {cnt} objects from remote storage")
def tenant_path(self, tenant_id: TenantId) -> str:
return f"{self.prefix_in_bucket}/tenants/{tenant_id}"
def heatmap_key(self, tenant_id: TenantId) -> str:
return f"{self.tenant_path(tenant_id)}/{TENANT_HEATMAP_FILE_NAME}"
def heatmap_content(self, tenant_id: TenantId):
r = self.client.get_object(Bucket=self.bucket_name, Key=self.heatmap_key(tenant_id))
return json.loads(r["Body"].read().decode("utf-8"))
RemoteStorage = Union[LocalFsStorage, S3Storage]

View File

@@ -369,12 +369,7 @@ def start_in_background(
return spawned_process
WaitUntilRet = TypeVar("WaitUntilRet")
def wait_until(
number_of_iterations: int, interval: float, func: Callable[[], WaitUntilRet]
) -> WaitUntilRet:
def wait_until(number_of_iterations: int, interval: float, func: Fn):
"""
Wait until 'func' returns successfully, without exception. Returns the
last return value from the function.
@@ -392,18 +387,6 @@ def wait_until(
raise Exception("timed out while waiting for %s" % func) from last_exception
def assert_eq(a, b) -> None:
assert a == b
def assert_gt(a, b) -> None:
assert a > b
def assert_ge(a, b) -> None:
assert a >= b
def run_pg_bench_small(pg_bin: "PgBin", connstr: str):
"""
Fast way to populate data.

View File

@@ -63,11 +63,10 @@ def negative_env(neon_env_builder: NeonEnvBuilder) -> Generator[NegativeTests, N
]
)
wait_until(
50,
0.1,
lambda: env.pageserver.assert_log_contains(".*Error processing HTTP request: Bad request"),
)
def log_contains_bad_request():
assert env.pageserver.log_contains(".*Error processing HTTP request: Bad request")
wait_until(50, 0.1, log_contains_bad_request)
def test_null_body(negative_env: NegativeTests):

View File

@@ -200,7 +200,7 @@ class EvictionEnv:
tenant_ps.http_client().timeline_wait_logical_size(tenant_id, timeline_id)
def statvfs_called():
pageserver.assert_log_contains(".*running mocked statvfs.*")
assert pageserver.log_contains(".*running mocked statvfs.*")
# we most likely have already completed multiple runs
wait_until(10, 1, statvfs_called)
@@ -533,7 +533,7 @@ def test_pageserver_falls_back_to_global_lru(eviction_env: EvictionEnv, order: E
assert actual_change >= target, "eviction must always evict more than target"
time.sleep(1) # give log time to flush
env.neon_env.pageserver.assert_log_contains(GLOBAL_LRU_LOG_LINE)
assert env.neon_env.pageserver.log_contains(GLOBAL_LRU_LOG_LINE)
env.neon_env.pageserver.allowed_errors.append(".*" + GLOBAL_LRU_LOG_LINE)
@@ -767,7 +767,7 @@ def test_statvfs_error_handling(eviction_env: EvictionEnv):
eviction_order=EvictionOrder.ABSOLUTE_ORDER,
)
env.neon_env.pageserver.assert_log_contains(".*statvfs failed.*EIO")
assert env.neon_env.pageserver.log_contains(".*statvfs failed.*EIO")
env.neon_env.pageserver.allowed_errors.append(".*statvfs failed.*EIO")
@@ -801,9 +801,10 @@ def test_statvfs_pressure_usage(eviction_env: EvictionEnv):
eviction_order=EvictionOrder.ABSOLUTE_ORDER,
)
wait_until(
10, 1, lambda: env.neon_env.pageserver.assert_log_contains(".*disk usage pressure relieved")
)
def relieved_log_message():
assert env.neon_env.pageserver.log_contains(".*disk usage pressure relieved")
wait_until(10, 1, relieved_log_message)
def less_than_max_usage_pct():
post_eviction_total_size, _, _ = env.timelines_du(env.pageserver)
@@ -844,9 +845,10 @@ def test_statvfs_pressure_min_avail_bytes(eviction_env: EvictionEnv):
eviction_order=EvictionOrder.ABSOLUTE_ORDER,
)
wait_until(
10, 1, lambda: env.neon_env.pageserver.assert_log_contains(".*disk usage pressure relieved")
)
def relieved_log_message():
assert env.neon_env.pageserver.log_contains(".*disk usage pressure relieved")
wait_until(10, 1, relieved_log_message)
def more_than_min_avail_bytes_freed():
post_eviction_total_size, _, _ = env.timelines_du(env.pageserver)

View File

@@ -36,7 +36,7 @@ def test_duplicate_layers(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
pg_bin.run_capture(["pgbench", "-i", "-s1", connstr])
time.sleep(10) # let compaction to be performed
env.pageserver.assert_log_contains("compact-level0-phase1-return-same")
assert env.pageserver.log_contains("compact-level0-phase1-return-same")
def test_actually_duplicated_l1(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):

View File

@@ -1,84 +0,0 @@
from pathlib import Path
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnv
def test_explain_with_lfc_stats(neon_simple_env: NeonEnv):
env = neon_simple_env
cache_dir = Path(env.repo_dir) / "file_cache"
cache_dir.mkdir(exist_ok=True)
branchname = "test_explain_with_lfc_stats"
env.neon_cli.create_branch(branchname, "empty")
log.info(f"Creating endopint with 1MB shared_buffers and 64 MB LFC for branch {branchname}")
endpoint = env.endpoints.create_start(
branchname,
config_lines=[
"shared_buffers='1MB'",
f"neon.file_cache_path='{cache_dir}/file.cache'",
"neon.max_file_cache_size='128MB'",
"neon.file_cache_size_limit='64MB'",
],
)
cur = endpoint.connect().cursor()
log.info(f"preparing some data in {endpoint.connstr()}")
ddl = """
CREATE TABLE pgbench_accounts (
aid bigint NOT NULL,
bid integer,
abalance integer,
filler character(84),
-- more web-app like columns
text_column_plain TEXT DEFAULT repeat('NeonIsCool', 5),
jsonb_column_extended JSONB DEFAULT ('{ "tell everyone": [' || repeat('{"Neon": "IsCool"},',9) || ' {"Neon": "IsCool"}]}')::jsonb
)
WITH (fillfactor='100');
"""
cur.execute(ddl)
cur.execute(
"insert into pgbench_accounts(aid,bid,abalance,filler) select aid, (aid - 1) / 100000 + 1, 0, '' from generate_series(1, 100000) as aid;"
)
log.info(f"warming up caches with sequential scan in {endpoint.connstr()}")
cur.execute("SELECT * FROM pgbench_accounts WHERE abalance > 0")
log.info("running explain analyze without LFC values to verify they do not show up in the plan")
cur.execute("EXPLAIN (ANALYZE, BUFFERS) SELECT * FROM pgbench_accounts WHERE abalance > 0")
rows = cur.fetchall()
plan = "\n".join(r[0] for r in rows)
log.debug(plan)
assert "Seq Scan on pgbench_accounts" in plan
assert "Buffers: shared hit" in plan
assert "File cache: hits=" not in plan
log.info("running explain analyze WITH LFC values to verify they do now show up")
cur.execute(
"EXPLAIN (ANALYZE, BUFFERS,FILECACHE) SELECT * FROM pgbench_accounts WHERE abalance > 0"
)
rows = cur.fetchall()
plan = "\n".join(r[0] for r in rows)
log.debug(plan)
assert "Seq Scan on pgbench_accounts" in plan
assert "Buffers: shared hit" in plan
assert "File cache: hits=" in plan
log.info("running explain analyze WITH LFC values to verify json output")
cur.execute(
"EXPLAIN (ANALYZE, BUFFERS,FILECACHE, FORMAT JSON) SELECT * FROM pgbench_accounts WHERE abalance > 0"
)
jsonplan = cur.fetchall()[0][0]
log.debug(jsonplan)
# Directly access the 'Plan' part of the first element of the JSON array
plan_details = jsonplan[0]["Plan"]
# Extract "File Cache Hits" and "File Cache Misses"
file_cache_hits = plan_details.get("File Cache Hits")
file_cache_misses = plan_details.get("File Cache Misses")
# Now you can assert the values
assert file_cache_hits >= 5000, f"Expected File Cache Hits to be > 5000, got {file_cache_hits}"
assert file_cache_misses == 0, f"Expected File Cache Misses to be 0, got {file_cache_misses}"

View File

@@ -184,13 +184,10 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder):
# NB: the layer file is now unlinked from the index part, but, because we made the delete
# operation stuck, the layer file itself is still in the remote_storage
wait_until(
10,
0.5,
lambda: env.pageserver.assert_log_contains(
f".*{tenant_id}.*at failpoint.*{failpoint_name}"
),
)
def delete_at_pause_point():
assert env.pageserver.log_contains(f".*{tenant_id}.*at failpoint.*{failpoint_name}")
wait_until(10, 0.5, delete_at_pause_point)
future_layer_path = env.pageserver_remote_storage.remote_layer_path(
tenant_id, timeline_id, future_layer.to_str(), generation=generation_before_detach
)

View File

@@ -1,74 +0,0 @@
from pathlib import Path
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnv
from fixtures.utils import query_scalar
def test_lfc_working_set_approximation(neon_simple_env: NeonEnv):
env = neon_simple_env
cache_dir = Path(env.repo_dir) / "file_cache"
cache_dir.mkdir(exist_ok=True)
branchname = "test_approximate_working_set_size"
env.neon_cli.create_branch(branchname, "empty")
log.info(f"Creating endopint with 1MB shared_buffers and 64 MB LFC for branch {branchname}")
endpoint = env.endpoints.create_start(
branchname,
config_lines=[
"shared_buffers='1MB'",
f"neon.file_cache_path='{cache_dir}/file.cache'",
"neon.max_file_cache_size='128MB'",
"neon.file_cache_size_limit='64MB'",
],
)
cur = endpoint.connect().cursor()
cur.execute("create extension neon")
log.info(f"preparing some data in {endpoint.connstr()}")
ddl = """
CREATE TABLE pgbench_accounts (
aid bigint NOT NULL,
bid integer,
abalance integer,
filler character(84),
-- more web-app like columns
text_column_plain TEXT DEFAULT repeat('NeonIsCool', 5),
jsonb_column_extended JSONB DEFAULT ('{ "tell everyone": [' || repeat('{"Neon": "IsCool"},',9) || ' {"Neon": "IsCool"}]}')::jsonb
)
WITH (fillfactor='100');
"""
cur.execute(ddl)
# prepare index access below
cur.execute(
"ALTER TABLE ONLY pgbench_accounts ADD CONSTRAINT pgbench_accounts_pkey PRIMARY KEY (aid)"
)
cur.execute(
"insert into pgbench_accounts(aid,bid,abalance,filler) select aid, (aid - 1) / 100000 + 1, 0, '' from generate_series(1, 100000) as aid;"
)
# ensure correct query plans and stats
cur.execute("vacuum ANALYZE pgbench_accounts")
# determine table size - working set should approximate table size after sequential scan
pages = query_scalar(cur, "SELECT relpages FROM pg_class WHERE relname = 'pgbench_accounts'")
log.info(f"pgbench_accounts has {pages} pages, resetting working set to zero")
cur.execute("select approximate_working_set_size(true)")
cur.execute(
'SELECT count(*) FROM pgbench_accounts WHERE abalance > 0 or jsonb_column_extended @> \'{"tell everyone": [{"Neon": "IsCool"}]}\'::jsonb'
)
# verify working set size after sequential scan matches table size and reset working set for next test
blocks = query_scalar(cur, "select approximate_working_set_size(true)")
log.info(f"working set size after sequential scan on pgbench_accounts {blocks}")
assert pages * 0.8 < blocks < pages * 1.2
# run a few point queries with index lookup
cur.execute("SELECT abalance FROM pgbench_accounts WHERE aid = 4242")
cur.execute("SELECT abalance FROM pgbench_accounts WHERE aid = 54242")
cur.execute("SELECT abalance FROM pgbench_accounts WHERE aid = 104242")
cur.execute("SELECT abalance FROM pgbench_accounts WHERE aid = 204242")
# verify working set size after some index access of a few select pages only
blocks = query_scalar(cur, "select approximate_working_set_size(true)")
log.info(f"working set size after some index access of a few select pages only {blocks}")
assert blocks < 10

View File

@@ -34,7 +34,7 @@ def test_logging_event_count(neon_env_builder: NeonEnvBuilder, level: str):
def assert_logged():
if not log_expected:
return
env.pageserver.assert_log_contains(f".*{msg_id}.*")
assert env.pageserver.log_contains(f".*{msg_id}.*")
wait_until(10, 0.5, assert_logged)

View File

@@ -23,7 +23,7 @@ def test_neon_extension(neon_env_builder: NeonEnvBuilder):
# IMPORTANT:
# If the version has changed, the test should be updated.
# Ensure that the default version is also updated in the neon.control file
assert cur.fetchone() == ("1.3",)
assert cur.fetchone() == ("1.2",)
cur.execute("SELECT * from neon.NEON_STAT_FILE_CACHE")
res = cur.fetchall()
log.info(res)

View File

@@ -1,12 +1,9 @@
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnv
from fixtures.pg_version import PgVersion, skip_on_postgres
from fixtures.pg_version import PgVersion
from fixtures.utils import wait_until
@skip_on_postgres(
PgVersion.V15, reason="skip on pg15 due to https://github.com/neondatabase/neon/issues/6969"
)
def test_neon_superuser(neon_simple_env: NeonEnv, pg_version: PgVersion):
env = neon_simple_env
env.neon_cli.create_branch("test_neon_superuser_publisher", "empty")

View File

@@ -432,7 +432,7 @@ def test_deletion_queue_recovery(
main_pageserver.start()
def assert_deletions_submitted(n: int) -> None:
def assert_deletions_submitted(n: int):
assert ps_http.get_metric_value("pageserver_deletion_queue_submitted_total") == n
# After restart, issue a flush to kick the deletion frontend to do recovery.

View File

@@ -1,4 +1,3 @@
import json
import random
from pathlib import Path
from typing import Any, Dict, Optional
@@ -11,7 +10,7 @@ from fixtures.pageserver.utils import (
poll_for_remote_storage_iterations,
tenant_delete_wait_completed,
)
from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind, S3Storage
from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind
from fixtures.types import TenantId, TimelineId
from fixtures.utils import wait_until
from fixtures.workload import Workload
@@ -437,7 +436,6 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder):
)
env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)
assert env.attachment_service is not None
assert isinstance(env.pageserver_remote_storage, S3Storage) # Satisfy linter
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
@@ -493,35 +491,18 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder):
# Do evictions on attached pageserver, check secondary follows along
# ==================================================================
try:
log.info("Evicting a layer...")
layer_to_evict = list_layers(ps_attached, tenant_id, timeline_id)[0]
some_other_layer = list_layers(ps_attached, tenant_id, timeline_id)[1]
log.info(f"Victim layer: {layer_to_evict.name}")
ps_attached.http_client().evict_layer(
tenant_id, timeline_id, layer_name=layer_to_evict.name
)
log.info("Evicting a layer...")
layer_to_evict = list_layers(ps_attached, tenant_id, timeline_id)[0]
ps_attached.http_client().evict_layer(tenant_id, timeline_id, layer_name=layer_to_evict.name)
log.info("Synchronizing after eviction...")
ps_attached.http_client().tenant_heatmap_upload(tenant_id)
heatmap_after_eviction = env.pageserver_remote_storage.heatmap_content(tenant_id)
heatmap_layers = set(
layer["name"] for layer in heatmap_after_eviction["timelines"][0]["layers"]
)
assert layer_to_evict.name not in heatmap_layers
assert some_other_layer.name in heatmap_layers
log.info("Synchronizing after eviction...")
ps_attached.http_client().tenant_heatmap_upload(tenant_id)
ps_secondary.http_client().tenant_secondary_download(tenant_id)
ps_secondary.http_client().tenant_secondary_download(tenant_id)
assert layer_to_evict not in list_layers(ps_attached, tenant_id, timeline_id)
assert list_layers(ps_attached, tenant_id, timeline_id) == list_layers(
ps_secondary, tenant_id, timeline_id
)
except:
# On assertion failures, log some details to help with debugging
heatmap = env.pageserver_remote_storage.heatmap_content(tenant_id)
log.warn(f"heatmap contents: {json.dumps(heatmap,indent=2)}")
raise
assert layer_to_evict not in list_layers(ps_attached, tenant_id, timeline_id)
assert list_layers(ps_attached, tenant_id, timeline_id) == list_layers(
ps_secondary, tenant_id, timeline_id
)
# Scrub the remote storage
# ========================

View File

@@ -1,110 +0,0 @@
import asyncio
import time
from typing import Tuple
import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
NeonEnv,
NeonEnvBuilder,
tenant_get_shards,
)
from fixtures.pageserver.http import PageserverHttpClient
from fixtures.pageserver.utils import wait_for_last_record_lsn
from fixtures.types import Lsn, TenantId, TimelineId
from fixtures.utils import wait_until
TIMELINE_COUNT = 10
ENTRIES_PER_TIMELINE = 10_000
CHECKPOINT_TIMEOUT_SECONDS = 60
TENANT_CONF = {
# Large `checkpoint_distance` effectively disables size
# based checkpointing.
"checkpoint_distance": f"{2 * 1024 ** 3}",
"checkpoint_timeout": f"{CHECKPOINT_TIMEOUT_SECONDS}s",
}
async def run_worker(env: NeonEnv, entries: int) -> Tuple[TenantId, TimelineId, Lsn]:
tenant, timeline = env.neon_cli.create_tenant(conf=TENANT_CONF)
with env.endpoints.create_start("main", tenant_id=tenant) as ep:
conn = await ep.connect_async()
try:
await conn.execute("CREATE TABLE IF NOT EXISTS t(key serial primary key, value text)")
await conn.execute(
f"INSERT INTO t SELECT i, CONCAT('payload_', i) FROM generate_series(0,{entries}) as i"
)
finally:
await conn.close(timeout=10)
last_flush_lsn = Lsn(ep.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0])
return tenant, timeline, last_flush_lsn
async def workload(
env: NeonEnv, timelines: int, entries: int
) -> list[Tuple[TenantId, TimelineId, Lsn]]:
workers = [asyncio.create_task(run_worker(env, entries)) for _ in range(timelines)]
return await asyncio.gather(*workers)
def wait_until_pageserver_is_caught_up(
env: NeonEnv, last_flush_lsns: list[Tuple[TenantId, TimelineId, Lsn]]
):
for tenant, timeline, last_flush_lsn in last_flush_lsns:
shards = tenant_get_shards(env, tenant)
for tenant_shard_id, pageserver in shards:
waited = wait_for_last_record_lsn(
pageserver.http_client(), tenant_shard_id, timeline, last_flush_lsn
)
assert waited >= last_flush_lsn
def wait_for_wal_ingest_metric(pageserver_http: PageserverHttpClient) -> float:
def query():
value = pageserver_http.get_metric_value("pageserver_wal_ingest_records_received_total")
assert value is not None
return value
# The metric gets initialised on the first update.
# Retry a few times, but return 0 if it's stable.
try:
return float(wait_until(3, 0.5, query))
except Exception:
return 0
@pytest.mark.parametrize("immediate_shutdown", [True, False])
def test_pageserver_small_inmemory_layers(
neon_env_builder: NeonEnvBuilder, immediate_shutdown: bool
):
"""
Test that open layers get flushed after the `checkpoint_timeout` config
and do not require WAL reingest upon restart.
The workload creates a number of timelines and writes some data to each,
but not enough to trigger flushes via the `checkpoint_distance` config.
"""
env = neon_env_builder.init_configs()
env.start()
last_flush_lsns = asyncio.run(workload(env, TIMELINE_COUNT, ENTRIES_PER_TIMELINE))
wait_until_pageserver_is_caught_up(env, last_flush_lsns)
ps_http_client = env.pageserver.http_client()
total_wal_ingested_before_restart = wait_for_wal_ingest_metric(ps_http_client)
log.info("Sleeping for checkpoint timeout ...")
time.sleep(CHECKPOINT_TIMEOUT_SECONDS + 5)
env.pageserver.restart(immediate=immediate_shutdown)
wait_until_pageserver_is_caught_up(env, last_flush_lsns)
total_wal_ingested_after_restart = wait_for_wal_ingest_metric(ps_http_client)
log.info(f"WAL ingested before restart: {total_wal_ingested_before_restart}")
log.info(f"WAL ingested after restart: {total_wal_ingested_after_restart}")
leeway = total_wal_ingested_before_restart * 5 / 100
assert total_wal_ingested_after_restart <= leeway

View File

@@ -28,14 +28,7 @@ from fixtures.remote_storage import (
available_remote_storages,
)
from fixtures.types import Lsn, TenantId, TimelineId
from fixtures.utils import (
assert_eq,
assert_ge,
assert_gt,
print_gc_result,
query_scalar,
wait_until,
)
from fixtures.utils import print_gc_result, query_scalar, wait_until
from requests import ReadTimeout
@@ -127,10 +120,10 @@ def test_remote_storage_backup_and_restore(
log.info(f"upload of checkpoint {checkpoint_number} is done")
# Check that we had to retry the uploads
env.pageserver.assert_log_contains(
assert env.pageserver.log_contains(
".*failed to perform remote task UploadLayer.*, will retry.*"
)
env.pageserver.assert_log_contains(
assert env.pageserver.log_contains(
".*failed to perform remote task UploadMetadata.*, will retry.*"
)
@@ -299,9 +292,9 @@ def test_remote_storage_upload_queue_retries(
print_gc_result(gc_result)
assert gc_result["layers_removed"] > 0
wait_until(2, 1, lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="upload"), 0))
wait_until(2, 1, lambda: assert_eq(get_queued_count(file_kind="index", op_kind="upload"), 0))
wait_until(2, 1, lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="delete"), 0))
wait_until(2, 1, lambda: get_queued_count(file_kind="layer", op_kind="upload") == 0)
wait_until(2, 1, lambda: get_queued_count(file_kind="index", op_kind="upload") == 0)
wait_until(2, 1, lambda: get_queued_count(file_kind="layer", op_kind="delete") == 0)
# let all future operations queue up
configure_storage_sync_failpoints("return")
@@ -329,17 +322,17 @@ def test_remote_storage_upload_queue_retries(
churn_while_failpoints_active_thread.start()
# wait for churn thread's data to get stuck in the upload queue
wait_until(10, 0.5, lambda: assert_gt(get_queued_count(file_kind="layer", op_kind="upload"), 0))
wait_until(10, 0.5, lambda: assert_ge(get_queued_count(file_kind="index", op_kind="upload"), 2))
wait_until(10, 0.5, lambda: assert_gt(get_queued_count(file_kind="layer", op_kind="delete"), 0))
wait_until(10, 0.1, lambda: get_queued_count(file_kind="layer", op_kind="upload") > 0)
wait_until(10, 0.1, lambda: get_queued_count(file_kind="index", op_kind="upload") >= 2)
wait_until(10, 0.1, lambda: get_queued_count(file_kind="layer", op_kind="delete") > 0)
# unblock churn operations
configure_storage_sync_failpoints("off")
# ... and wait for them to finish. Exponential back-off in upload queue, so, gracious timeouts.
wait_until(30, 1, lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="upload"), 0))
wait_until(30, 1, lambda: assert_eq(get_queued_count(file_kind="index", op_kind="upload"), 0))
wait_until(30, 1, lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="delete"), 0))
wait_until(30, 1, lambda: get_queued_count(file_kind="layer", op_kind="upload") == 0)
wait_until(30, 1, lambda: get_queued_count(file_kind="index", op_kind="upload") == 0)
wait_until(30, 1, lambda: get_queued_count(file_kind="layer", op_kind="delete") == 0)
# The churn thread doesn't make progress once it blocks on the first wait_completion() call,
# so, give it some time to wrap up.
@@ -891,23 +884,26 @@ def wait_upload_queue_empty(
wait_until(
2,
1,
lambda: assert_eq(
get_queued_count(client, tenant_id, timeline_id, file_kind="layer", op_kind="upload"), 0
),
lambda: get_queued_count(
client, tenant_id, timeline_id, file_kind="layer", op_kind="upload"
)
== 0,
)
wait_until(
2,
1,
lambda: assert_eq(
get_queued_count(client, tenant_id, timeline_id, file_kind="index", op_kind="upload"), 0
),
lambda: get_queued_count(
client, tenant_id, timeline_id, file_kind="index", op_kind="upload"
)
== 0,
)
wait_until(
2,
1,
lambda: assert_eq(
get_queued_count(client, tenant_id, timeline_id, file_kind="layer", op_kind="delete"), 0
),
lambda: get_queued_count(
client, tenant_id, timeline_id, file_kind="layer", op_kind="delete"
)
== 0,
)

View File

@@ -116,7 +116,7 @@ def test_sharding_service_smoke(
# Marking a pageserver offline should migrate tenants away from it.
env.attachment_service.node_configure(env.pageservers[0].id, {"availability": "Offline"})
def node_evacuated(node_id: int) -> None:
def node_evacuated(node_id: int):
counts = get_node_shard_counts(env, tenant_ids)
assert counts[node_id] == 0
@@ -146,8 +146,6 @@ def test_sharding_service_smoke(
for tid in tenant_ids:
tenant_delete_wait_completed(env.attachment_service.pageserver_api(), tid, 10)
env.attachment_service.consistency_check()
# Set a scheduling policy on one node, create all the tenants, observe
# that the scheduling policy is respected.
env.attachment_service.node_configure(env.pageservers[1].id, {"scheduling": "Draining"})
@@ -258,8 +256,9 @@ def test_sharding_service_restart(neon_env_builder: NeonEnvBuilder):
env.attachment_service.consistency_check()
@pytest.mark.parametrize("warm_up", [True, False])
def test_sharding_service_onboarding(neon_env_builder: NeonEnvBuilder, warm_up: bool):
def test_sharding_service_onboarding(
neon_env_builder: NeonEnvBuilder,
):
"""
We onboard tenants to the sharding service by treating it as a 'virtual pageserver'
which provides the /location_config API. This is similar to creating a tenant,
@@ -307,23 +306,6 @@ def test_sharding_service_onboarding(neon_env_builder: NeonEnvBuilder, warm_up:
},
)
if warm_up:
origin_ps.http_client().tenant_heatmap_upload(tenant_id)
# We expect to be called via live migration code, which may try to configure the tenant into secondary
# mode before attaching it.
virtual_ps_http.tenant_location_conf(
tenant_id,
{
"mode": "Secondary",
"secondary_conf": {"warm": True},
"tenant_conf": {},
"generation": None,
},
)
virtual_ps_http.tenant_secondary_download(tenant_id)
# Call into attachment service to onboard the tenant
generation += 1
virtual_ps_http.tenant_location_conf(
@@ -369,9 +351,7 @@ def test_sharding_service_onboarding(neon_env_builder: NeonEnvBuilder, warm_up:
assert len(dest_tenants) == 1
assert TenantId(dest_tenants[0]["id"]) == tenant_id
# sharding service advances generation by 1 when it first attaches. We started
# with a nonzero generation so this equality also proves that the generation
# was properly carried over during onboarding.
# sharding service advances generation by 1 when it first attaches
assert dest_tenants[0]["generation"] == generation + 1
# The onboarded tenant should survive a restart of sharding service
@@ -382,31 +362,6 @@ def test_sharding_service_onboarding(neon_env_builder: NeonEnvBuilder, warm_up:
dest_ps.stop()
dest_ps.start()
# Having onboarded via /location_config, we should also be able to update the
# TenantConf part of LocationConf, without inadvertently resetting the generation
modified_tenant_conf = {"max_lsn_wal_lag": 1024 * 1024 * 1024 * 100}
dest_tenant_before_conf_change = dest_ps.http_client().tenant_status(tenant_id)
# The generation has moved on since we onboarded
assert generation != dest_tenant_before_conf_change["generation"]
virtual_ps_http.tenant_location_conf(
tenant_id,
{
"mode": "AttachedSingle",
"secondary_conf": None,
"tenant_conf": modified_tenant_conf,
# This is intentionally a stale generation
"generation": generation,
},
)
dest_tenant_after_conf_change = dest_ps.http_client().tenant_status(tenant_id)
assert (
dest_tenant_after_conf_change["generation"] == dest_tenant_before_conf_change["generation"]
)
dest_tenant_conf_after = dest_ps.http_client().tenant_config(tenant_id)
assert dest_tenant_conf_after.tenant_specific_overrides == modified_tenant_conf
env.attachment_service.consistency_check()
@@ -450,7 +405,7 @@ def test_sharding_service_compute_hook(
env.attachment_service.node_configure(env.pageservers[0].id, {"availability": "Offline"})
def node_evacuated(node_id: int) -> None:
def node_evacuated(node_id: int):
counts = get_node_shard_counts(env, [env.initial_tenant])
assert counts[node_id] == 0
@@ -712,41 +667,3 @@ def test_sharding_service_auth(neon_env_builder: NeonEnvBuilder):
svc.request(
"POST", f"{api}/upcall/v1/re-attach", headers=svc.headers(TokenScope.PAGE_SERVER_API)
)
def test_sharding_service_tenant_conf(neon_env_builder: NeonEnvBuilder):
"""
Validate the pageserver-compatible API endpoints for setting and getting tenant conf, without
supplying the whole LocationConf.
"""
env = neon_env_builder.init_start()
tenant_id = env.initial_tenant
http = env.attachment_service.pageserver_api()
default_value = "7days"
new_value = "1h"
http.set_tenant_config(tenant_id, {"pitr_interval": new_value})
# Ensure the change landed on the storage controller
readback_controller = http.tenant_config(tenant_id)
assert readback_controller.effective_config["pitr_interval"] == new_value
assert readback_controller.tenant_specific_overrides["pitr_interval"] == new_value
# Ensure the change made it down to the pageserver
readback_ps = env.pageservers[0].http_client().tenant_config(tenant_id)
assert readback_ps.effective_config["pitr_interval"] == new_value
assert readback_ps.tenant_specific_overrides["pitr_interval"] == new_value
# Omitting a value clears it. This looks different in storage controller
# vs. pageserver API calls, because pageserver has defaults.
http.set_tenant_config(tenant_id, {})
readback_controller = http.tenant_config(tenant_id)
assert readback_controller.effective_config["pitr_interval"] is None
assert readback_controller.tenant_specific_overrides["pitr_interval"] is None
readback_ps = env.pageservers[0].http_client().tenant_config(tenant_id)
assert readback_ps.effective_config["pitr_interval"] == default_value
assert "pitr_interval" not in readback_ps.tenant_specific_overrides
env.attachment_service.consistency_check()

View File

@@ -270,7 +270,7 @@ eviction_policy = { "kind" = "LayerAccessThreshold", period = "20s", threshold =
"period": "20s",
"threshold": "23h",
}
assert final_effective_config["max_lsn_wal_lag"] == 1024 * 1024 * 1024
assert final_effective_config["max_lsn_wal_lag"] == 10 * 1024 * 1024
# restart the pageserver and ensure that the config is still correct
env.pageserver.stop()

View File

@@ -505,10 +505,10 @@ def test_tenant_delete_concurrent(
return ps_http.tenant_delete(tenant_id)
def hit_remove_failpoint():
env.pageserver.assert_log_contains(f"at failpoint {BEFORE_REMOVE_FAILPOINT}")
assert env.pageserver.log_contains(f"at failpoint {BEFORE_REMOVE_FAILPOINT}")
def hit_run_failpoint():
env.pageserver.assert_log_contains(f"at failpoint {BEFORE_RUN_FAILPOINT}")
assert env.pageserver.log_contains(f"at failpoint {BEFORE_RUN_FAILPOINT}")
with concurrent.futures.ThreadPoolExecutor() as executor:
background_200_req = executor.submit(delete_tenant)
@@ -612,12 +612,12 @@ def test_tenant_delete_races_timeline_creation(
Thread(target=timeline_create).start()
def hit_initdb_upload_failpoint():
env.pageserver.assert_log_contains(f"at failpoint {BEFORE_INITDB_UPLOAD_FAILPOINT}")
assert env.pageserver.log_contains(f"at failpoint {BEFORE_INITDB_UPLOAD_FAILPOINT}")
wait_until(100, 0.1, hit_initdb_upload_failpoint)
def creation_connection_timed_out():
env.pageserver.assert_log_contains(
assert env.pageserver.log_contains(
"POST.*/timeline.* request was dropped before completing"
)
@@ -636,7 +636,7 @@ def test_tenant_delete_races_timeline_creation(
Thread(target=tenant_delete).start()
def deletion_arrived():
env.pageserver.assert_log_contains(
assert env.pageserver.log_contains(
f"cfg failpoint: {DELETE_BEFORE_CLEANUP_FAILPOINT} pause"
)
@@ -663,7 +663,7 @@ def test_tenant_delete_races_timeline_creation(
)
# Ensure that creation cancelled and deletion didn't end up in broken state or encountered the leftover temp file
env.pageserver.assert_log_contains(CANCELLED_ERROR)
assert env.pageserver.log_contains(CANCELLED_ERROR)
assert not env.pageserver.log_contains(
".*ERROR.*delete_tenant.*Timelines directory is not empty after all timelines deletion"
)

Some files were not shown because too many files have changed in this diff.