Mirror of https://github.com/neondatabase/neon.git (synced 2026-01-28 15:50:38 +00:00)

Compare commits: release-pr ... release-50 (6 commits)
| Author | SHA1 | Date |
|---|---|---|
| | 1df0f69664 | |
| | 970066a914 | |
| | 1ebd3897c0 | |
| | 6460beffcd | |
| | 6f7f8958db | |
| | 936a00e077 | |
Cargo.lock (generated file, 8 changed lines)
@@ -25,9 +25,9 @@ checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"

[[package]]
name = "ahash"
version = "0.8.9"
version = "0.8.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d713b3834d76b85304d4d525563c1276e2e30dc97cc67bfb4585a4a29fc2c89f"
checksum = "cd7d5a2cecb58716e47d67d5703a249964b14c7be1ec3cad3affc295b2d1c35d"
dependencies = [
"cfg-if",
"const-random",

@@ -1389,9 +1389,9 @@ dependencies = [

[[package]]
name = "crc32c"
version = "0.6.5"
version = "0.6.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "89254598aa9b9fa608de44b3ae54c810f0f06d755e24c50177f1f8f31ff50ce2"
checksum = "3dfea2db42e9927a3845fb268a10a72faed6d416065f77873f05e411457c363e"
dependencies = [
"rustc_version",
]

@@ -230,8 +230,6 @@ postgres=# select * from t;
> cargo neon stop
```

More advanced usages can be found at [Control Plane and Neon Local](./control_plane/README.md).

#### Handling build failures

If you encounter errors during setting up the initial tenant, it's best to stop everything (`cargo neon stop`) and remove the `.neon` directory. Then fix the problems, and start the setup again.
@@ -18,6 +18,8 @@ use futures::future::join_all;
use futures::stream::FuturesUnordered;
use futures::StreamExt;
use postgres::{Client, NoTls};
use tokio;
use tokio_postgres;
use tracing::{debug, error, info, instrument, warn};
use utils::id::{TenantId, TimelineId};
use utils::lsn::Lsn;

@@ -71,7 +71,7 @@ More specifically, here is an example ext_index.json
}
}
*/
use anyhow::Result;
use anyhow::{self, Result};
use anyhow::{bail, Context};
use bytes::Bytes;
use compute_api::spec::RemoteExtSpec;

@@ -13,6 +13,8 @@ use compute_api::responses::{ComputeStatus, ComputeStatusResponse, GenericAPIErr
use anyhow::Result;
use hyper::service::{make_service_fn, service_fn};
use hyper::{Body, Method, Request, Response, Server, StatusCode};
use num_cpus;
use serde_json;
use tokio::task;
use tracing::{error, info, warn};
use tracing_utils::http::OtelName;
@@ -1,26 +0,0 @@
# Control Plane and Neon Local

This crate contains tools to start a Neon development environment locally. This utility can be used with the `cargo neon` command.

## Example: Start with Postgres 16

To create and start a local development environment with Postgres 16, you will need to provide `--pg-version` flag to 3 of the start-up commands.

```shell
cargo neon init --pg-version 16
cargo neon start
cargo neon tenant create --set-default --pg-version 16
cargo neon endpoint create main --pg-version 16
cargo neon endpoint start main
```

## Example: Create Test User and Database

By default, `cargo neon` starts an endpoint with `cloud_admin` and `postgres` database. If you want to have a role and a database similar to what we have on the cloud service, you can do it with the following commands when starting an endpoint.

```shell
cargo neon endpoint create main --pg-version 16 --update-catalog true
cargo neon endpoint start main --create-test-user true
```

The first command creates `neon_superuser` and necessary roles. The second command creates `test` user and `neondb` database. You will see a connection string that connects you to the test user after running the second command.

@@ -1,2 +0,0 @@
ALTER TABLE tenant_shards ALTER generation SET NOT NULL;
ALTER TABLE tenant_shards ALTER generation_pageserver SET NOT NULL;

@@ -1,4 +0,0 @@


ALTER TABLE tenant_shards ALTER generation DROP NOT NULL;
ALTER TABLE tenant_shards ALTER generation_pageserver DROP NOT NULL;
@@ -1,10 +1,9 @@
use crate::reconciler::ReconcileError;
use crate::service::{Service, STARTUP_RECONCILE_TIMEOUT};
use crate::PlacementPolicy;
use hyper::{Body, Request, Response};
use hyper::{StatusCode, Uri};
use pageserver_api::models::{
TenantConfigRequest, TenantCreateRequest, TenantLocationConfigRequest, TenantShardSplitRequest,
TenantCreateRequest, TenantLocationConfigRequest, TenantShardSplitRequest,
TenantTimeTravelRequest, TimelineCreateRequest,
};
use pageserver_api::shard::TenantShardId;

@@ -118,14 +117,9 @@ async fn handle_tenant_create(
check_permissions(&req, Scope::PageServerApi)?;

let create_req = json_request::<TenantCreateRequest>(&mut req).await?;

// TODO: enable specifying this. Using Single as a default helps legacy tests to work (they
// have no expectation of HA).
let placement_policy = PlacementPolicy::Single;

json_response(
StatusCode::CREATED,
service.tenant_create(create_req, placement_policy).await?,
service.tenant_create(create_req).await?,
)
}

@@ -191,27 +185,6 @@ async fn handle_tenant_location_config(
)
}

async fn handle_tenant_config_set(
service: Arc<Service>,
mut req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::PageServerApi)?;

let config_req = json_request::<TenantConfigRequest>(&mut req).await?;

json_response(StatusCode::OK, service.tenant_config_set(config_req).await?)
}

async fn handle_tenant_config_get(
service: Arc<Service>,
req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
check_permissions(&req, Scope::PageServerApi)?;

json_response(StatusCode::OK, service.tenant_config_get(tenant_id)?)
}

async fn handle_tenant_time_travel_remote_storage(
service: Arc<Service>,
mut req: Request<Body>,

@@ -243,15 +216,7 @@ async fn handle_tenant_time_travel_remote_storage(
done_if_after_raw,
)
.await?;
json_response(StatusCode::OK, ())
}

async fn handle_tenant_secondary_download(
service: Arc<Service>,
req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
service.tenant_secondary_download(tenant_id).await?;
json_response(StatusCode::OK, ())
}

@@ -586,21 +551,12 @@ pub fn make_router(
.delete("/v1/tenant/:tenant_id", |r| {
tenant_service_handler(r, handle_tenant_delete)
})
.put("/v1/tenant/config", |r| {
tenant_service_handler(r, handle_tenant_config_set)
})
.get("/v1/tenant/:tenant_id/config", |r| {
tenant_service_handler(r, handle_tenant_config_get)
})
.put("/v1/tenant/:tenant_id/location_config", |r| {
tenant_service_handler(r, handle_tenant_location_config)
})
.put("/v1/tenant/:tenant_id/time_travel_remote_storage", |r| {
tenant_service_handler(r, handle_tenant_time_travel_remote_storage)
})
.post("/v1/tenant/:tenant_id/secondary/download", |r| {
tenant_service_handler(r, handle_tenant_secondary_download)
})
// Timeline operations
.delete("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
tenant_service_handler(r, handle_tenant_timeline_delete)
@@ -13,20 +13,14 @@ mod schema;
pub mod service;
mod tenant_state;

#[derive(Clone, Serialize, Deserialize, Debug, PartialEq, Eq)]
#[derive(Clone, Serialize, Deserialize, Debug)]
enum PlacementPolicy {
/// Cheapest way to attach a tenant: just one pageserver, no secondary
Single,
/// Production-ready way to attach a tenant: one attached pageserver and
/// some number of secondaries.
Double(usize),
/// Create one secondary mode location. This is useful when onboarding
/// a tenant, or for an idle tenant that we might want to bring online quickly.
Secondary,

/// Do not attach to any pageservers. This is appropriate for tenants that
/// have been idle for a long time, where we do not mind some delay in making
/// them available in future.
/// Do not attach to any pageservers
Detached,
}
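Elsewhere in this diff, the controller persists `placement_policy` as a JSON string in a `Varchar` column (`serde_json::to_string(...)`) and parses it back with `serde_json::from_str(...)`, which is why the enum only needs the serde derives shown above. The following is a minimal standalone sketch of that round trip, not code from the repository; it re-declares a local copy of the enum and assumes `serde`/`serde_json` are available as dependencies.

```rust
// Standalone sketch (assumption: local re-declaration of the enum, serde + serde_json deps).
use serde::{Deserialize, Serialize};

#[derive(Clone, Serialize, Deserialize, Debug, PartialEq, Eq)]
enum PlacementPolicy {
    Single,
    Double(usize),
    Secondary,
    Detached,
}

fn main() {
    // Serialize the way the persistence layer does before writing the Varchar column...
    let stored = serde_json::to_string(&PlacementPolicy::Double(1)).unwrap();
    assert_eq!(stored, r#"{"Double":1}"#); // externally tagged serde representation

    // ...and parse it back the way the service does when loading shards at startup.
    let parsed: PlacementPolicy = serde_json::from_str(&stored).unwrap();
    assert_eq!(parsed, PlacementPolicy::Double(1));
}
```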
@@ -9,7 +9,7 @@ use attachment_service::http::make_router;
use attachment_service::metrics::preinitialize_metrics;
use attachment_service::persistence::Persistence;
use attachment_service::service::{Config, Service};
use aws_config::{BehaviorVersion, Region};
use aws_config::{self, BehaviorVersion, Region};
use camino::Utf8PathBuf;
use clap::Parser;
use diesel::Connection;

@@ -79,38 +79,13 @@ impl Secrets {
"neon-storage-controller-control-plane-jwt-token";
const PUBLIC_KEY_SECRET: &'static str = "neon-storage-controller-public-key";

const DATABASE_URL_ENV: &'static str = "DATABASE_URL";
const PAGESERVER_JWT_TOKEN_ENV: &'static str = "PAGESERVER_JWT_TOKEN";
const CONTROL_PLANE_JWT_TOKEN_ENV: &'static str = "CONTROL_PLANE_JWT_TOKEN";
const PUBLIC_KEY_ENV: &'static str = "PUBLIC_KEY";

/// Load secrets from, in order of preference:
/// - CLI args if database URL is provided on the CLI
/// - Environment variables if DATABASE_URL is set.
/// - AWS Secrets Manager secrets
async fn load(args: &Cli) -> anyhow::Result<Self> {
match &args.database_url {
Some(url) => Self::load_cli(url, args),
None => match std::env::var(Self::DATABASE_URL_ENV) {
Ok(database_url) => Self::load_env(database_url),
Err(_) => Self::load_aws_sm().await,
},
None => Self::load_aws_sm().await,
}
}

fn load_env(database_url: String) -> anyhow::Result<Self> {
let public_key = match std::env::var(Self::PUBLIC_KEY_ENV) {
Ok(public_key) => Some(JwtAuth::from_key(public_key).context("Loading public key")?),
Err(_) => None,
};
Ok(Self {
database_url,
public_key,
jwt_token: std::env::var(Self::PAGESERVER_JWT_TOKEN_ENV).ok(),
control_plane_jwt_token: std::env::var(Self::CONTROL_PLANE_JWT_TOKEN_ENV).ok(),
})
}

async fn load_aws_sm() -> anyhow::Result<Self> {
let Ok(region) = std::env::var("AWS_REGION") else {
anyhow::bail!("AWS_REGION is not set, cannot load secrets automatically: either set this, or use CLI args to supply secrets");
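The precedence in the doc comment above (CLI database URL first, then `DATABASE_URL` from the environment, then AWS Secrets Manager) can be condensed into a tiny pure function. This is a hedged sketch of that decision order only, not the crate's code; the URL value and function name are made up for illustration.

```rust
// Sketch of the documented precedence, assuming only the standard library.
fn pick_database_url(cli_url: Option<String>) -> Result<String, &'static str> {
    if let Some(url) = cli_url {
        return Ok(url); // CLI argument wins outright
    }
    match std::env::var("DATABASE_URL") {
        Ok(url) => Ok(url), // environment variable is the second choice
        Err(_) => Err("no local secret source; fall back to AWS Secrets Manager"),
    }
}

fn main() {
    // With a CLI value present, the environment is never consulted.
    assert_eq!(
        pick_database_url(Some("postgres://localhost/storcon".into())),
        Ok("postgres://localhost/storcon".into())
    );
}
```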
@@ -7,10 +7,8 @@ use self::split_state::SplitState;
use camino::Utf8Path;
use camino::Utf8PathBuf;
use diesel::pg::PgConnection;
use diesel::{
Connection, ExpressionMethods, Insertable, QueryDsl, QueryResult, Queryable, RunQueryDsl,
Selectable, SelectableHelper,
};
use diesel::prelude::*;
use diesel::Connection;
use pageserver_api::controller_api::NodeSchedulingPolicy;
use pageserver_api::models::TenantConfig;
use pageserver_api::shard::{ShardCount, ShardNumber, TenantShardId};

@@ -333,15 +331,7 @@ impl Persistence {
shard_number: ShardNumber(tsp.shard_number as u8),
shard_count: ShardCount::new(tsp.shard_count as u8),
};

let Some(g) = tsp.generation else {
// If the generation_pageserver column was non-NULL, then the generation column should also be non-NULL:
// we only set generation_pageserver when setting generation.
return Err(DatabaseError::Logical(
"Generation should always be set after incrementing".to_string(),
));
};
result.insert(tenant_shard_id, Generation::new(g as u32));
result.insert(tenant_shard_id, Generation::new(tsp.generation as u32));
}

Ok(result)

@@ -374,85 +364,7 @@ impl Persistence {
})
.await?;

// Generation is always non-null in the result: if the generation column had been NULL, then we
// should have experienced an SQL Conflict error while executing a query that tries to increment it.
debug_assert!(updated.generation.is_some());
let Some(g) = updated.generation else {
return Err(DatabaseError::Logical(
"Generation should always be set after incrementing".to_string(),
)
.into());
};

Ok(Generation::new(g as u32))
}

/// For use when updating a persistent property of a tenant, such as its config or placement_policy.
///
/// Do not use this for setting generation, unless in the special onboarding code path (/location_config)
/// API: use [`Self::increment_generation`] instead. Setting the generation via this route is a one-time thing
/// that we only do the first time a tenant is set to an attached policy via /location_config.
pub(crate) async fn update_tenant_shard(
&self,
tenant_shard_id: TenantShardId,
input_placement_policy: PlacementPolicy,
input_config: TenantConfig,
input_generation: Option<Generation>,
) -> DatabaseResult<()> {
use crate::schema::tenant_shards::dsl::*;

self.with_conn(move |conn| {
let query = diesel::update(tenant_shards)
.filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
.filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
.filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32));

if let Some(input_generation) = input_generation {
// Update includes generation column
query
.set((
generation.eq(Some(input_generation.into().unwrap() as i32)),
placement_policy
.eq(serde_json::to_string(&input_placement_policy).unwrap()),
config.eq(serde_json::to_string(&input_config).unwrap()),
))
.execute(conn)?;
} else {
// Update does not include generation column
query
.set((
placement_policy
.eq(serde_json::to_string(&input_placement_policy).unwrap()),
config.eq(serde_json::to_string(&input_config).unwrap()),
))
.execute(conn)?;
}

Ok(())
})
.await?;

Ok(())
}

pub(crate) async fn update_tenant_config(
&self,
input_tenant_id: TenantId,
input_config: TenantConfig,
) -> DatabaseResult<()> {
use crate::schema::tenant_shards::dsl::*;

self.with_conn(move |conn| {
diesel::update(tenant_shards)
.filter(tenant_id.eq(input_tenant_id.to_string()))
.set((config.eq(serde_json::to_string(&input_config).unwrap()),))
.execute(conn)?;

Ok(())
})
.await?;

Ok(())
Ok(Generation::new(updated.generation as u32))
}

pub(crate) async fn detach(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> {
@@ -463,7 +375,7 @@ impl Persistence {
.filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
.filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32))
.set((
generation_pageserver.eq(Option::<i64>::None),
generation_pageserver.eq(i64::MAX),
placement_policy.eq(serde_json::to_string(&PlacementPolicy::Detached).unwrap()),
))
.execute(conn)?;
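One side of this change represents "no attached pageserver" with a NULL-able column (`Option<i64>` / `None`), while the other uses `i64::MAX` as an in-band sentinel, which is also what the `!= i64::MAX` checks later in the diff rely on. The snippet below is my own illustration of that sentinel convention, not code from the repository.

```rust
// Sketch (assumption: sentinel convention only, simplified types).
fn pageserver_from_column(value: i64) -> Option<u64> {
    if value == i64::MAX {
        None // sentinel: no pageserver currently holds the latest generation
    } else {
        Some(value as u64)
    }
}

fn main() {
    assert_eq!(pageserver_from_column(i64::MAX), None);
    assert_eq!(pageserver_from_column(7), Some(7));
}
```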
@@ -589,15 +501,12 @@ pub(crate) struct TenantShardPersistence {
pub(crate) shard_stripe_size: i32,

// Latest generation number: next time we attach, increment this
// and use the incremented number when attaching.
//
// Generation is only None when first onboarding a tenant, where it may
// be in PlacementPolicy::Secondary and therefore have no valid generation state.
pub(crate) generation: Option<i32>,
// and use the incremented number when attaching
pub(crate) generation: i32,

// Currently attached pageserver
#[serde(rename = "pageserver")]
pub(crate) generation_pageserver: Option<i64>,
pub(crate) generation_pageserver: i64,

#[serde(default)]
pub(crate) placement_policy: String,

@@ -26,7 +26,7 @@ pub(super) struct Reconciler {
/// of a tenant's state from when we spawned a reconcile task.
pub(super) tenant_shard_id: TenantShardId,
pub(crate) shard: ShardIdentity,
pub(crate) generation: Option<Generation>,
pub(crate) generation: Generation,
pub(crate) intent: TargetState,
pub(crate) config: TenantConfig,
pub(crate) observed: ObservedState,

@@ -312,7 +312,7 @@ impl Reconciler {
&self.shard,
&self.config,
LocationConfigMode::AttachedStale,
self.generation,
Some(self.generation),
None,
);
self.location_config(origin_ps_id, stale_conf, Some(Duration::from_secs(10)))

@@ -335,17 +335,16 @@ impl Reconciler {
}

// Increment generation before attaching to new pageserver
self.generation = Some(
self.persistence
.increment_generation(self.tenant_shard_id, dest_ps_id)
.await?,
);
self.generation = self
.persistence
.increment_generation(self.tenant_shard_id, dest_ps_id)
.await?;

let dest_conf = build_location_config(
&self.shard,
&self.config,
LocationConfigMode::AttachedMulti,
self.generation,
Some(self.generation),
None,
);

@@ -402,7 +401,7 @@ impl Reconciler {
&self.shard,
&self.config,
LocationConfigMode::AttachedSingle,
self.generation,
Some(self.generation),
None,
);
self.location_config(dest_ps_id, dest_final_conf.clone(), None)
@@ -434,62 +433,22 @@ impl Reconciler {

// If the attached pageserver is not attached, do so now.
if let Some(node_id) = self.intent.attached {
// If we are in an attached policy, then generation must have been set (null generations
// are only present when a tenant is initially loaded with a secondary policy)
debug_assert!(self.generation.is_some());
let Some(generation) = self.generation else {
return Err(ReconcileError::Other(anyhow::anyhow!(
"Attempted to attach with NULL generation"
)));
};

let mut wanted_conf = attached_location_conf(generation, &self.shard, &self.config);
let mut wanted_conf =
attached_location_conf(self.generation, &self.shard, &self.config);
match self.observed.locations.get(&node_id) {
Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {
// Nothing to do
tracing::info!(%node_id, "Observed configuration already correct.")
}
observed => {
_ => {
// In all cases other than a matching observed configuration, we will
// reconcile this location. This includes locations with different configurations, as well
// as locations with unknown (None) observed state.

// The general case is to increment the generation. However, there are cases
// where this is not necessary:
// - if we are only updating the TenantConf part of the location
// - if we are only changing the attachment mode (e.g. going to attachedmulti or attachedstale)
// and the location was already in the correct generation
let increment_generation = match observed {
None => true,
Some(ObservedStateLocation { conf: None }) => true,
Some(ObservedStateLocation {
conf: Some(observed),
}) => {
let generations_match = observed.generation == wanted_conf.generation;

use LocationConfigMode::*;
let mode_transition_requires_gen_inc =
match (observed.mode, wanted_conf.mode) {
// Usually the short-lived attachment modes (multi and stale) are only used
// in the case of [`Self::live_migrate`], but it is simple to handle them correctly
// here too. Locations are allowed to go Single->Stale and Multi->Single within the same generation.
(AttachedSingle, AttachedStale) => false,
(AttachedMulti, AttachedSingle) => false,
(lhs, rhs) => lhs != rhs,
};

!generations_match || mode_transition_requires_gen_inc
}
};

if increment_generation {
let generation = self
.persistence
.increment_generation(self.tenant_shard_id, node_id)
.await?;
self.generation = Some(generation);
wanted_conf.generation = generation.into();
}
self.generation = self
.persistence
.increment_generation(self.tenant_shard_id, node_id)
.await?;
wanted_conf.generation = self.generation.into();
tracing::info!(%node_id, "Observed configuration requires update.");
self.location_config(node_id, wanted_conf, None).await?;
self.compute_notify().await?;
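The decision described in the comments of the hunk above (increment the generation unless the observed location already matches the wanted generation and the mode change is one of the transitions allowed within a generation) can be stated as a small pure function. This is a standalone sketch with simplified types, not the crate's own code.

```rust
// Sketch of the rule only; `Mode` and the tuple types are simplifications.
#[derive(Clone, Copy, PartialEq, Debug)]
enum Mode {
    AttachedSingle,
    AttachedMulti,
    AttachedStale,
}

fn needs_generation_increment(observed: Option<(u32, Mode)>, wanted: (u32, Mode)) -> bool {
    match observed {
        // Unknown observed state: increment to be safe.
        None => true,
        Some((observed_gen, observed_mode)) => {
            let generations_match = observed_gen == wanted.0;
            use Mode::*;
            // Single->Stale and Multi->Single may happen within the same generation.
            let mode_transition_requires_gen_inc = match (observed_mode, wanted.1) {
                (AttachedSingle, AttachedStale) => false,
                (AttachedMulti, AttachedSingle) => false,
                (lhs, rhs) => lhs != rhs,
            };
            !generations_match || mode_transition_requires_gen_inc
        }
    }
}

fn main() {
    // Same generation, Single -> Stale: no new generation needed.
    assert!(!needs_generation_increment(
        Some((5, Mode::AttachedSingle)),
        (5, Mode::AttachedStale)
    ));
    // Unknown observed state always increments.
    assert!(needs_generation_increment(None, (5, Mode::AttachedSingle)));
}
```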
@@ -284,6 +284,7 @@ pub(crate) mod test_utils {
#[cfg(test)]
mod tests {
use super::*;
use utils::id::NodeId;

use crate::tenant_state::IntentState;
#[test]

@@ -17,8 +17,8 @@ diesel::table! {
shard_number -> Int4,
shard_count -> Int4,
shard_stripe_size -> Int4,
generation -> Nullable<Int4>,
generation_pageserver -> Nullable<Int8>,
generation -> Int4,
generation_pageserver -> Int8,
placement_policy -> Varchar,
splitting -> Int2,
config -> Text,

@@ -14,13 +14,10 @@ use control_plane::attachment_service::{
use diesel::result::DatabaseErrorKind;
use futures::{stream::FuturesUnordered, StreamExt};
use hyper::StatusCode;
use pageserver_api::{
controller_api::{
NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, NodeSchedulingPolicy,
TenantCreateResponse, TenantCreateResponseShard, TenantLocateResponse,
TenantLocateResponseShard, TenantShardMigrateRequest, TenantShardMigrateResponse,
},
models::TenantConfigRequest,
use pageserver_api::controller_api::{
NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, NodeSchedulingPolicy,
TenantCreateResponse, TenantCreateResponseShard, TenantLocateResponse,
TenantLocateResponseShard, TenantShardMigrateRequest, TenantShardMigrateResponse,
};
use pageserver_api::{
models::{
@@ -68,11 +65,6 @@ const SHORT_RECONCILE_TIMEOUT: Duration = Duration::from_secs(5);
// some data in it.
const RECONCILE_TIMEOUT: Duration = Duration::from_secs(30);

// If we receive a call using Secondary mode initially, it will omit generation. We will initialize
// tenant shards into this generation, and as long as it remains in this generation, we will accept
// input generation from future requests as authoritative.
const INITIAL_GENERATION: Generation = Generation::new(0);

/// How long [`Service::startup_reconcile`] is allowed to take before it should give
/// up on unresponsive pageservers and proceed.
pub(crate) const STARTUP_RECONCILE_TIMEOUT: Duration = Duration::from_secs(30);

@@ -175,21 +167,6 @@ impl From<ReconcileWaitError> for ApiError {
}
}

#[allow(clippy::large_enum_variant)]
enum TenantCreateOrUpdate {
Create((TenantCreateRequest, PlacementPolicy)),
Update(Vec<ShardUpdate>),
}

struct ShardUpdate {
tenant_shard_id: TenantShardId,
placement_policy: PlacementPolicy,
tenant_config: TenantConfig,

/// If this is None, generation is not updated.
generation: Option<Generation>,
}

impl Service {
pub fn get_config(&self) -> &Config {
&self.config

@@ -594,9 +571,6 @@ impl Service {
// the shard so that a future [`TenantState::maybe_reconcile`] will try again.
tenant.pending_compute_notification = result.pending_compute_notification;

// Let the TenantState know it is idle.
tenant.reconcile_complete(result.sequence);

match result.result {
Ok(()) => {
for (node_id, loc) in &result.observed.locations {

@@ -687,8 +661,8 @@ impl Service {
// after when pageservers start up and register.
let mut node_ids = HashSet::new();
for tsp in &tenant_shard_persistence {
if let Some(node_id) = tsp.generation_pageserver {
node_ids.insert(node_id);
if tsp.generation_pageserver != i64::MAX {
node_ids.insert(tsp.generation_pageserver);
}
}
for node_id in node_ids {
@@ -725,15 +699,18 @@ impl Service {
// We will populate intent properly later in [`Self::startup_reconcile`], initially populate
// it with what we can infer: the node for which a generation was most recently issued.
let mut intent = IntentState::new();
if let Some(generation_pageserver) = tsp.generation_pageserver {
intent.set_attached(&mut scheduler, Some(NodeId(generation_pageserver as u64)));
if tsp.generation_pageserver != i64::MAX {
intent.set_attached(
&mut scheduler,
Some(NodeId(tsp.generation_pageserver as u64)),
);
}

let new_tenant = TenantState {
tenant_shard_id,
shard: shard_identity,
sequence: Sequence::initial(),
generation: tsp.generation.map(|g| Generation::new(g as u32)),
generation: Generation::new(tsp.generation as u32),
policy: serde_json::from_str(&tsp.placement_policy).unwrap(),
intent,
observed: ObservedState::new(),

@@ -813,8 +790,8 @@ impl Service {
shard_number: attach_req.tenant_shard_id.shard_number.0 as i32,
shard_count: attach_req.tenant_shard_id.shard_count.literal() as i32,
shard_stripe_size: 0,
generation: Some(0),
generation_pageserver: None,
generation: 0,
generation_pageserver: i64::MAX,
placement_policy: serde_json::to_string(&PlacementPolicy::default()).unwrap(),
config: serde_json::to_string(&TenantConfig::default()).unwrap(),
splitting: SplitState::default(),

@@ -869,7 +846,7 @@ impl Service {
.expect("Checked for existence above");

if let Some(new_generation) = new_generation {
tenant_state.generation = Some(new_generation);
tenant_state.generation = new_generation;
} else {
// This is a detach notification. We must update placement policy to avoid re-attaching
// during background scheduling/reconciliation, or during attachment service restart.

@@ -919,7 +896,7 @@ impl Service {
node_id,
ObservedStateLocation {
conf: Some(attached_location_conf(
tenant_state.generation.unwrap(),
tenant_state.generation,
&tenant_state.shard,
&tenant_state.config,
)),

@@ -933,7 +910,7 @@ impl Service {
Ok(AttachHookResponse {
gen: attach_req
.node_id
.map(|_| tenant_state.generation.expect("Test hook, not used on tenants that are mid-onboarding with a NULL generation").into().unwrap()),
.map(|_| tenant_state.generation.into().unwrap()),
})
}

@@ -946,7 +923,7 @@ impl Service {
attachment: tenant_state.and_then(|s| {
s.intent
.get_attached()
.map(|ps| (s.generation.expect("Test hook, not used on tenants that are mid-onboarding with a NULL generation").into().unwrap(), ps))
.map(|ps| (s.generation.into().unwrap(), ps))
}),
}
}
@@ -996,17 +973,7 @@ impl Service {
continue;
};

// If [`Persistence::re_attach`] selected this shard, it must have already
// had a generation set.
debug_assert!(shard_state.generation.is_some());
let Some(old_gen) = shard_state.generation else {
// Should never happen: would only return incremented generation
// for a tenant that already had a non-null generation.
return Err(ApiError::InternalServerError(anyhow::anyhow!(
"Generation must be set while re-attaching"
)));
};
shard_state.generation = Some(std::cmp::max(old_gen, new_gen));
shard_state.generation = std::cmp::max(shard_state.generation, new_gen);
if let Some(observed) = shard_state
.observed
.locations

@@ -1036,7 +1003,7 @@ impl Service {

for req_tenant in validate_req.tenants {
if let Some(tenant_state) = locked.tenants.get(&req_tenant.id) {
let valid = tenant_state.generation == Some(Generation::new(req_tenant.gen));
let valid = tenant_state.generation == Generation::new(req_tenant.gen);
tracing::info!(
"handle_validate: {}(gen {}): valid={valid} (latest {:?})",
req_tenant.id,

@@ -1063,9 +1030,8 @@ impl Service {
pub(crate) async fn tenant_create(
&self,
create_req: TenantCreateRequest,
placement_policy: PlacementPolicy,
) -> Result<TenantCreateResponse, ApiError> {
let (response, waiters) = self.do_tenant_create(create_req, placement_policy).await?;
let (response, waiters) = self.do_tenant_create(create_req).await?;

self.await_waiters(waiters, SHORT_RECONCILE_TIMEOUT).await?;
Ok(response)

@@ -1074,7 +1040,6 @@ impl Service {
pub(crate) async fn do_tenant_create(
&self,
create_req: TenantCreateRequest,
placement_policy: PlacementPolicy,
) -> Result<(TenantCreateResponse, Vec<ReconcilerWaiter>), ApiError> {
// This service expects to handle sharding itself: it is an error to try and directly create
// a particular shard here.
@@ -1100,27 +1065,9 @@ impl Service {
})
.collect::<Vec<_>>();

// If the caller specifies a None generation, it means "start from default". This is different
// to [`Self::tenant_location_config`], where a None generation is used to represent
// an incompletely-onboarded tenant.
let initial_generation = if matches!(placement_policy, PlacementPolicy::Secondary) {
tracing::info!(
"tenant_create: secondary mode, generation is_some={}",
create_req.generation.is_some()
);
create_req.generation.map(Generation::new)
} else {
tracing::info!(
"tenant_create: not secondary mode, generation is_some={}",
create_req.generation.is_some()
);
Some(
create_req
.generation
.map(Generation::new)
.unwrap_or(INITIAL_GENERATION),
)
};
// TODO: enable specifying this. Using Single as a default helps legacy tests to work (they
// have no expectation of HA).
let placement_policy: PlacementPolicy = PlacementPolicy::Single;

// Ordering: we persist tenant shards before creating them on the pageserver. This enables a caller
// to clean up after themselves by issuing a tenant deletion if something goes wrong and we restart

@@ -1132,10 +1079,8 @@ impl Service {
shard_number: tenant_shard_id.shard_number.0 as i32,
shard_count: tenant_shard_id.shard_count.literal() as i32,
shard_stripe_size: create_req.shard_parameters.stripe_size.0 as i32,
generation: initial_generation.map(|g| g.into().unwrap() as i32),
// The pageserver is not known until scheduling happens: we will set this column when
// incrementing the generation the first time we attach to a pageserver.
generation_pageserver: None,
generation: create_req.generation.map(|g| g as i32).unwrap_or(0),
generation_pageserver: i64::MAX,
placement_policy: serde_json::to_string(&placement_policy).unwrap(),
config: serde_json::to_string(&create_req.config).unwrap(),
splitting: SplitState::default(),

@@ -1175,17 +1120,15 @@ impl Service {
))
})?;

if let Some(node_id) = entry.get().intent.get_attached() {
let generation = entry
response_shards.push(TenantCreateResponseShard {
shard_id: tenant_shard_id,
node_id: entry
.get()
.generation
.expect("Generation is set when in attached mode");
response_shards.push(TenantCreateResponseShard {
shard_id: tenant_shard_id,
node_id: *node_id,
generation: generation.into().unwrap(),
});
}
.intent
.get_attached()
.expect("We just set pageserver if it was None"),
generation: entry.get().generation.into().unwrap(),
});

continue;
}

@@ -1199,7 +1142,9 @@ impl Service {
placement_policy.clone(),
);

state.generation = initial_generation;
if let Some(create_gen) = create_req.generation {
state.generation = Generation::new(create_gen);
}
state.config = create_req.config.clone();

state.schedule(scheduler).map_err(|e| {

@@ -1208,18 +1153,14 @@ impl Service {
))
})?;

// Only include shards in result if we are attaching: the purpose
// of the response is to tell the caller where the shards are attached.
if let Some(node_id) = state.intent.get_attached() {
let generation = state
.generation
.expect("Generation is set when in attached mode");
response_shards.push(TenantCreateResponseShard {
shard_id: tenant_shard_id,
node_id: *node_id,
generation: generation.into().unwrap(),
});
}
response_shards.push(TenantCreateResponseShard {
shard_id: tenant_shard_id,
node_id: state
.intent
.get_attached()
.expect("We just set pageserver if it was None"),
generation: state.generation.into().unwrap(),
});
entry.insert(state)
}
};
@@ -1273,114 +1214,12 @@ impl Service {
Ok(())
}

/// Part of [`Self::tenant_location_config`]: dissect an incoming location config request,
/// and transform it into either a tenant creation or a series of shard updates.
fn tenant_location_config_prepare(
&self,
tenant_id: TenantId,
req: TenantLocationConfigRequest,
) -> TenantCreateOrUpdate {
let mut updates = Vec::new();
let mut locked = self.inner.write().unwrap();
let (nodes, tenants, _scheduler) = locked.parts_mut();

// Use location config mode as an indicator of policy.
let placement_policy = match req.config.mode {
LocationConfigMode::Detached => PlacementPolicy::Detached,
LocationConfigMode::Secondary => PlacementPolicy::Secondary,
LocationConfigMode::AttachedMulti
| LocationConfigMode::AttachedSingle
| LocationConfigMode::AttachedStale => {
if nodes.len() > 1 {
PlacementPolicy::Double(1)
} else {
// Convenience for dev/test: if we just have one pageserver, import
// tenants into Single mode so that scheduling will succeed.
PlacementPolicy::Single
}
}
};

let mut create = true;
for (shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) {
// Saw an existing shard: this is not a creation
create = false;

// Shards may have initially been created by a Secondary request, where we
// would have left generation as None.
//
// We only update generation the first time we see an attached-mode request,
// and if there is no existing generation set. The caller is responsible for
// ensuring that no non-storage-controller pageserver ever uses a higher
// generation than they passed in here.
use LocationConfigMode::*;
let set_generation = match req.config.mode {
AttachedMulti | AttachedSingle | AttachedStale if shard.generation.is_none() => {
req.config.generation.map(Generation::new)
}
_ => None,
};

if shard.policy != placement_policy
|| shard.config != req.config.tenant_conf
|| set_generation.is_some()
{
updates.push(ShardUpdate {
tenant_shard_id: *shard_id,
placement_policy: placement_policy.clone(),
tenant_config: req.config.tenant_conf.clone(),
generation: set_generation,
});
}
}

if create {
use LocationConfigMode::*;
let generation = match req.config.mode {
AttachedMulti | AttachedSingle | AttachedStale => req.config.generation,
// If a caller provided a generation in a non-attached request, ignore it
// and leave our generation as None: this enables a subsequent update to set
// the generation when setting an attached mode for the first time.
_ => None,
};

TenantCreateOrUpdate::Create(
// Synthesize a creation request
(
TenantCreateRequest {
new_tenant_id: TenantShardId::unsharded(tenant_id),
generation,
shard_parameters: ShardParameters {
// Must preserve the incoming shard_count to distinguish unsharded (0)
// from single-sharded (1): this distinction appears in the S3 keys of the tenant.
count: req.tenant_id.shard_count,
// We only import un-sharded or single-sharded tenants, so stripe
// size can be made up arbitrarily here.
stripe_size: ShardParameters::DEFAULT_STRIPE_SIZE,
},
config: req.config.tenant_conf,
},
placement_policy,
),
)
} else {
TenantCreateOrUpdate::Update(updates)
}
}

/// This API is used by the cloud control plane to migrate unsharded tenants that it created
/// directly with pageservers into this service.
///
/// Cloud control plane MUST NOT continue issuing GENERATION NUMBERS for this tenant once it
/// has attempted to call this API. Failure to oblige to this rule may lead to S3 corruption.
/// Think of the first attempt to call this API as a transfer of absolute authority over the
/// tenant's source of generation numbers.
///
/// The mode in this request provides coarse-grained control of tenants:
/// This API is used by the cloud control plane to do coarse-grained control of tenants:
/// - Call with mode Attached* to upsert the tenant.
/// - Call with mode Secondary to either onboard a tenant without attaching it, or
/// to set an existing tenant to PolicyMode::Secondary
/// - Call with mode Detached to switch to PolicyMode::Detached
///
/// In future, calling with mode Secondary may switch to a detach-lite mode in which a tenant only has
/// secondary locations.
pub(crate) async fn tenant_location_config(
&self,
tenant_id: TenantId,
@@ -1392,96 +1231,131 @@ impl Service {
)));
}

// First check if this is a creation or an update
let create_or_update = self.tenant_location_config_prepare(tenant_id, req);

let mut waiters = Vec::new();
let mut result = TenantLocationConfigResponse { shards: Vec::new() };
let waiters = match create_or_update {
TenantCreateOrUpdate::Create((create_req, placement_policy)) => {
let (create_resp, waiters) =
self.do_tenant_create(create_req, placement_policy).await?;
result.shards = create_resp
.shards
.into_iter()
.map(|s| TenantShardLocation {
node_id: s.node_id,
shard_id: s.shard_id,
})
.collect();
waiters
}
TenantCreateOrUpdate::Update(updates) => {
// Persist updates
// Ordering: write to the database before applying changes in-memory, so that
// we will not appear to time-travel backwards on a restart.
for ShardUpdate {
tenant_shard_id,
placement_policy,
tenant_config,
generation,
} in &updates
{
self.persistence
.update_tenant_shard(
*tenant_shard_id,
placement_policy.clone(),
tenant_config.clone(),
*generation,
)
.await?;
}
let maybe_create = {
let mut locked = self.inner.write().unwrap();
let result_tx = locked.result_tx.clone();
let compute_hook = locked.compute_hook.clone();
let (nodes, tenants, scheduler) = locked.parts_mut();

// Apply updates in-memory
let mut waiters = Vec::new();
{
let mut locked = self.inner.write().unwrap();
let result_tx = locked.result_tx.clone();
let compute_hook = locked.compute_hook.clone();
let (nodes, tenants, scheduler) = locked.parts_mut();
// Maybe we have existing shards
let mut create = true;
for (shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) {
// Saw an existing shard: this is not a creation
create = false;

for ShardUpdate {
tenant_shard_id,
placement_policy,
tenant_config,
generation: update_generation,
} in updates
{
let Some(shard) = tenants.get_mut(&tenant_shard_id) else {
tracing::warn!("Shard {tenant_shard_id} removed while updating");
continue;
};
// Note that for existing tenants we do _not_ respect the generation in the request: this is likely
// to be stale. Once a tenant is created in this service, our view of generation is authoritative, and
// callers' generations may be ignored. This represents a one-way migration of tenants from the outer
// cloud control plane into this service.

shard.policy = placement_policy;
shard.config = tenant_config;
if let Some(generation) = update_generation {
shard.generation = Some(generation);
}

shard.schedule(scheduler)?;

let maybe_waiter = shard.maybe_reconcile(
result_tx.clone(),
nodes,
&compute_hook,
&self.config,
&self.persistence,
&self.gate,
&self.cancel,
);
if let Some(waiter) = maybe_waiter {
waiters.push(waiter);
}

if let Some(node_id) = shard.intent.get_attached() {
result.shards.push(TenantShardLocation {
shard_id: tenant_shard_id,
node_id: *node_id,
})
// Use location config mode as an indicator of policy: if they ask for
// attached we go to default HA attached mode. If they ask for secondary
// we go to secondary-only mode. If they ask for detached we detach.
match req.config.mode {
LocationConfigMode::Detached => {
shard.policy = PlacementPolicy::Detached;
}
LocationConfigMode::Secondary => {
// TODO: implement secondary-only mode.
todo!();
}
LocationConfigMode::AttachedMulti
| LocationConfigMode::AttachedSingle
| LocationConfigMode::AttachedStale => {
// TODO: persistence for changes in policy
if nodes.len() > 1 {
shard.policy = PlacementPolicy::Double(1)
} else {
// Convenience for dev/test: if we just have one pageserver, import
// tenants into Single mode so that scheduling will succeed.
shard.policy = PlacementPolicy::Single
}
}
}
waiters

shard.schedule(scheduler)?;

let maybe_waiter = shard.maybe_reconcile(
result_tx.clone(),
nodes,
&compute_hook,
&self.config,
&self.persistence,
&self.gate,
&self.cancel,
);
if let Some(waiter) = maybe_waiter {
waiters.push(waiter);
}

if let Some(node_id) = shard.intent.get_attached() {
result.shards.push(TenantShardLocation {
shard_id: *shard_id,
node_id: *node_id,
})
}
}

if create {
// Validate request mode
match req.config.mode {
LocationConfigMode::Detached | LocationConfigMode::Secondary => {
// When using this API to onboard an existing tenant to this service, it must start in
// an attached state, because we need the request to come with a generation
return Err(ApiError::BadRequest(anyhow::anyhow!(
"Imported tenant must be in attached mode"
)));
}

LocationConfigMode::AttachedMulti
| LocationConfigMode::AttachedSingle
| LocationConfigMode::AttachedStale => {
// Pass
}
}

// Validate request generation
let Some(generation) = req.config.generation else {
// We can only import attached tenants, because we need the request to come with a generation
return Err(ApiError::BadRequest(anyhow::anyhow!(
"Generation is mandatory when importing tenant"
)));
};

// Synthesize a creation request
Some(TenantCreateRequest {
new_tenant_id: TenantShardId::unsharded(tenant_id),
generation: Some(generation),
shard_parameters: ShardParameters {
// Must preserve the incoming shard_count to distinguish unsharded (0)
// from single-sharded (1): this distinction appears in the S3 keys of the tenant.
count: req.tenant_id.shard_count,
// We only import un-sharded or single-sharded tenants, so stripe
// size can be made up arbitrarily here.
stripe_size: ShardParameters::DEFAULT_STRIPE_SIZE,
},
config: req.config.tenant_conf,
})
} else {
None
}
};

let waiters = if let Some(create_req) = maybe_create {
let (create_resp, waiters) = self.do_tenant_create(create_req).await?;
result.shards = create_resp
.shards
.into_iter()
.map(|s| TenantShardLocation {
node_id: s.node_id,
shard_id: s.shard_id,
})
.collect();
waiters
} else {
waiters
};

if let Err(e) = self.await_waiters(waiters, SHORT_RECONCILE_TIMEOUT).await {
@@ -1501,91 +1375,6 @@ impl Service {
Ok(result)
}

pub(crate) async fn tenant_config_set(&self, req: TenantConfigRequest) -> Result<(), ApiError> {
let tenant_id = req.tenant_id;
let config = req.config;

self.persistence
.update_tenant_config(req.tenant_id, config.clone())
.await?;

let waiters = {
let mut waiters = Vec::new();
let mut locked = self.inner.write().unwrap();
let result_tx = locked.result_tx.clone();
let compute_hook = locked.compute_hook.clone();
let (nodes, tenants, _scheduler) = locked.parts_mut();
for (_shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) {
shard.config = config.clone();
if let Some(waiter) = shard.maybe_reconcile(
result_tx.clone(),
nodes,
&compute_hook,
&self.config,
&self.persistence,
&self.gate,
&self.cancel,
) {
waiters.push(waiter);
}
}
waiters
};

if let Err(e) = self.await_waiters(waiters, SHORT_RECONCILE_TIMEOUT).await {
// Treat this as success because we have stored the configuration. If e.g.
// a node was unavailable at this time, it should not stop us accepting a
// configuration change.
tracing::warn!(%tenant_id, "Accepted configuration update but reconciliation failed: {e}");
}

Ok(())
}

pub(crate) fn tenant_config_get(
&self,
tenant_id: TenantId,
) -> Result<HashMap<&str, serde_json::Value>, ApiError> {
let config = {
let locked = self.inner.read().unwrap();

match locked
.tenants
.range(TenantShardId::tenant_range(tenant_id))
.next()
{
Some((_tenant_shard_id, shard)) => shard.config.clone(),
None => {
return Err(ApiError::NotFound(
anyhow::anyhow!("Tenant not found").into(),
))
}
}
};

// Unlike the pageserver, we do not have a set of global defaults: the config is
// entirely per-tenant. Therefore the distinction between `tenant_specific_overrides`
// and `effective_config` in the response is meaningless, but we retain that syntax
// in order to remain compatible with the pageserver API.

let response = HashMap::from([
(
"tenant_specific_overrides",
serde_json::to_value(&config)
.context("serializing tenant specific overrides")
.map_err(ApiError::InternalServerError)?,
),
(
"effective_config",
serde_json::to_value(&config)
.context("serializing effective config")
.map_err(ApiError::InternalServerError)?,
),
]);

Ok(response)
}

pub(crate) async fn tenant_time_travel_remote_storage(
&self,
time_travel_req: &TenantTimeTravelRequest,

@@ -1671,60 +1460,6 @@ impl Service {
})?;
}
}
Ok(())
}

pub(crate) async fn tenant_secondary_download(
&self,
tenant_id: TenantId,
) -> Result<(), ApiError> {
// Acquire lock and yield the collection of shard-node tuples which we will send requests onward to
let targets = {
let locked = self.inner.read().unwrap();
let mut targets = Vec::new();

for (tenant_shard_id, shard) in
locked.tenants.range(TenantShardId::tenant_range(tenant_id))
{
for node_id in shard.intent.get_secondary() {
let node = locked
.nodes
.get(node_id)
.expect("Pageservers may not be deleted while referenced");

targets.push((*tenant_shard_id, node.clone()));
}
}
targets
};

// TODO: this API, and the underlying pageserver API, should take a timeout argument so that for long running
// downloads, they can return a clean 202 response instead of the HTTP client timing out.

// Issue concurrent requests to all shards' locations
let mut futs = FuturesUnordered::new();
for (tenant_shard_id, node) in targets {
let client = mgmt_api::Client::new(node.base_url(), self.config.jwt_token.as_deref());
futs.push(async move {
let result = client.tenant_secondary_download(tenant_shard_id).await;
(result, node)
})
}

// Handle any errors returned by pageservers. This includes cases like this request racing with
// a scheduling operation, such that the tenant shard we're calling doesn't exist on that pageserver any more, as
// well as more general cases like 503s, 500s, or timeouts.
while let Some((result, node)) = futs.next().await {
let Err(e) = result else { continue };

// Secondary downloads are always advisory: if something fails, we nevertheless report success, so that whoever
// is calling us will proceed with whatever migration they're doing, albeit with a slightly less warm cache
// than they had hoped for.
tracing::warn!(
"Ignoring tenant secondary download error from pageserver {}: {e}",
node.id,
);
}

Ok(())
}
@@ -2304,8 +2039,8 @@ impl Service {
// Note: this generation is a placeholder, [`Persistence::begin_shard_split`] will
// populate the correct generation as part of its transaction, to protect us
// against racing with changes in the state of the parent.
generation: None,
generation_pageserver: Some(target.node.id.0 as i64),
generation: 0,
generation_pageserver: target.node.id.0 as i64,
placement_policy: serde_json::to_string(&policy).unwrap(),
// TODO: get the config out of the map
config: serde_json::to_string(&TenantConfig::default()).unwrap(),

@@ -2426,8 +2161,7 @@ impl Service {
.expect("It was present, we just split it");
let old_attached = old_state.intent.get_attached().unwrap();
old_state.intent.clear(scheduler);
let generation = old_state.generation.expect("Shard must have been attached");
(old_attached, generation, old_state.config.clone())
(old_attached, old_state.generation, old_state.config.clone())
};

for child in child_ids {

@@ -2448,7 +2182,7 @@ impl Service {
child_state.observed = ObservedState {
locations: child_observed,
};
child_state.generation = Some(generation);
child_state.generation = generation;
child_state.config = config.clone();

// The child's TenantState::splitting is intentionally left at the default value of Idle,

@@ -2513,7 +2247,6 @@ impl Service {
match shard.policy {
PlacementPolicy::Single => {
shard.intent.clear_secondary(scheduler);
shard.intent.set_attached(scheduler, Some(migrate_req.node_id));
}
PlacementPolicy::Double(_n) => {
// If our new attached node was a secondary, it no longer should be.

@@ -2523,12 +2256,6 @@ impl Service {
if let Some(old_attached) = old_attached {
shard.intent.push_secondary(scheduler, old_attached);
}

shard.intent.set_attached(scheduler, Some(migrate_req.node_id));
}
PlacementPolicy::Secondary => {
shard.intent.clear(scheduler);
shard.intent.push_secondary(scheduler, migrate_req.node_id);
}
PlacementPolicy::Detached => {
return Err(ApiError::BadRequest(anyhow::anyhow!(

@@ -2536,6 +2263,9 @@ impl Service {
)))
}
}
shard
.intent
.set_attached(scheduler, Some(migrate_req.node_id));

tracing::info!("Migrating: new intent {:?}", shard.intent);
shard.sequence = shard.sequence.next();

@@ -2863,7 +2593,7 @@ impl Service {
observed_loc.conf = None;
}

if tenant_state.intent.demote_attached(config_req.node_id) {
if tenant_state.intent.notify_offline(config_req.node_id) {
tenant_state.sequence = tenant_state.sequence.next();
match tenant_state.schedule(scheduler) {
Err(e) => {

@@ -2930,9 +2660,6 @@ impl Service {
/// Helper for methods that will try and call pageserver APIs for
/// a tenant, such as timeline CRUD: they cannot proceed unless the tenant
/// is attached somewhere.
///
/// TODO: this doesn't actually ensure attached unless the PlacementPolicy is
/// an attached policy. We should error out if it isn't.
fn ensure_attached_schedule(
&self,
mut locked: std::sync::RwLockWriteGuard<'_, ServiceState>,
@@ -53,11 +53,8 @@ pub(crate) struct TenantState {
pub(crate) sequence: Sequence,

// Latest generation number: next time we attach, increment this
// and use the incremented number when attaching.
//
// None represents an incompletely onboarded tenant via the [`Service::location_config`]
// API, where this tenant may only run in PlacementPolicy::Secondary.
pub(crate) generation: Option<Generation>,
// and use the incremented number when attaching
pub(crate) generation: Generation,

// High level description of how the tenant should be set up. Provided
// externally.

@@ -184,13 +181,6 @@ impl IntentState {
}
}

/// Remove the last secondary node from the list of secondaries
pub(crate) fn pop_secondary(&mut self, scheduler: &mut Scheduler) {
if let Some(node_id) = self.secondary.pop() {
scheduler.node_dec_ref(node_id);
}
}

pub(crate) fn clear(&mut self, scheduler: &mut Scheduler) {
if let Some(old_attached) = self.attached.take() {
scheduler.node_dec_ref(old_attached);

@@ -218,13 +208,11 @@ impl IntentState {
&self.secondary
}

/// If the node is in use as the attached location, demote it into
/// the list of secondary locations. This is used when a node goes offline,
/// and we want to use a different node for attachment, but not permanently
/// forget the location on the offline node.
/// When a node goes offline, we update intents to avoid using it
/// as their attached pageserver.
///
/// Returns true if a change was made
pub(crate) fn demote_attached(&mut self, node_id: NodeId) -> bool {
pub(crate) fn notify_offline(&mut self, node_id: NodeId) -> bool {
if self.attached == Some(node_id) {
// TODO: when scheduler starts tracking attached + secondary counts separately, we will
// need to call into it here.

@@ -327,7 +315,7 @@ pub(crate) struct ReconcileResult {
pub(crate) result: Result<(), ReconcileError>,

pub(crate) tenant_shard_id: TenantShardId,
pub(crate) generation: Option<Generation>,
pub(crate) generation: Generation,
pub(crate) observed: ObservedState,

/// Set [`TenantState::pending_compute_notification`] from this flag
@@ -352,7 +340,7 @@ impl TenantState {
|
||||
tenant_shard_id,
|
||||
policy,
|
||||
intent: IntentState::default(),
|
||||
generation: Some(Generation::new(0)),
|
||||
generation: Generation::new(0),
|
||||
shard,
|
||||
observed: ObservedState::default(),
|
||||
config: TenantConfig::default(),
|
||||
@@ -450,16 +438,10 @@ impl TenantState {
|
||||
// more work on the same pageservers we're already using.
|
||||
let mut modified = false;
|
||||
|
||||
// Add/remove nodes to fulfil policy
|
||||
use PlacementPolicy::*;
|
||||
match self.policy {
|
||||
Single => {
|
||||
// Should have exactly one attached, and zero secondaries
|
||||
if !self.intent.secondary.is_empty() {
|
||||
self.intent.clear_secondary(scheduler);
|
||||
modified = true;
|
||||
}
|
||||
|
||||
let (modified_attached, _attached_node_id) = self.schedule_attached(scheduler)?;
|
||||
modified |= modified_attached;
|
||||
|
||||
@@ -469,23 +451,6 @@ impl TenantState {
|
||||
}
|
||||
}
|
||||
Double(secondary_count) => {
|
||||
let retain_secondaries = if self.intent.attached.is_none()
|
||||
&& scheduler.node_preferred(&self.intent.secondary).is_some()
|
||||
{
|
||||
// If we have no attached, and one of the secondaries is elegible to be promoted, retain
|
||||
// one more secondary than we usually would, as one of them will become attached futher down this function.
|
||||
secondary_count + 1
|
||||
} else {
|
||||
secondary_count
|
||||
};
|
||||
|
||||
while self.intent.secondary.len() > retain_secondaries {
|
||||
// We have no particular preference for one secondary location over another: just
|
||||
// arbitrarily drop from the end
|
||||
self.intent.pop_secondary(scheduler);
|
||||
modified = true;
|
||||
}
|
||||
|
||||
// Should have exactly one attached, and N secondaries
|
||||
let (modified_attached, attached_node_id) = self.schedule_attached(scheduler)?;
|
||||
modified |= modified_attached;
|
||||
@@ -498,28 +463,15 @@ impl TenantState {
|
||||
modified = true;
|
||||
}
|
||||
}
|
||||
Secondary => {
|
||||
if let Some(node_id) = self.intent.get_attached() {
|
||||
// Populate secondary by demoting the attached node
|
||||
self.intent.demote_attached(*node_id);
|
||||
modified = true;
|
||||
} else if self.intent.secondary.is_empty() {
|
||||
// Populate secondary by scheduling a fresh node
|
||||
let node_id = scheduler.schedule_shard(&[])?;
|
||||
self.intent.push_secondary(scheduler, node_id);
|
||||
modified = true;
|
||||
}
|
||||
while self.intent.secondary.len() > 1 {
|
||||
// We have no particular preference for one secondary location over another: just
|
||||
// arbitrarily drop from the end
|
||||
self.intent.pop_secondary(scheduler);
|
||||
modified = true;
|
||||
}
|
||||
}
|
||||
Detached => {
|
||||
// Never add locations in this mode
|
||||
if self.intent.get_attached().is_some() || !self.intent.get_secondary().is_empty() {
|
||||
self.intent.clear(scheduler);
|
||||
// Should have no attached or secondary pageservers
|
||||
if self.intent.attached.is_some() {
|
||||
self.intent.set_attached(scheduler, None);
|
||||
modified = true;
|
||||
}
|
||||
|
||||
if !self.intent.secondary.is_empty() {
|
||||
self.intent.clear_secondary(scheduler);
|
||||
modified = true;
|
||||
}
|
||||
}
|
||||
@@ -566,12 +518,7 @@ impl TenantState {
|
||||
|
||||
fn dirty(&self) -> bool {
|
||||
if let Some(node_id) = self.intent.attached {
|
||||
// Maybe panic: it is a severe bug if we try to attach while generation is null.
|
||||
let generation = self
|
||||
.generation
|
||||
.expect("Attempted to enter attached state without a generation");
|
||||
|
||||
let wanted_conf = attached_location_conf(generation, &self.shard, &self.config);
|
||||
let wanted_conf = attached_location_conf(self.generation, &self.shard, &self.config);
|
||||
match self.observed.locations.get(&node_id) {
|
||||
Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {}
|
||||
Some(_) | None => {
|
||||
@@ -649,10 +596,6 @@ impl TenantState {
|
||||
// Reconcile already in flight for the current sequence?
|
||||
if let Some(handle) = &self.reconciler {
|
||||
if handle.sequence == self.sequence {
|
||||
tracing::info!(
|
||||
"Reconciliation already in progress for sequence {:?}",
|
||||
self.sequence,
|
||||
);
|
||||
return Some(ReconcilerWaiter {
|
||||
tenant_shard_id: self.tenant_shard_id,
|
||||
seq_wait: self.waiter.clone(),
|
||||
@@ -672,10 +615,6 @@ impl TenantState {
|
||||
return None;
|
||||
};
|
||||
|
||||
// Advance the sequence before spawning a reconciler, so that sequence waiters
|
||||
// can distinguish between before+after the reconcile completes.
|
||||
self.sequence = self.sequence.next();
|
||||
|
||||
let reconciler_cancel = cancel.child_token();
|
||||
let mut reconciler = Reconciler {
|
||||
tenant_shard_id: self.tenant_shard_id,
|
||||
@@ -777,17 +716,6 @@ impl TenantState {
|
||||
})
|
||||
}
|
||||
|
||||
/// Called when a ReconcileResult has been emitted and the service is updating
|
||||
/// our state: if the result is from a sequence >= my ReconcileHandle, then drop
|
||||
/// the handle to indicate there is no longer a reconciliation in progress.
|
||||
pub(crate) fn reconcile_complete(&mut self, sequence: Sequence) {
|
||||
if let Some(reconcile_handle) = &self.reconciler {
|
||||
if reconcile_handle.sequence <= sequence {
|
||||
self.reconciler = None;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// If we had any state at all referring to this node ID, drop it. Does not
|
||||
// attempt to reschedule.
|
||||
pub(crate) fn deref_node(&mut self, node_id: NodeId) {
|
||||
@@ -808,8 +736,13 @@ impl TenantState {
|
||||
shard_number: self.tenant_shard_id.shard_number.0 as i32,
|
||||
shard_count: self.tenant_shard_id.shard_count.literal() as i32,
|
||||
shard_stripe_size: self.shard.stripe_size.0 as i32,
|
||||
generation: self.generation.map(|g| g.into().unwrap_or(0) as i32),
|
||||
generation_pageserver: self.intent.get_attached().map(|n| n.0 as i64),
|
||||
generation: self.generation.into().unwrap_or(0) as i32,
|
||||
generation_pageserver: self
|
||||
.intent
|
||||
.get_attached()
|
||||
.map(|n| n.0 as i64)
|
||||
.unwrap_or(i64::MAX),
|
||||
|
||||
placement_policy: serde_json::to_string(&self.policy).unwrap(),
|
||||
config: serde_json::to_string(&self.config).unwrap(),
|
||||
splitting: SplitState::default(),
|
||||
@@ -872,10 +805,8 @@ pub(crate) mod tests {
|
||||
assert_ne!(attached_node_id, secondary_node_id);
|
||||
|
||||
// Notifying the attached node is offline should demote it to a secondary
|
||||
let changed = tenant_state.intent.demote_attached(attached_node_id);
|
||||
let changed = tenant_state.intent.notify_offline(attached_node_id);
|
||||
assert!(changed);
|
||||
assert!(tenant_state.intent.attached.is_none());
|
||||
assert_eq!(tenant_state.intent.secondary.len(), 2);
|
||||
|
||||
// Update the scheduler state to indicate the node is offline
|
||||
nodes.get_mut(&attached_node_id).unwrap().availability = NodeAvailability::Offline;
|
||||
|
||||
@@ -200,7 +200,7 @@ impl AttachmentService {
"localhost",
"-p",
&format!("{}", self.postgres_port),
DB_NAME,
&DB_NAME,
])
.output()
.await

@@ -605,7 +605,7 @@ impl Endpoint {
let conn_str = self.connstr("cloud_admin", "postgres");
println!("Starting postgres node at '{}'", conn_str);
if create_test_user {
let conn_str = self.connstr("test", "neondb");
let conn_str = self.connstr("user", "neondb");
println!("Also at '{}'", conn_str);
}
let mut cmd = Command::new(self.env.neon_distrib_dir.join("compute_ctl"));

@@ -14,6 +14,7 @@ use byteorder::{BigEndian, ReadBytesExt};
use postgres_ffi::BLCKSZ;
use serde::{Deserialize, Serialize};
use serde_with::serde_as;
use strum_macros;
use utils::{
completion,
history_buffer::HistoryBufferWithDropCounter,
@@ -1076,6 +1077,7 @@ impl PagestreamBeMessage {

#[cfg(test)]
mod tests {
use bytes::Buf;
use serde_json::json;

use super::*;

@@ -6,6 +6,7 @@ use crate::{
};
use hex::FromHex;
use serde::{Deserialize, Serialize};
use thiserror;
use utils::id::TenantId;

#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
@@ -655,7 +656,10 @@ fn key_to_shard_number(count: ShardCount, stripe_size: ShardStripeSize, key: &Ke

#[cfg(test)]
mod tests {
use utils::Hex;
use std::str::FromStr;

use bincode;
use utils::{id::TenantId, Hex};

use super::*;

@@ -623,7 +623,9 @@ fn file_exists(file_path: &Utf8Path) -> anyhow::Result<bool> {
mod fs_tests {
use super::*;

use bytes::Bytes;
use camino_tempfile::tempdir;
use futures_util::Stream;
use std::{collections::HashMap, io::Write};

async fn read_and_check_metadata(

@@ -1040,7 +1040,7 @@ mod tests {
Some("test/prefix/"),
Some("/test/prefix/"),
];
let expected_outputs = [
let expected_outputs = vec![
vec!["", "some/path", "some/path"],
vec!["/", "/some/path", "/some/path"],
vec![

@@ -1,6 +1,7 @@
// For details about authentication see docs/authentication.md

use arc_swap::ArcSwap;
use serde;
use std::{borrow::Cow, fmt::Display, fs, sync::Arc};

use anyhow::Result;

@@ -4,9 +4,7 @@ use tokio_util::task::{task_tracker::TaskTrackerToken, TaskTracker};
///
/// Can be cloned, moved and kept around in futures as "guard objects".
#[derive(Clone)]
pub struct Completion {
_token: TaskTrackerToken,
}
pub struct Completion(TaskTrackerToken);

/// Barrier will wait until all clones of [`Completion`] have been dropped.
#[derive(Clone)]
@@ -51,5 +49,5 @@ pub fn channel() -> (Completion, Barrier) {
tracker.close();

let token = tracker.token();
(Completion { _token: token }, Barrier(tracker))
(Completion(token), Barrier(tracker))
}

@@ -45,7 +45,7 @@ impl Generation {
Self::Broken
}

pub const fn new(v: u32) -> Self {
pub fn new(v: u32) -> Self {
Self::Valid(v)
}

@@ -9,7 +9,7 @@ use metrics::{register_int_counter, Encoder, IntCounter, TextEncoder};
use once_cell::sync::Lazy;
use routerify::ext::RequestExt;
use routerify::{Middleware, RequestInfo, Router, RouterBuilder};
use tracing::{debug, info, info_span, warn, Instrument};
use tracing::{self, debug, info, info_span, warn, Instrument};

use std::future::Future;
use std::str::FromStr;
@@ -156,10 +156,6 @@ pub struct ChannelWriter {
buffer: BytesMut,
pub tx: mpsc::Sender<std::io::Result<Bytes>>,
written: usize,
/// Time spent waiting for the channel to make progress. It is not the same as time to upload a
/// buffer because we cannot know anything about that, but this should allow us to understand
/// the actual time taken without the time spent `std::thread::park`ed.
wait_time: std::time::Duration,
}

impl ChannelWriter {
@@ -172,7 +168,6 @@ impl ChannelWriter {
buffer: BytesMut::with_capacity(buf_len).split_off(buf_len / 2),
tx,
written: 0,
wait_time: std::time::Duration::ZERO,
}
}

@@ -185,8 +180,6 @@ impl ChannelWriter {
tracing::trace!(n, "flushing");
let ready = self.buffer.split().freeze();

let wait_started_at = std::time::Instant::now();

// not ideal to call from blocking code to block_on, but we are sure that this
// operation does not spawn_blocking other tasks
let res: Result<(), ()> = tokio::runtime::Handle::current().block_on(async {
@@ -199,9 +192,6 @@ impl ChannelWriter {
// sending it to the client.
Ok(())
});

self.wait_time += wait_started_at.elapsed();

if res.is_err() {
return Err(std::io::ErrorKind::BrokenPipe.into());
}
@@ -212,10 +202,6 @@ impl ChannelWriter {
pub fn flushed_bytes(&self) -> usize {
self.written
}

pub fn wait_time(&self) -> std::time::Duration {
self.wait_time
}
}

impl std::io::Write for ChannelWriter {
@@ -266,52 +252,22 @@ async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<Body

let span = info_span!("blocking");
tokio::task::spawn_blocking(move || {
// there are situations where we lose scraped metrics under load, try to gather some clues
// since all nodes are queried this, keep the message count low.
let spawned_at = std::time::Instant::now();

let _span = span.entered();

let metrics = metrics::gather();

let gathered_at = std::time::Instant::now();

let res = encoder
.encode(&metrics, &mut writer)
.and_then(|_| writer.flush().map_err(|e| e.into()));

// this instant is not when we finally got the full response sent, sending is done by hyper
// in another task.
let encoded_at = std::time::Instant::now();

let spawned_in = spawned_at - started_at;
let collected_in = gathered_at - spawned_at;
// remove the wait time here in case the tcp connection was clogged
let encoded_in = encoded_at - gathered_at - writer.wait_time();
let total = encoded_at - started_at;

match res {
Ok(()) => {
tracing::info!(
bytes = writer.flushed_bytes(),
total_ms = total.as_millis(),
spawning_ms = spawned_in.as_millis(),
collection_ms = collected_in.as_millis(),
encoding_ms = encoded_in.as_millis(),
elapsed_ms = started_at.elapsed().as_millis(),
"responded /metrics"
);
}
Err(e) => {
// there is a chance that this error is not the BrokenPipe we generate in the writer
// for "closed connection", but it is highly unlikely.
tracing::warn!(
after_bytes = writer.flushed_bytes(),
total_ms = total.as_millis(),
spawning_ms = spawned_in.as_millis(),
collection_ms = collected_in.as_millis(),
encoding_ms = encoded_in.as_millis(),
"failed to write out /metrics response: {e:?}"
);
tracing::warn!("failed to write out /metrics response: {e:#}");
// semantics of this error are quite... unclear. we want to error the stream out to
// abort the response to somehow notify the client that we failed.
//

@@ -415,6 +415,7 @@ mod tests {

use super::*;

use serde::ser::Serialize;
use serde_assert::{Deserializer, Serializer, Token, Tokens};

#[test]

@@ -1,6 +1,6 @@
#![warn(missing_docs)]

use std::cmp::{Eq, Ordering};
use std::cmp::{Eq, Ordering, PartialOrd};
use std::collections::BinaryHeap;
use std::fmt::Debug;
use std::mem;
@@ -249,6 +249,7 @@ where
mod tests {
use super::*;
use std::sync::Arc;
use std::time::Duration;

impl MonotonicCounter<i32> for i32 {
fn cnt_advance(&mut self, val: i32) {

@@ -221,7 +221,7 @@ impl RcuWaitList {
#[cfg(test)]
mod tests {
use super::*;
use std::sync::Mutex;
use std::sync::{Arc, Mutex};
use std::time::Duration;

#[tokio::test]

@@ -239,6 +239,7 @@ mod tests {
use std::{
convert::Infallible,
pin::{pin, Pin},
sync::atomic::{AtomicUsize, Ordering},
time::Duration,
};

@@ -6,6 +6,7 @@ use futures::future::BoxFuture;
use futures::{Stream, StreamExt};
use itertools::Itertools;
use pin_project_lite::pin_project;
use std::cmp::Ord;
use std::collections::BinaryHeap;
use std::collections::VecDeque;
use std::future::Future;

@@ -20,6 +20,7 @@ use std::num::NonZeroUsize;
use std::str::FromStr;
use std::sync::Arc;
use std::time::Duration;
use toml_edit;
use toml_edit::{Document, Item};

use camino::{Utf8Path, Utf8PathBuf};
@@ -211,9 +212,9 @@ pub struct PageServerConf {

pub log_format: LogFormat,

/// Number of tenants which will be concurrently loaded from remote storage proactively on startup or attach.
///
/// A lower value implicitly deprioritizes loading such tenants, vs. other work in the system.
/// Number of tenants which will be concurrently loaded from remote storage proactively on startup,
/// does not limit tenants loaded in response to client I/O. A lower value implicitly deprioritizes
/// loading such tenants, vs. other work in the system.
pub concurrent_tenant_warmup: ConfigurableSemaphore,

/// Number of concurrent [`Tenant::gather_size_inputs`](crate::tenant::Tenant::gather_size_inputs) allowed.
@@ -1202,7 +1203,10 @@ impl ConfigurableSemaphore {

#[cfg(test)]
mod tests {
use std::{fs, num::NonZeroU32};
use std::{
fs,
num::{NonZeroU32, NonZeroUsize},
};

use camino_tempfile::{tempdir, Utf8TempDir};
use pageserver_api::models::EvictionPolicy;

@@ -1,5 +1,7 @@
use super::*;
use std::collections::HashMap;
use std::time::SystemTime;
use utils::lsn::Lsn;

#[test]
fn startup_collected_timeline_metrics_before_advancing() {

@@ -20,9 +20,10 @@ use remote_storage::{GenericRemoteStorage, RemotePath};
use serde::Deserialize;
use serde::Serialize;
use thiserror::Error;
use tokio;
use tokio_util::sync::CancellationToken;
use tracing::Instrument;
use tracing::{debug, error};
use tracing::{self, debug, error};
use utils::crashsafe::path_with_suffix_extension;
use utils::generation::Generation;
use utils::id::TimelineId;
@@ -725,7 +726,7 @@ mod test {
use camino::Utf8Path;
use hex_literal::hex;
use pageserver_api::shard::ShardIndex;
use std::io::ErrorKind;
use std::{io::ErrorKind, time::Duration};
use tracing::info;

use remote_storage::{RemoteStorageConfig, RemoteStorageKind};
@@ -734,7 +735,10 @@ mod test {
use crate::{
control_plane_client::RetryForeverError,
repository::Key,
tenant::{harness::TenantHarness, storage_layer::DeltaFileName},
tenant::{
harness::TenantHarness, remote_timeline_client::remote_timeline_path,
storage_layer::DeltaFileName,
},
};

use super::*;
@@ -1157,8 +1161,13 @@ mod test {
pub(crate) mod mock {
use tracing::info;

use crate::tenant::remote_timeline_client::remote_layer_path;

use super::*;
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::{
atomic::{AtomicUsize, Ordering},
Arc,
};

pub struct ConsumerState {
rx: tokio::sync::mpsc::UnboundedReceiver<ListWriterQueueMessage>,

@@ -58,7 +58,6 @@ use utils::{completion, id::TimelineId};

use crate::{
config::PageServerConf,
metrics::disk_usage_based_eviction::METRICS,
task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
tenant::{
self,
@@ -66,6 +65,7 @@ use crate::{
remote_timeline_client::LayerFileMetadata,
secondary::SecondaryTenant,
storage_layer::{AsLayerDesc, EvictionError, Layer, LayerFileName},
Timeline,
},
};

@@ -409,23 +409,13 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
"running disk usage based eviction due to pressure"
);

let (candidates, collection_time) = {
let started_at = std::time::Instant::now();
let candidates =
match collect_eviction_candidates(tenant_manager, eviction_order, cancel).await? {
EvictionCandidates::Cancelled => {
return Ok(IterationOutcome::Cancelled);
}
EvictionCandidates::Finished(partitioned) => (partitioned, started_at.elapsed()),
}
};

METRICS.layers_collected.inc_by(candidates.len() as u64);

tracing::info!(
elapsed_ms = collection_time.as_millis(),
total_layers = candidates.len(),
"collection completed"
);
EvictionCandidates::Finished(partitioned) => partitioned,
};

// Debug-log the list of candidates
let now = SystemTime::now();
@@ -456,10 +446,9 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
// the tenant's min-resident-size threshold, print a warning, and memorize the disk
// usage at that point, in 'usage_planned_min_resident_size_respecting'.

let (evicted_amount, usage_planned) =
select_victims(&candidates, usage_pre).into_amount_and_planned();
let selection = select_victims(&candidates, usage_pre);

METRICS.layers_selected.inc_by(evicted_amount as u64);
let (evicted_amount, usage_planned) = selection.into_amount_and_planned();

// phase2: evict layers

@@ -488,15 +477,9 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
if let Some(next) = next {
match next {
Ok(Ok(file_size)) => {
METRICS.layers_evicted.inc();
usage_assumed.add_available_bytes(file_size);
}
Ok(Err((
file_size,
EvictionError::NotFound
| EvictionError::Downloaded
| EvictionError::Timeout,
))) => {
Ok(Err((file_size, EvictionError::NotFound | EvictionError::Downloaded))) => {
evictions_failed.file_sizes += file_size;
evictions_failed.count += 1;
}
@@ -512,10 +495,7 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(

// calling again when consumed_all is fine as evicted is fused.
let Some((_partition, candidate)) = evicted.next() else {
if !consumed_all {
tracing::info!("all evictions started, waiting");
consumed_all = true;
}
consumed_all = true;
continue;
};

@@ -523,15 +503,11 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
EvictionLayer::Attached(layer) => {
let file_size = layer.layer_desc().file_size;
js.spawn(async move {
// have a low eviction waiting timeout because our LRU calculations go stale fast;
// also individual layer evictions could hang because of bugs and we do not want to
// pause disk_usage_based_eviction for such.
let timeout = std::time::Duration::from_secs(5);

match layer.evict_and_wait(timeout).await {
Ok(()) => Ok(file_size),
Err(e) => Err((file_size, e)),
}
layer
.evict_and_wait()
.await
.map(|()| file_size)
.map_err(|e| (file_size, e))
});
}
EvictionLayer::Secondary(layer) => {
@@ -553,30 +529,6 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
(usage_assumed, evictions_failed)
};

let started_at = std::time::Instant::now();

let evict_layers = async move {
let mut evict_layers = std::pin::pin!(evict_layers);

let maximum_expected = std::time::Duration::from_secs(10);

let res = tokio::time::timeout(maximum_expected, &mut evict_layers).await;
let tuple = if let Ok(tuple) = res {
tuple
} else {
let elapsed = started_at.elapsed();
tracing::info!(elapsed_ms = elapsed.as_millis(), "still ongoing");
evict_layers.await
};

let elapsed = started_at.elapsed();
tracing::info!(elapsed_ms = elapsed.as_millis(), "completed");
tuple
};

let evict_layers =
evict_layers.instrument(tracing::info_span!("evict_layers", layers=%evicted_amount));

let (usage_assumed, evictions_failed) = tokio::select! {
tuple = evict_layers => { tuple },
_ = cancel.cancelled() => {
@@ -811,8 +763,6 @@ async fn collect_eviction_candidates(
eviction_order: EvictionOrder,
cancel: &CancellationToken,
) -> anyhow::Result<EvictionCandidates> {
const LOG_DURATION_THRESHOLD: std::time::Duration = std::time::Duration::from_secs(10);

// get a snapshot of the list of tenants
let tenants = tenant::mgr::list_tenants()
.await
@@ -841,8 +791,6 @@ async fn collect_eviction_candidates(
continue;
}

let started_at = std::time::Instant::now();

// collect layers from all timelines in this tenant
//
// If one of the timelines becomes `!is_active()` during the iteration,
@@ -857,7 +805,6 @@ async fn collect_eviction_candidates(
}
let info = tl.get_local_layers_for_disk_usage_eviction().await;
debug!(tenant_id=%tl.tenant_shard_id.tenant_id, shard_id=%tl.tenant_shard_id.shard_slug(), timeline_id=%tl.timeline_id, "timeline resident layers count: {}", info.resident_layers.len());

tenant_candidates.extend(info.resident_layers.into_iter());
max_layer_size = max_layer_size.max(info.max_layer_size.unwrap_or(0));

@@ -923,25 +870,7 @@ async fn collect_eviction_candidates(
(partition, candidate)
});

METRICS
.tenant_layer_count
.observe(tenant_candidates.len() as f64);

candidates.extend(tenant_candidates);

let elapsed = started_at.elapsed();
METRICS
.tenant_collection_time
.observe(elapsed.as_secs_f64());

if elapsed > LOG_DURATION_THRESHOLD {
tracing::info!(
tenant_id=%tenant.tenant_shard_id().tenant_id,
shard_id=%tenant.tenant_shard_id().shard_slug(),
elapsed_ms = elapsed.as_millis(),
"collection took longer than threshold"
);
}
}

// Note: the same tenant ID might be hit twice, if it transitions from attached to
@@ -956,11 +885,11 @@ async fn collect_eviction_candidates(
},
);

for tenant in secondary_tenants {
for secondary_tenant in secondary_tenants {
// for secondary tenants we use a sum of on_disk layers and already evicted layers. this is
// to prevent repeated disk usage based evictions from completely draining less often
// updating secondaries.
let (mut layer_info, total_layers) = tenant.get_layers_for_eviction();
let (mut layer_info, total_layers) = secondary_tenant.get_layers_for_eviction();

debug_assert!(
total_layers >= layer_info.resident_layers.len(),
@@ -968,8 +897,6 @@ async fn collect_eviction_candidates(
layer_info.resident_layers.len()
);

let started_at = std::time::Instant::now();

layer_info
.resident_layers
.sort_unstable_by_key(|layer_info| std::cmp::Reverse(layer_info.last_activity_ts));
@@ -991,27 +918,9 @@ async fn collect_eviction_candidates(
)
});

METRICS
.tenant_layer_count
.observe(tenant_candidates.len() as f64);
candidates.extend(tenant_candidates);

tokio::task::yield_now().await;

let elapsed = started_at.elapsed();

METRICS
.tenant_collection_time
.observe(elapsed.as_secs_f64());

if elapsed > LOG_DURATION_THRESHOLD {
tracing::info!(
tenant_id=%tenant.tenant_shard_id().tenant_id,
shard_id=%tenant.tenant_shard_id().shard_slug(),
elapsed_ms = elapsed.as_millis(),
"collection took longer than threshold"
);
}
}

debug_assert!(MinResidentSizePartition::Above < MinResidentSizePartition::Below,
@@ -1088,6 +997,30 @@ impl<U: Usage> VictimSelection<U> {
}
}

struct TimelineKey(Arc<Timeline>);

impl PartialEq for TimelineKey {
fn eq(&self, other: &Self) -> bool {
Arc::ptr_eq(&self.0, &other.0)
}
}

impl Eq for TimelineKey {}

impl std::hash::Hash for TimelineKey {
fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
Arc::as_ptr(&self.0).hash(state);
}
}

impl std::ops::Deref for TimelineKey {
type Target = Timeline;

fn deref(&self) -> &Self::Target {
self.0.as_ref()
}
}

/// A totally ordered f32 subset we can use with sorting functions.
pub(crate) mod finite_f32 {

@@ -579,12 +579,6 @@ paths:
required: false
schema:
type: integer
- name: lazy
in: query
required: false
schema:
type: boolean
description: Set to true for attaches to queue up until activated by compute. Eager (false) is the default.
put:
description: |
Configures a _tenant location_, that is how a particular pageserver handles

@@ -816,7 +816,13 @@ async fn tenant_attach_handler(

let tenant = state
.tenant_manager
.upsert_location(tenant_shard_id, location_conf, None, SpawnMode::Eager, &ctx)
.upsert_location(
tenant_shard_id,
location_conf,
None,
SpawnMode::Normal,
&ctx,
)
.await?;

let Some(tenant) = tenant else {
@@ -1412,7 +1418,6 @@ async fn put_tenant_location_config_handler(

let request_data: TenantLocationConfigRequest = json_request(&mut request).await?;
let flush = parse_query_param(&request, "flush_ms")?.map(Duration::from_millis);
let lazy = parse_query_param(&request, "lazy")?.unwrap_or(false);
check_permission(&request, Some(tenant_shard_id.tenant_id))?;

let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
@@ -1443,17 +1448,15 @@ async fn put_tenant_location_config_handler(
let location_conf =
LocationConf::try_from(&request_data.config).map_err(ApiError::BadRequest)?;

// lazy==true queues up for activation or jumps the queue like normal when a compute connects,
// similar to at startup ordering.
let spawn_mode = if lazy {
tenant::SpawnMode::Lazy
} else {
tenant::SpawnMode::Eager
};

let attached = state
.tenant_manager
.upsert_location(tenant_shard_id, location_conf, flush, spawn_mode, &ctx)
.upsert_location(
tenant_shard_id,
location_conf,
flush,
tenant::SpawnMode::Normal,
&ctx,
)
.await?
.is_some();

@@ -1915,16 +1915,17 @@ impl Drop for TimelineMetrics {
let tenant_id = &self.tenant_id;
let timeline_id = &self.timeline_id;
let shard_id = &self.shard_id;
let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, shard_id, timeline_id]);
let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
{
RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get());
let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
let _ =
RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
}
let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
if let Some(metric) = Lazy::get(&DIRECTORY_ENTRIES_COUNT) {
let _ = metric.remove_label_values(&[tenant_id, shard_id, timeline_id]);
let _ = metric.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
}
let _ = EVICTIONS.remove_label_values(&[tenant_id, shard_id, timeline_id]);
let _ = EVICTIONS.remove_label_values(&[tenant_id, &shard_id, timeline_id]);

self.evictions_with_low_residence_duration
.write()
@@ -2473,64 +2474,6 @@ pub(crate) mod tenant_throttling {
}
}

pub(crate) mod disk_usage_based_eviction {
use super::*;

pub(crate) struct Metrics {
pub(crate) tenant_collection_time: Histogram,
pub(crate) tenant_layer_count: Histogram,
pub(crate) layers_collected: IntCounter,
pub(crate) layers_selected: IntCounter,
pub(crate) layers_evicted: IntCounter,
}

impl Default for Metrics {
fn default() -> Self {
let tenant_collection_time = register_histogram!(
"pageserver_disk_usage_based_eviction_tenant_collection_seconds",
"Time spent collecting layers from a tenant -- not normalized by collected layer amount",
vec![0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0]
)
.unwrap();

let tenant_layer_count = register_histogram!(
"pageserver_disk_usage_based_eviction_tenant_collected_layers",
"Amount of layers gathered from a tenant",
vec![5.0, 50.0, 500.0, 5000.0, 50000.0]
)
.unwrap();

let layers_collected = register_int_counter!(
"pageserver_disk_usage_based_eviction_collected_layers_total",
"Amount of layers collected"
)
.unwrap();

let layers_selected = register_int_counter!(
"pageserver_disk_usage_based_eviction_select_layers_total",
"Amount of layers selected"
)
.unwrap();

let layers_evicted = register_int_counter!(
"pageserver_disk_usage_based_eviction_evicted_layers_total",
"Amount of layers successfully evicted"
)
.unwrap();

Self {
tenant_collection_time,
tenant_layer_count,
layers_collected,
layers_selected,
layers_evicted,
}
}
}

pub(crate) static METRICS: Lazy<Metrics> = Lazy::new(Metrics::default);
}

pub fn preinitialize_metrics() {
// Python tests need these and on some we do alerting.
//
@@ -2565,7 +2508,6 @@ pub fn preinitialize_metrics() {
Lazy::force(&TENANT_MANAGER);

Lazy::force(&crate::tenant::storage_layer::layer::LAYER_IMPL_METRICS);
Lazy::force(&disk_usage_based_eviction::METRICS);

// countervecs
[&BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT]

@@ -73,6 +73,7 @@

use std::{
collections::{hash_map::Entry, HashMap},
convert::TryInto,
sync::{
atomic::{AtomicU64, AtomicU8, AtomicUsize, Ordering},
Arc, Weak,
@@ -261,9 +262,7 @@ pub struct PageCache {
size_metrics: &'static PageCacheSizeMetrics,
}

struct PinnedSlotsPermit {
_permit: tokio::sync::OwnedSemaphorePermit,
}
struct PinnedSlotsPermit(tokio::sync::OwnedSemaphorePermit);

///
/// PageReadGuard is a "lease" on a buffer, for reading. The page is kept locked
@@ -559,9 +558,9 @@ impl PageCache {
)
.await
{
Ok(res) => Ok(PinnedSlotsPermit {
_permit: res.expect("this semaphore is never closed"),
}),
Ok(res) => Ok(PinnedSlotsPermit(
res.expect("this semaphore is never closed"),
)),
Err(_timeout) => {
crate::metrics::page_cache_errors_inc(
crate::metrics::PageCacheErrorKind::AcquirePinnedSlotTimeout,

@@ -27,7 +27,7 @@ use pageserver_api::models::{
};
use pageserver_api::shard::ShardIndex;
use pageserver_api::shard::ShardNumber;
use postgres_backend::{is_expected_io_error, AuthType, PostgresBackend, QueryError};
use postgres_backend::{self, is_expected_io_error, AuthType, PostgresBackend, QueryError};
use pq_proto::framed::ConnectionError;
use pq_proto::FeStartupPacket;
use pq_proto::{BeMessage, FeMessage, RowDescriptor};
@@ -44,6 +44,7 @@ use tokio::io::AsyncWriteExt;
use tokio::io::{AsyncRead, AsyncWrite};
use tokio_util::io::StreamReader;
use tokio_util::sync::CancellationToken;
use tracing::field;
use tracing::*;
use utils::id::ConnectionId;
use utils::sync::gate::GateGuard;
@@ -1114,10 +1115,7 @@ impl PageServerHandler {
ctx: &RequestContext,
) -> Result<PagestreamBeMessage, PageStreamError> {
let timeline = match self.get_cached_timeline_for_page(req) {
Ok(tl) => {
set_tracing_field_shard_id(tl);
tl
}
Ok(tl) => tl,
Err(key) => {
match self
.load_timeline_for_page(tenant_id, timeline_id, key)
@@ -1142,6 +1140,9 @@ impl PageServerHandler {
}
};

// load_timeline_for_page sets shard_id, but get_cached_timeline_for_page doesn't
set_tracing_field_shard_id(timeline);

let _timer = timeline
.query_metrics
.start_timer(metrics::SmgrQueryType::GetPageAtLsn);

@@ -37,6 +37,7 @@ impl Value {
mod test {
use super::*;

use bytes::Bytes;
use utils::bin_ser::BeSer;

macro_rules! roundtrip {

@@ -109,6 +109,7 @@ pub use pageserver_api::models::TenantState;
use tokio::sync::Semaphore;

static INIT_DB_SEMAPHORE: Lazy<Semaphore> = Lazy::new(|| Semaphore::new(8));
use toml_edit;
use utils::{
crashsafe,
generation::Generation,
@@ -226,11 +227,7 @@ pub(crate) struct TenantPreload {
/// When we spawn a tenant, there is a special mode for tenant creation that
/// avoids trying to read anything from remote storage.
pub(crate) enum SpawnMode {
/// Activate as soon as possible
Eager,
/// Lazy activation in the background, with the option to skip the queue if the need comes up
Lazy,
/// Tenant has been created during the lifetime of this process
Normal,
Create,
}

@@ -703,37 +700,41 @@ impl Tenant {
.and_then(|x| x.initial_tenant_load_remote.take());

enum AttachType<'a> {
/// We are attaching this tenant lazily in the background.
Warmup {
_permit: tokio::sync::SemaphorePermit<'a>,
during_startup: bool
},
/// We are attaching this tenant as soon as we can, because for example an
/// endpoint tried to access it.
// During pageserver startup, we are attaching this tenant lazily in the background
Warmup(tokio::sync::SemaphorePermit<'a>),
// During pageserver startup, we are attaching this tenant as soon as we can,
// because a client tried to access it.
OnDemand,
/// During normal operations after startup, we are attaching a tenant, and
/// eager attach was requested.
// During normal operations after startup, we are attaching a tenant.
Normal,
}

let attach_type = if matches!(mode, SpawnMode::Lazy) {
// Before doing any I/O, wait for at least one of:
// - A client attempting to access to this tenant (on-demand loading)
// - A permit becoming available in the warmup semaphore (background warmup)

// Before doing any I/O, wait for either or:
// - A client to attempt to access to this tenant (on-demand loading)
// - A permit to become available in the warmup semaphore (background warmup)
//
// Some-ness of init_order is how we know if we're attaching during startup or later
// in process lifetime.
let attach_type = if init_order.is_some() {
tokio::select!(
permit = tenant_clone.activate_now_sem.acquire() => {
let _ = permit.expect("activate_now_sem is never closed");
_ = tenant_clone.activate_now_sem.acquire() => {
tracing::info!("Activating tenant (on-demand)");
AttachType::OnDemand
},
permit = conf.concurrent_tenant_warmup.inner().acquire() => {
let _permit = permit.expect("concurrent_tenant_warmup semaphore is never closed");
tracing::info!("Activating tenant (warmup)");
AttachType::Warmup {
_permit,
during_startup: init_order.is_some()
permit_result = conf.concurrent_tenant_warmup.inner().acquire() => {
match permit_result {
Ok(p) => {
tracing::info!("Activating tenant (warmup)");
AttachType::Warmup(p)
}
Err(_) => {
// This is unexpected: the warmup semaphore should stay alive
// for the lifetime of init_order. Log a warning and proceed.
tracing::warn!("warmup_limit semaphore unexpectedly closed");
AttachType::Normal
}
}

}
_ = tenant_clone.cancel.cancelled() => {
// This is safe, but should be pretty rare: it is interesting if a tenant
@@ -748,8 +749,6 @@ impl Tenant {
},
)
} else {
// SpawnMode::{Create,Eager} always cause jumping ahead of the
// concurrent_tenant_warmup queue
AttachType::Normal
};

@@ -757,7 +756,7 @@ impl Tenant {
(SpawnMode::Create, _) => {
None
},
(SpawnMode::Eager | SpawnMode::Lazy, Some(remote_storage)) => {
(SpawnMode::Normal, Some(remote_storage)) => {
let _preload_timer = TENANT.preload.start_timer();
let res = tenant_clone
.preload(remote_storage, task_mgr::shutdown_token())
@@ -770,7 +769,7 @@ impl Tenant {
}
}
}
(_, None) => {
(SpawnMode::Normal, None) => {
let _preload_timer = TENANT.preload.start_timer();
None
}
@@ -829,7 +828,7 @@ impl Tenant {
let attached = {
let _attach_timer = match mode {
SpawnMode::Create => None,
SpawnMode::Eager | SpawnMode::Lazy => Some(TENANT.attach.start_timer()),
SpawnMode::Normal => {Some(TENANT.attach.start_timer())}
};
tenant_clone.attach(preload, mode, &ctx).await
};
@@ -851,7 +850,7 @@ impl Tenant {
// It also prevents the warmup proccess competing with the concurrency limit on
// logical size calculations: if logical size calculation semaphore is saturated,
// then warmup will wait for that before proceeding to the next tenant.
if matches!(attach_type, AttachType::Warmup { during_startup: true, .. }) {
if let AttachType::Warmup(_permit) = attach_type {
let mut futs: FuturesUnordered<_> = tenant_clone.timelines.lock().unwrap().values().cloned().map(|t| t.await_initial_logical_size()).collect();
tracing::info!("Waiting for initial logical sizes while warming up...");
while futs.next().await.is_some() {}
@@ -924,7 +923,7 @@ impl Tenant {
deleting: false,
timelines: HashMap::new(),
},
(None, _) => {
(None, SpawnMode::Normal) => {
anyhow::bail!("local-only deployment is no longer supported, https://github.com/neondatabase/neon/issues/5624");
}
};
@@ -2383,7 +2382,7 @@ impl Tenant {
self.tenant_shard_id,
self.generation,
self.shard_identity,
self.walredo_mgr.clone(),
self.walredo_mgr.as_ref().map(Arc::clone),
resources,
pg_version,
state,
@@ -3592,18 +3591,25 @@ pub async fn dump_layerfile_from_path(
#[cfg(test)]
pub(crate) mod harness {
use bytes::{Bytes, BytesMut};
use camino::Utf8PathBuf;
use once_cell::sync::OnceCell;
use pageserver_api::models::ShardParameters;
use pageserver_api::shard::ShardIndex;
use std::fs;
use std::sync::Arc;
use utils::logging;
use utils::lsn::Lsn;

use crate::deletion_queue::mock::MockDeletionQueue;
use crate::walredo::apply_neon;
use crate::{repository::Key, walrecord::NeonWalRecord};
use crate::{
config::PageServerConf, repository::Key, tenant::Tenant, walrecord::NeonWalRecord,
};

use super::*;
use crate::tenant::config::{TenantConf, TenantConfOpt};
use hex_literal::hex;
use utils::id::TenantId;
use utils::id::{TenantId, TimelineId};

pub const TIMELINE_ID: TimelineId =
TimelineId::from_array(hex!("11223344556677881122334455667788"));
@@ -3763,7 +3769,7 @@ pub(crate) mod harness {
let preload = tenant
.preload(&self.remote_storage, CancellationToken::new())
.await?;
tenant.attach(Some(preload), SpawnMode::Eager, ctx).await?;
tenant.attach(Some(preload), SpawnMode::Normal, ctx).await?;

tenant.state.send_replace(TenantState::Active);
for timeline in tenant.timelines.lock().unwrap().values() {
@@ -3832,8 +3838,10 @@ mod tests {
use crate::DEFAULT_PG_VERSION;
use bytes::BytesMut;
use hex_literal::hex;
use once_cell::sync::Lazy;
use pageserver_api::keyspace::KeySpace;
use rand::{thread_rng, Rng};
use tokio_util::sync::CancellationToken;

static TEST_KEY: Lazy<Key> =
Lazy::new(|| Key::from_slice(&hex!("010000000033333333444444445500000001")));

@@ -52,10 +52,7 @@ pub mod defaults {
pub const DEFAULT_PITR_INTERVAL: &str = "7 days";
pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "10 seconds";
pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "10 seconds";
// The default limit on WAL lag should be set to avoid causing disconnects under high throughput
// scenarios: since the broker stats are updated ~1/s, a value of 1GiB should be sufficient for
// throughputs up to 1GiB/s per timeline.
pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 1024 * 1024 * 1024;
pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 10 * 1024 * 1024;
pub const DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD: &str = "24 hour";

pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;

@@ -420,7 +420,7 @@ impl DeleteTenantFlow {
.expect("cant be stopping or broken");

tenant
.attach(preload, super::SpawnMode::Eager, ctx)
.attach(preload, super::SpawnMode::Normal, ctx)
.await
.context("attach")?;

@@ -21,6 +21,7 @@
use byteorder::{ReadBytesExt, BE};
use bytes::{BufMut, Bytes, BytesMut};
use either::Either;
use hex;
use std::{cmp::Ordering, io, result};
use thiserror::Error;
use tracing::error;
@@ -699,6 +700,8 @@ impl<const L: usize> BuildNode<L> {
#[cfg(test)]
pub(crate) mod tests {
use super::*;
use crate::context::DownloadBehavior;
use crate::task_mgr::TaskKind;
use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReaderRef};
use rand::Rng;
use std::collections::BTreeMap;

@@ -300,7 +300,7 @@ mod tests {
use super::*;
use crate::context::DownloadBehavior;
use crate::task_mgr::TaskKind;
use crate::tenant::block_io::BlockReaderRef;
use crate::tenant::block_io::{BlockCursor, BlockReaderRef};
use rand::{thread_rng, RngCore};
use std::fs;
use std::str::FromStr;

@@ -595,7 +595,7 @@ pub async fn init_tenant_mgr(
shard_identity,
Some(init_order.clone()),
&TENANTS,
SpawnMode::Lazy,
SpawnMode::Normal,
&ctx,
) {
Ok(tenant) => {
@@ -1106,9 +1106,9 @@ impl TenantManager {

// Edge case: if we were called with SpawnMode::Create, but a Tenant already existed, then
// the caller thinks they're creating but the tenant already existed. We must switch to
// Eager mode so that when starting this Tenant we properly probe remote storage for timelines,
// Normal mode so that when starting this Tenant we properly probe remote storage for timelines,
// rather than assuming it to be empty.
spawn_mode = SpawnMode::Eager;
spawn_mode = SpawnMode::Normal;
}
Some(TenantSlot::Secondary(state)) => {
info!("Shutting down secondary tenant");
@@ -1300,7 +1300,7 @@ impl TenantManager {
shard_identity,
None,
self.tenants,
SpawnMode::Eager,
SpawnMode::Normal,
ctx,
)?;

@@ -1521,7 +1521,7 @@ impl TenantManager {
*child_shard,
child_location_conf,
None,
SpawnMode::Eager,
SpawnMode::Normal,
ctx,
)
.await?;
@@ -2064,7 +2064,7 @@ pub(crate) async fn load_tenant(
shard_identity,
None,
&TENANTS,
SpawnMode::Eager,
SpawnMode::Normal,
ctx,
)
.with_context(|| format!("Failed to schedule tenant processing in path {tenant_path:?}"))?;
@@ -2648,7 +2648,7 @@ pub(crate) async fn immediate_gc(

let tenant = guard
.get(&tenant_shard_id)
.cloned()
.map(Arc::clone)
.with_context(|| format!("tenant {tenant_shard_id}"))
.map_err(|e| ApiError::NotFound(e.into()))?;

@@ -1791,12 +1791,14 @@ mod tests {
|
||||
context::RequestContext,
|
||||
tenant::{
|
||||
harness::{TenantHarness, TIMELINE_ID},
|
||||
Tenant, Timeline,
|
||||
storage_layer::Layer,
|
||||
Generation, Tenant, Timeline,
|
||||
},
|
||||
DEFAULT_PG_VERSION,
|
||||
};
|
||||
|
||||
use std::collections::HashSet;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
pub(super) fn dummy_contents(name: &str) -> Vec<u8> {
|
||||
format!("contents for {name}").into()
|
||||
|
||||
@@ -161,7 +161,7 @@ pub async fn download_layer_file<'a>(
|
||||
|
||||
const TEMP_DOWNLOAD_EXTENSION: &str = "temp_download";
|
||||
|
||||
pub(crate) fn is_temp_download_file(path: &Utf8Path) -> bool {
|
||||
pub fn is_temp_download_file(path: &Utf8Path) -> bool {
|
||||
let extension = path.extension();
|
||||
match extension {
|
||||
Some(TEMP_DOWNLOAD_EXTENSION) => true,
|
||||
|
||||
@@ -32,7 +32,7 @@ use remote_storage::GenericRemoteStorage;
|
||||
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::instrument;
|
||||
use utils::{completion::Barrier, id::TimelineId, sync::gate::Gate};
|
||||
use utils::{completion::Barrier, fs_ext, id::TimelineId, sync::gate::Gate};
|
||||
|
||||
enum DownloadCommand {
|
||||
Download(TenantShardId),
|
||||
@@ -121,10 +121,6 @@ impl SecondaryTenant {
|
||||
})
|
||||
}
|
||||
|
||||
pub(crate) fn tenant_shard_id(&self) -> TenantShardId {
|
||||
self.tenant_shard_id
|
||||
}
|
||||
|
||||
pub(crate) async fn shutdown(&self) {
|
||||
self.cancel.cancel();
|
||||
|
||||
@@ -168,17 +164,16 @@ impl SecondaryTenant {
|
||||
self.detail.lock().unwrap().get_layers_for_eviction(self)
|
||||
}
|
||||
|
||||
/// Cancellation safe, but on cancellation the eviction will go through
#[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%timeline_id, name=%name))]
pub(crate) async fn evict_layer(
self: &Arc<Self>,
&self,
conf: &PageServerConf,
timeline_id: TimelineId,
name: LayerFileName,
) {
debug_assert_current_span_has_tenant_id();

let guard = match self.gate.enter() {
let _guard = match self.gate.enter() {
Ok(g) => g,
Err(_) => {
tracing::debug!("Dropping layer evictions, secondary tenant shutting down",);
@@ -192,57 +187,35 @@ impl SecondaryTenant {
.timeline_path(&self.tenant_shard_id, &timeline_id)
.join(name.file_name());

let this = self.clone();
// We tolerate ENOENT, because between planning eviction and executing
// it, the secondary downloader could have seen an updated heatmap that
// resulted in a layer being deleted.
// Other local I/O errors are process-fatal: these should never happen.
tokio::fs::remove_file(path)
.await
.or_else(fs_ext::ignore_not_found)
.fatal_err("Deleting layer during eviction");

// spawn it to be cancellation safe
tokio::task::spawn_blocking(move || {
let _guard = guard;
// We tolerate ENOENT, because between planning eviction and executing
// it, the secondary downloader could have seen an updated heatmap that
// resulted in a layer being deleted.
// Other local I/O errors are process-fatal: these should never happen.
let deleted = std::fs::remove_file(path);

let not_found = deleted
.as_ref()
.is_err_and(|x| x.kind() == std::io::ErrorKind::NotFound);

let deleted = if not_found {
false
} else {
deleted
.map(|()| true)
.fatal_err("Deleting layer during eviction")
};

if !deleted {
// skip updating accounting and putting perhaps later timestamp
return;
}

// Update the timeline's state. This does not have to be synchronized with
// the download process, because:
// - If downloader is racing with us to remove a file (e.g. because it is
// removed from heatmap), then our mutual .remove() operations will both
// succeed.
// - If downloader is racing with us to download the object (this would require
// multiple eviction iterations to race with multiple download iterations), then
// if we remove it from the state, the worst that happens is the downloader
// downloads it again before re-inserting, or we delete the file but it remains
// in the state map (in which case it will be downloaded if this secondary
// tenant transitions to attached and tries to access it)
//
// The important assumption here is that the secondary timeline state does not
// have to 100% match what is on disk, because it's a best-effort warming
// of the cache.
let mut detail = this.detail.lock().unwrap();
if let Some(timeline_detail) = detail.timelines.get_mut(&timeline_id) {
timeline_detail.on_disk_layers.remove(&name);
timeline_detail.evicted_at.insert(name, now);
}
})
.await
.expect("secondary eviction should not have panicked");
// Update the timeline's state. This does not have to be synchronized with
// the download process, because:
// - If downloader is racing with us to remove a file (e.g. because it is
// removed from heatmap), then our mutual .remove() operations will both
// succeed.
// - If downloader is racing with us to download the object (this would require
// multiple eviction iterations to race with multiple download iterations), then
// if we remove it from the state, the worst that happens is the downloader
// downloads it again before re-inserting, or we delete the file but it remains
// in the state map (in which case it will be downloaded if this secondary
// tenant transitions to attached and tries to access it)
//
// The important assumption here is that the secondary timeline state does not
// have to 100% match what is on disk, because it's a best-effort warming
// of the cache.
let mut detail = self.detail.lock().unwrap();
if let Some(timeline_detail) = detail.timelines.get_mut(&timeline_id) {
timeline_detail.on_disk_layers.remove(&name);
timeline_detail.evicted_at.insert(name, now);
}
}
}

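The hunk above moves the local file removal onto the blocking pool so that the eviction finishes even if the calling future is cancelled. Below is a minimal sketch of that pattern, independent of the pageserver types; the function name, the generic `guard` parameter, and the `/tmp` path are illustrative assumptions, not the real API.

```rust
use std::io::ErrorKind;
use std::path::PathBuf;

/// Sketch: remove a file on a blocking thread, treating "already gone" as success.
/// Holding `guard` (any RAII token, e.g. a gate guard) inside the blocking task
/// keeps shutdown from racing ahead of the deletion.
async fn remove_file_cancellation_safe<G: Send + 'static>(
    path: PathBuf,
    guard: G,
) -> std::io::Result<bool> {
    tokio::task::spawn_blocking(move || {
        let _guard = guard; // dropped only after the blocking work is done
        match std::fs::remove_file(&path) {
            Ok(()) => Ok(true),                                     // deleted by us
            Err(e) if e.kind() == ErrorKind::NotFound => Ok(false), // someone else removed it
            Err(e) => Err(e),                                       // real I/O error
        }
    })
    .await
    .expect("blocking removal should not panic")
}

#[tokio::main]
async fn main() -> std::io::Result<()> {
    // No gate in this sketch, so a unit value stands in for the guard.
    let gone = remove_file_cancellation_safe(PathBuf::from("/tmp/some-layer-file"), ()).await?;
    println!("deleted: {gone}");
    Ok(())
}
```

Because the work runs on `spawn_blocking`, dropping the wrapper future does not abort the deletion, which is the property the hunk is after.
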
@@ -16,8 +16,7 @@ use crate::{
config::SecondaryLocationConfig,
debug_assert_current_span_has_tenant_and_timeline_id,
remote_timeline_client::{
index::LayerFileMetadata, is_temp_download_file, FAILED_DOWNLOAD_WARN_THRESHOLD,
FAILED_REMOTE_OP_RETRIES,
index::LayerFileMetadata, FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES,
},
span::debug_assert_current_span_has_tenant_id,
storage_layer::LayerFileName,
@@ -789,7 +788,7 @@ async fn init_timeline_state(
// Secondary mode doesn't use local metadata files, but they might have been left behind by an attached tenant.
warn!(path=?dentry.path(), "found legacy metadata file, these should have been removed in load_tenant_config");
continue;
} else if crate::is_temporary(&file_path) || is_temp_download_file(&file_path) {
} else if crate::is_temporary(&file_path) {
// Temporary files are frequently left behind from restarting during downloads
tracing::info!("Cleaning up temporary file {file_path}");
if let Err(e) = tokio::fs::remove_file(&file_path)

@@ -18,6 +18,7 @@ use crate::{
};

use futures::Future;
use md5;
use pageserver_api::shard::TenantShardId;
use rand::Rng;
use remote_storage::{GenericRemoteStorage, TimeoutOrCancel};

@@ -72,7 +72,7 @@ where
/// the same ValueReconstructState struct in the next 'get_value_reconstruct_data'
/// call, to collect more records.
///
#[derive(Debug, Default)]
#[derive(Debug)]
pub struct ValueReconstructState {
pub records: Vec<(Lsn, NeonWalRecord)>,
pub img: Option<(Lsn, Bytes)>,

@@ -43,6 +43,7 @@ use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX};
use anyhow::{anyhow, bail, ensure, Context, Result};
use bytes::{Bytes, BytesMut};
use camino::{Utf8Path, Utf8PathBuf};
use hex;
use pageserver_api::keyspace::KeySpace;
use pageserver_api::models::LayerAccessKind;
use pageserver_api::shard::TenantShardId;

@@ -8,7 +8,7 @@ use pageserver_api::shard::ShardIndex;
use std::ops::Range;
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
use std::sync::{Arc, Weak};
use std::time::{Duration, SystemTime};
use std::time::SystemTime;
use tracing::Instrument;
use utils::lsn::Lsn;
use utils::sync::heavier_once_cell;
@@ -208,15 +208,10 @@ impl Layer {
/// If, by bad luck or blocking of the executor, we miss the actual eviction and the layer is
/// re-downloaded, [`EvictionError::Downloaded`] is returned.
///
/// Timeout is mandatory, because waiting for eviction is only needed for our tests; eviction
/// will happen regardless of the future returned by this method completing, unless there is a
/// read access (currently including [`Layer::keep_resident`]) before eviction gets to
/// complete.
///
/// Technically cancellation safe, but cancelling might shift which generation of the
/// download-evict cycle is observed on retry.
pub(crate) async fn evict_and_wait(&self, timeout: Duration) -> Result<(), EvictionError> {
self.0.evict_and_wait(timeout).await
pub(crate) async fn evict_and_wait(&self) -> Result<(), EvictionError> {
self.0.evict_and_wait().await
}

/// Delete the layer file when the `self` gets dropped, also try to schedule a remote index upload
@@ -368,7 +363,7 @@ impl Layer {
///
/// Does not start local deletion, use [`Self::delete_on_drop`] for that
/// separately.
#[cfg(any(feature = "testing", test))]
#[cfg(feature = "testing")]
pub(crate) fn wait_drop(&self) -> impl std::future::Future<Output = ()> + 'static {
let mut rx = self.0.status.subscribe();

@@ -637,7 +632,7 @@ impl LayerInner {

/// Cancellation safe, however dropping the future and calling this method again might result
/// in a new attempt to evict OR join the previously started attempt.
pub(crate) async fn evict_and_wait(&self, timeout: Duration) -> Result<(), EvictionError> {
pub(crate) async fn evict_and_wait(&self) -> Result<(), EvictionError> {
use tokio::sync::broadcast::error::RecvError;

assert!(self.have_remote_client);
@@ -657,22 +652,16 @@ impl LayerInner {
if strong.is_some() {
// drop the DownloadedLayer outside of holding the guard
drop(strong);

// idea here is that only one evicter should ever get to witness a strong reference,
// which means whenever get_or_maybe_download upgrades a weak, it must mark up a
// cancelled eviction and signal us, like it currently does.
//
// a second concurrent evict_and_wait will not see a strong reference.
LAYER_IMPL_METRICS.inc_started_evictions();
}

match tokio::time::timeout(timeout, rx.recv()).await {
Ok(Ok(Status::Evicted)) => Ok(()),
Ok(Ok(Status::Downloaded)) => Err(EvictionError::Downloaded),
Ok(Err(RecvError::Closed)) => {
match rx.recv().await {
Ok(Status::Evicted) => Ok(()),
Ok(Status::Downloaded) => Err(EvictionError::Downloaded),
Err(RecvError::Closed) => {
unreachable!("sender cannot be dropped while we are in &self method")
}
Ok(Err(RecvError::Lagged(_))) => {
Err(RecvError::Lagged(_)) => {
// this is quite unlikely, but we are blocking a lot in the async context, so
// we might be missing this because we are stuck on a LIFO slot on a thread
// which is busy blocking for a 1TB database create_image_layers.
@@ -685,7 +674,6 @@ impl LayerInner {
None => Ok(()),
}
}
Err(_timeout) => Err(EvictionError::Timeout),
}
}

@@ -1207,9 +1195,6 @@ pub(crate) enum EvictionError {
/// Evictions must always lose to downloads in races, and this time it happened.
#[error("layer was downloaded instead")]
Downloaded,

#[error("eviction did not happen within timeout")]
Timeout,
}

/// Error internal to the [`LayerInner::get_or_maybe_download`]

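For context on the `Timeout` variant touched above: on the branch that has it, `evict_and_wait` wraps the broadcast-channel wait in `tokio::time::timeout` and maps an elapsed timer to the error. Here is a hedged, self-contained sketch of that shape; only the tokio APIs are real, while the status strings, the `WaitError` enum, and the channel contents are invented for illustration.

```rust
use std::time::Duration;
use tokio::sync::broadcast;

#[derive(Debug, PartialEq)]
enum WaitError {
    Downloaded,
    Timeout,
}

/// Sketch: wait for an eviction notification, but give up after `timeout`.
/// The eviction itself keeps going in the background; only the *waiting* stops.
async fn wait_evicted(
    rx: &mut broadcast::Receiver<&'static str>,
    timeout: Duration,
) -> Result<(), WaitError> {
    match tokio::time::timeout(timeout, rx.recv()).await {
        Ok(Ok("evicted")) => Ok(()),
        // Any other message or a channel error is treated as "not evicted" here,
        // a simplification of the real Downloaded/Closed/Lagged handling.
        Ok(_) => Err(WaitError::Downloaded),
        Err(_elapsed) => Err(WaitError::Timeout),
    }
}

#[tokio::main]
async fn main() {
    let (tx, mut rx) = broadcast::channel(8);
    tx.send("evicted").unwrap();
    assert_eq!(wait_evicted(&mut rx, Duration::from_secs(1)).await, Ok(()));
}
```
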
@@ -1,173 +1,13 @@
use futures::StreamExt;
use pageserver_api::key::CONTROLFILE_KEY;
use tokio::task::JoinSet;
use tracing::Instrument;
use utils::{
completion::{self, Completion},
id::TimelineId,
};

use super::*;
use crate::{context::DownloadBehavior, task_mgr::BACKGROUND_RUNTIME};
use crate::{task_mgr::TaskKind, tenant::harness::TenantHarness};

/// Used in tests to advance a future to the wanted await point, and not further.
const ADVANCE: std::time::Duration = std::time::Duration::from_secs(3600);

/// Used in tests to indicate a forever-long timeout; has to be longer than the amount of time
/// the ADVANCE-based timeouts advance the paused clock by.
const FOREVER: std::time::Duration = std::time::Duration::from_secs(ADVANCE.as_secs() * 24 * 7);

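These constants rely on tokio's paused-clock testing: with the clock paused, `tokio::time::timeout(ADVANCE, &mut fut)` polls `fut` until it parks on an await point and then auto-advances time so the timeout fires, without sleeping for real. A small standalone illustration of the trick follows; it assumes tokio's `test-util` feature, and the oneshot channel is just a stand-in for the layer's status channel.

```rust
use std::time::Duration;
use tokio::sync::oneshot;

#[tokio::test(start_paused = true)]
async fn drive_future_to_await_point() {
    let (tx, rx) = oneshot::channel::<()>();
    let mut fut = std::pin::pin!(rx);

    // The receiver cannot complete yet, so the timeout elapses (instantly,
    // because the paused clock auto-advances) and we get the future back intact.
    tokio::time::timeout(Duration::from_secs(3600), &mut fut)
        .await
        .expect_err("should time out: nothing was sent yet");

    // Once the value is sent, polling the same future completes it.
    tx.send(()).unwrap();
    fut.await.unwrap();
}
```
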
/// Demonstrate the API and resident -> evicted -> resident -> deleted transitions.
|
||||
#[tokio::test]
|
||||
async fn smoke_test() {
|
||||
let handle = BACKGROUND_RUNTIME.handle();
|
||||
|
||||
let h = TenantHarness::create("smoke_test").unwrap();
|
||||
let span = h.span();
|
||||
let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1));
|
||||
let (tenant, _) = h.load().await;
|
||||
|
||||
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Download);
|
||||
|
||||
let timeline = tenant
|
||||
.create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let layer = {
|
||||
let mut layers = {
|
||||
let layers = timeline.layers.read().await;
|
||||
layers.resident_layers().collect::<Vec<_>>().await
|
||||
};
|
||||
|
||||
assert_eq!(layers.len(), 1);
|
||||
|
||||
layers.swap_remove(0)
|
||||
};
|
||||
|
||||
// all layers created at pageserver are like `layer`, initialized with strong
|
||||
// Arc<DownloadedLayer>.
|
||||
|
||||
let img_before = {
|
||||
let mut data = ValueReconstructState::default();
|
||||
layer
|
||||
.get_value_reconstruct_data(CONTROLFILE_KEY, Lsn(0x10)..Lsn(0x11), &mut data, &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
data.img
|
||||
.take()
|
||||
.expect("tenant harness writes the control file")
|
||||
};
|
||||
|
||||
// important part is evicting the layer, which can be done when there are no more ResidentLayer
|
||||
// instances -- there currently are none, only two `Layer` values, one in the layermap and on
|
||||
// in scope.
|
||||
layer.evict_and_wait(FOREVER).await.unwrap();
|
||||
|
||||
// double-evict returns an error, which is valid if both eviction_task and disk usage based
|
||||
// eviction would both evict the same layer at the same time.
|
||||
|
||||
let e = layer.evict_and_wait(FOREVER).await.unwrap_err();
|
||||
assert!(matches!(e, EvictionError::NotFound));
|
||||
|
||||
// on accesses when the layer is evicted, it will automatically be downloaded.
|
||||
let img_after = {
|
||||
let mut data = ValueReconstructState::default();
|
||||
layer
|
||||
.get_value_reconstruct_data(CONTROLFILE_KEY, Lsn(0x10)..Lsn(0x11), &mut data, &ctx)
|
||||
.instrument(download_span.clone())
|
||||
.await
|
||||
.unwrap();
|
||||
data.img.take().unwrap()
|
||||
};
|
||||
|
||||
assert_eq!(img_before, img_after);
|
||||
|
||||
// evict_and_wait can timeout, but it doesn't cancel the evicting itself
|
||||
//
|
||||
// ZERO for timeout does not work reliably, so first take up all spawn_blocking slots to
|
||||
// artificially slow it down.
|
||||
let helper = SpawnBlockingPoolHelper::consume_all_spawn_blocking_threads(handle).await;
|
||||
|
||||
match layer
|
||||
.evict_and_wait(std::time::Duration::ZERO)
|
||||
.await
|
||||
.unwrap_err()
|
||||
{
|
||||
EvictionError::Timeout => {
|
||||
// expected, but note that the eviction is "still ongoing"
|
||||
helper.release().await;
|
||||
// exhaust spawn_blocking pool to ensure it is now complete
|
||||
SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(handle)
|
||||
.await;
|
||||
}
|
||||
other => unreachable!("{other:?}"),
|
||||
}
|
||||
|
||||
// only way to query if a layer is resident is to acquire a ResidentLayer instance.
|
||||
// Layer::keep_resident never downloads, but it might initialize if the layer file is found
|
||||
// downloaded locally.
|
||||
let none = layer.keep_resident().await.unwrap();
|
||||
assert!(
|
||||
none.is_none(),
|
||||
"Expected none, because eviction removed the local file, found: {none:?}"
|
||||
);
|
||||
|
||||
// plain downloading is rarely needed
|
||||
layer
|
||||
.download_and_keep_resident()
|
||||
.instrument(download_span)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
// last important part is deletion on drop: gc and compaction use it for compacted L0 layers
|
||||
// or fully garbage collected layers. deletion means deleting the local file, and scheduling a
|
||||
// deletion of the already unlinked from index_part.json remote file.
|
||||
//
|
||||
// marking a layer to be deleted on drop is irreversible; there is no technical reason against
|
||||
// reversiblity, but currently it is not needed so it is not provided.
|
||||
layer.delete_on_drop();
|
||||
|
||||
let path = layer.local_path().to_owned();
|
||||
|
||||
// wait_drop produces an unconnected to Layer future which will resolve when the
|
||||
// LayerInner::drop has completed.
|
||||
let mut wait_drop = std::pin::pin!(layer.wait_drop());
|
||||
|
||||
// paused time doesn't really work well with timeouts and evict_and_wait, so delay pausing
|
||||
// until here
|
||||
tokio::time::pause();
|
||||
tokio::time::timeout(ADVANCE, &mut wait_drop)
|
||||
.await
|
||||
.expect_err("should had timed out because two strong references exist");
|
||||
|
||||
tokio::fs::metadata(&path)
|
||||
.await
|
||||
.expect("the local layer file still exists");
|
||||
|
||||
let rtc = timeline.remote_client.as_ref().unwrap();
|
||||
|
||||
{
|
||||
let layers = &[layer];
|
||||
let mut g = timeline.layers.write().await;
|
||||
g.finish_gc_timeline(layers);
|
||||
// this just updates the remote_physical_size for demonstration purposes
|
||||
rtc.schedule_gc_update(layers).unwrap();
|
||||
}
|
||||
|
||||
// when strong references are dropped, the file is deleted and remote deletion is scheduled
|
||||
wait_drop.await;
|
||||
|
||||
let e = tokio::fs::metadata(&path)
|
||||
.await
|
||||
.expect_err("the local file is deleted");
|
||||
assert_eq!(e.kind(), std::io::ErrorKind::NotFound);
|
||||
|
||||
rtc.wait_completion().await.unwrap();
|
||||
|
||||
assert_eq!(rtc.get_remote_physical_size(), 0);
|
||||
}
|
||||
use crate::task_mgr::BACKGROUND_RUNTIME;
|
||||
use crate::tenant::harness::TenantHarness;
|
||||
|
||||
/// This test demonstrates a previous hang when a eviction and deletion were requested at the same
|
||||
/// time. Now both of them complete per Arc drop semantics.
|
||||
@@ -201,10 +41,10 @@ async fn evict_and_wait_on_wanted_deleted() {
|
||||
let resident = layer.keep_resident().await.unwrap();
|
||||
|
||||
{
|
||||
let mut evict_and_wait = std::pin::pin!(layer.evict_and_wait(FOREVER));
|
||||
let mut evict_and_wait = std::pin::pin!(layer.evict_and_wait());
|
||||
|
||||
// drive the future to await on the status channel
|
||||
tokio::time::timeout(ADVANCE, &mut evict_and_wait)
|
||||
tokio::time::timeout(std::time::Duration::from_secs(3600), &mut evict_and_wait)
|
||||
.await
|
||||
.expect_err("should had been a timeout since we are holding the layer resident");
|
||||
|
||||
@@ -275,10 +115,10 @@ async fn residency_check_while_evict_and_wait_on_clogged_spawn_blocking() {
|
||||
|
||||
let resident = layer.keep_resident().await.unwrap();
|
||||
|
||||
let mut evict_and_wait = std::pin::pin!(layer.evict_and_wait(FOREVER));
|
||||
let mut evict_and_wait = std::pin::pin!(layer.evict_and_wait());
|
||||
|
||||
// drive the future to await on the status channel
|
||||
tokio::time::timeout(ADVANCE, &mut evict_and_wait)
|
||||
tokio::time::timeout(std::time::Duration::from_secs(3600), &mut evict_and_wait)
|
||||
.await
|
||||
.expect_err("should had been a timeout since we are holding the layer resident");
|
||||
assert_eq!(1, LAYER_IMPL_METRICS.started_evictions.get());
|
||||
@@ -298,7 +138,7 @@ async fn residency_check_while_evict_and_wait_on_clogged_spawn_blocking() {
|
||||
|
||||
// because the keep_resident check alters wanted evicted without sending a message, we will
|
||||
// never get completed
|
||||
let e = tokio::time::timeout(ADVANCE, &mut evict_and_wait)
|
||||
let e = tokio::time::timeout(std::time::Duration::from_secs(3600), &mut evict_and_wait)
|
||||
.await
|
||||
.expect("no timeout, because keep_resident re-initialized")
|
||||
.expect_err("eviction should not have succeeded because re-initialized");
|
||||
@@ -318,10 +158,9 @@ async fn residency_check_while_evict_and_wait_on_clogged_spawn_blocking() {
|
||||
.sum::<u64>()
|
||||
);
|
||||
|
||||
let mut second_eviction = std::pin::pin!(layer.evict_and_wait(FOREVER));
|
||||
let mut second_eviction = std::pin::pin!(layer.evict_and_wait());
|
||||
|
||||
// advance to the wait on the queue
|
||||
tokio::time::timeout(ADVANCE, &mut second_eviction)
|
||||
tokio::time::timeout(std::time::Duration::from_secs(3600), &mut second_eviction)
|
||||
.await
|
||||
.expect_err("timeout because spawn_blocking is clogged");
|
||||
|
||||
@@ -332,12 +171,7 @@ async fn residency_check_while_evict_and_wait_on_clogged_spawn_blocking() {
|
||||
|
||||
helper.release().await;
|
||||
|
||||
// the second_eviction gets to run here
|
||||
//
|
||||
// synchronize to be *strictly* after the second_eviction spawn_blocking run
|
||||
SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(handle).await;
|
||||
|
||||
tokio::time::timeout(ADVANCE, &mut second_eviction)
|
||||
tokio::time::timeout(std::time::Duration::from_secs(3600), &mut second_eviction)
|
||||
.await
|
||||
.expect("eviction goes through now that spawn_blocking is unclogged")
|
||||
.expect("eviction should succeed, because version matches");
|
||||
@@ -427,49 +261,3 @@ impl SpawnBlockingPoolHelper {
|
||||
.await
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn spawn_blocking_pool_helper_actually_works() {
|
||||
// create a custom runtime for which we know and control how many blocking threads it has
|
||||
//
|
||||
// because the amount is not configurable for our helper, expect the same amount as
|
||||
// BACKGROUND_RUNTIME using the tokio defaults would have.
|
||||
let rt = tokio::runtime::Builder::new_current_thread()
|
||||
.max_blocking_threads(512)
|
||||
.enable_all()
|
||||
.build()
|
||||
.unwrap();
|
||||
|
||||
let handle = rt.handle();
|
||||
|
||||
rt.block_on(async move {
|
||||
// this will not return until all threads are spun up and actually executing the code
|
||||
// waiting on `consumed` to be `SpawnBlockingPoolHelper::release`'d.
|
||||
let consumed = SpawnBlockingPoolHelper::consume_all_spawn_blocking_threads(handle).await;
|
||||
|
||||
println!("consumed");
|
||||
|
||||
let mut jh = std::pin::pin!(tokio::task::spawn_blocking(move || {
|
||||
// this will not get to run before we release
|
||||
}));
|
||||
|
||||
println!("spawned");
|
||||
|
||||
tokio::time::timeout(std::time::Duration::from_secs(1), &mut jh)
|
||||
.await
|
||||
.expect_err("the task should not have gotten to run yet");
|
||||
|
||||
println!("tried to join");
|
||||
|
||||
consumed.release().await;
|
||||
|
||||
println!("released");
|
||||
|
||||
tokio::time::timeout(std::time::Duration::from_secs(1), jh)
|
||||
.await
|
||||
.expect("no timeout")
|
||||
.expect("no join error");
|
||||
|
||||
println!("joined");
|
||||
});
|
||||
}
|
||||
|
||||
@@ -10,7 +10,7 @@ mod walreceiver;
|
||||
|
||||
use anyhow::{anyhow, bail, ensure, Context, Result};
|
||||
use bytes::Bytes;
|
||||
use camino::Utf8Path;
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use enumset::EnumSet;
|
||||
use fail::fail_point;
|
||||
use futures::stream::StreamExt;
|
||||
@@ -1512,14 +1512,10 @@ impl Timeline {
|
||||
return Ok(None);
|
||||
};
|
||||
|
||||
// curl has this by default
|
||||
let timeout = std::time::Duration::from_secs(120);
|
||||
|
||||
match local_layer.evict_and_wait(timeout).await {
|
||||
match local_layer.evict_and_wait().await {
|
||||
Ok(()) => Ok(Some(true)),
|
||||
Err(EvictionError::NotFound) => Ok(Some(false)),
|
||||
Err(EvictionError::Downloaded) => Ok(Some(false)),
|
||||
Err(EvictionError::Timeout) => Ok(Some(false)),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -3422,10 +3418,26 @@ impl Timeline {
let _g = span.entered();
let new_delta =
Handle::current().block_on(frozen_layer.write_to_disk(&self_clone, &ctx))?;
let new_delta_path = new_delta.local_path().to_owned();

// The write_to_disk() above calls writer.finish() which already did the fsync of the inodes.
// We just need to fsync the directory in which these inodes are linked,
// which we know to be the timeline directory.
// Sync it to disk.
//
// We must also fsync the timeline dir to ensure the directory entries for
// new layer files are durable.
//
// NB: timeline dir must be synced _after_ the file contents are durable.
// So, two separate fsyncs are required, they mustn't be batched.
//
// TODO: If we're running inside 'flush_frozen_layers' and there are multiple
// files to flush, the fsync overhead can be reduced as follows:
// 1. write them all to temporary file names
// 2. fsync them
// 3. rename to the final name
// 4. fsync the parent directory.
// Note that (1),(2),(3) today happen inside write_to_disk().
//
// FIXME: the writer already fsyncs all data, only rename needs to be fsynced here
par_fsync::par_fsync(&[new_delta_path]).context("fsync of delta layer")?;
par_fsync::par_fsync(&[self_clone
.conf
.timeline_path(&self_clone.tenant_shard_id, &self_clone.timeline_id)])
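The comment block above states the two-fsync rule: first make the file contents durable, then fsync the parent directory so the directory entry itself survives a crash, and never batch the two. A minimal std-only sketch of that ordering (paths are placeholders; opening a directory for fsync like this is Unix-specific):

```rust
use std::fs::{File, OpenOptions};
use std::io::Write;
use std::path::Path;

/// Sketch: durably create `path` with `contents`.
/// Step 1: write + fsync the file itself.
/// Step 2: fsync the parent directory so the new directory entry is durable too.
fn write_durably(path: &Path, contents: &[u8]) -> std::io::Result<()> {
    let mut f = OpenOptions::new()
        .create(true)
        .write(true)
        .truncate(true)
        .open(path)?;
    f.write_all(contents)?;
    f.sync_all()?; // file data + metadata first

    let dir = path.parent().expect("path must have a parent directory");
    File::open(dir)?.sync_all()?; // then the directory entry
    Ok(())
}

fn main() -> std::io::Result<()> {
    write_durably(Path::new("/tmp/example-layer"), b"layer bytes")
}
```
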
@@ -3658,10 +3670,25 @@ impl Timeline {
|
||||
}
|
||||
}
|
||||
|
||||
// The writer.finish() above already did the fsync of the inodes.
|
||||
// We just need to fsync the directory in which these inodes are linked,
|
||||
// which we know to be the timeline directory.
|
||||
if !image_layers.is_empty() {
|
||||
// Sync the new layer to disk before adding it to the layer map, to make sure
|
||||
// we don't garbage collect something based on the new layer, before it has
|
||||
// reached the disk.
|
||||
//
|
||||
// We must also fsync the timeline dir to ensure the directory entries for
|
||||
// new layer files are durable
|
||||
//
|
||||
// Compaction creates multiple image layers. It would be better to create them all
|
||||
// and fsync them all in parallel.
|
||||
let all_paths = image_layers
|
||||
.iter()
|
||||
.map(|layer| layer.local_path().to_owned())
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
par_fsync::par_fsync_async(&all_paths)
|
||||
.await
|
||||
.context("fsync of newly created layer files")?;
|
||||
|
||||
if !all_paths.is_empty() {
|
||||
par_fsync::par_fsync_async(&[self
|
||||
.conf
|
||||
.timeline_path(&self.tenant_shard_id, &self.timeline_id)])
|
||||
@@ -4248,12 +4275,22 @@ impl Timeline {
|
||||
}
|
||||
}
|
||||
|
||||
// The writer.finish() above already did the fsync of the inodes.
|
||||
// We just need to fsync the directory in which these inodes are linked,
|
||||
// which we know to be the timeline directory.
|
||||
// FIXME: the writer already fsyncs all data, only rename needs to be fsynced here
|
||||
let layer_paths: Vec<Utf8PathBuf> = new_layers
|
||||
.iter()
|
||||
.map(|l| l.local_path().to_owned())
|
||||
.collect();
|
||||
|
||||
// Fsync all the layer files and directory using multiple threads to
|
||||
// minimize latency.
|
||||
par_fsync::par_fsync_async(&layer_paths)
|
||||
.await
|
||||
.context("fsync all new layers")?;
|
||||
|
||||
let timeline_dir = self
|
||||
.conf
|
||||
.timeline_path(&self.tenant_shard_id, &self.timeline_id);
|
||||
|
||||
par_fsync::par_fsync_async(&[timeline_dir])
|
||||
.await
|
||||
.context("fsync of timeline dir")?;
|
||||
@@ -5120,7 +5157,8 @@ mod tests {
|
||||
let harness =
|
||||
TenantHarness::create("two_layer_eviction_attempts_at_the_same_time").unwrap();
|
||||
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
let ctx = any_context();
|
||||
let tenant = harness.do_try_load(&ctx).await.unwrap();
|
||||
let timeline = tenant
|
||||
.create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
|
||||
.await
|
||||
@@ -5134,10 +5172,8 @@ mod tests {
|
||||
.expect("should had been resident")
|
||||
.drop_eviction_guard();
|
||||
|
||||
let forever = std::time::Duration::from_secs(120);
|
||||
|
||||
let first = layer.evict_and_wait(forever);
|
||||
let second = layer.evict_and_wait(forever);
|
||||
let first = async { layer.evict_and_wait().await };
|
||||
let second = async { layer.evict_and_wait().await };
|
||||
|
||||
let (first, second) = tokio::join!(first, second);
|
||||
|
||||
@@ -5156,6 +5192,12 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
fn any_context() -> crate::context::RequestContext {
|
||||
use crate::context::*;
|
||||
use crate::task_mgr::*;
|
||||
RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error)
|
||||
}
|
||||
|
||||
async fn find_some_layer(timeline: &Timeline) -> Layer {
|
||||
let layers = timeline.layers.read().await;
|
||||
let desc = layers
|
||||
|
||||
@@ -75,13 +75,14 @@ impl Timeline {
|
||||
|
||||
let keyspace = self.collect_keyspace(end_lsn, ctx).await?;
|
||||
let mut adaptor = TimelineAdaptor::new(self, (end_lsn, keyspace));
|
||||
let ctx_adaptor = RequestContextAdaptor(ctx.clone());
|
||||
|
||||
pageserver_compaction::compact_tiered::compact_tiered(
|
||||
&mut adaptor,
|
||||
end_lsn,
|
||||
target_file_size,
|
||||
fanout,
|
||||
ctx,
|
||||
&ctx_adaptor,
|
||||
)
|
||||
.await?;
|
||||
|
||||
@@ -142,13 +143,13 @@ impl CompactionJobExecutor for TimelineAdaptor {
|
||||
type DeltaLayer = ResidentDeltaLayer;
|
||||
type ImageLayer = ResidentImageLayer;
|
||||
|
||||
type RequestContext = crate::context::RequestContext;
|
||||
type RequestContext = RequestContextAdaptor;
|
||||
|
||||
async fn get_layers(
|
||||
&mut self,
|
||||
key_range: &Range<Key>,
|
||||
lsn_range: &Range<Lsn>,
|
||||
_ctx: &RequestContext,
|
||||
_ctx: &RequestContextAdaptor,
|
||||
) -> anyhow::Result<Vec<OwnArc<PersistentLayerDesc>>> {
|
||||
self.flush_updates().await?;
|
||||
|
||||
@@ -169,7 +170,7 @@ impl CompactionJobExecutor for TimelineAdaptor {
|
||||
&mut self,
|
||||
key_range: &Range<Key>,
|
||||
lsn: Lsn,
|
||||
_ctx: &RequestContext,
|
||||
_ctx: &RequestContextAdaptor,
|
||||
) -> anyhow::Result<Vec<Range<Key>>> {
|
||||
if lsn == self.keyspace.0 {
|
||||
Ok(pageserver_compaction::helpers::intersect_keyspace(
|
||||
@@ -205,7 +206,7 @@ impl CompactionJobExecutor for TimelineAdaptor {
|
||||
&mut self,
|
||||
lsn: Lsn,
|
||||
key_range: &Range<Key>,
|
||||
ctx: &RequestContext,
|
||||
ctx: &RequestContextAdaptor,
|
||||
) -> anyhow::Result<()> {
|
||||
Ok(self.create_image_impl(lsn, key_range, ctx).await?)
|
||||
}
|
||||
@@ -215,7 +216,7 @@ impl CompactionJobExecutor for TimelineAdaptor {
|
||||
lsn_range: &Range<Lsn>,
|
||||
key_range: &Range<Key>,
|
||||
input_layers: &[ResidentDeltaLayer],
|
||||
ctx: &RequestContext,
|
||||
ctx: &RequestContextAdaptor,
|
||||
) -> anyhow::Result<()> {
|
||||
debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end);
|
||||
|
||||
@@ -286,7 +287,7 @@ impl CompactionJobExecutor for TimelineAdaptor {
|
||||
async fn delete_layer(
|
||||
&mut self,
|
||||
layer: &OwnArc<PersistentLayerDesc>,
|
||||
_ctx: &RequestContext,
|
||||
_ctx: &RequestContextAdaptor,
|
||||
) -> anyhow::Result<()> {
|
||||
self.layers_to_delete.push(layer.clone().0);
|
||||
Ok(())
|
||||
@@ -298,7 +299,7 @@ impl TimelineAdaptor {
|
||||
&mut self,
|
||||
lsn: Lsn,
|
||||
key_range: &Range<Key>,
|
||||
ctx: &RequestContext,
|
||||
ctx: &RequestContextAdaptor,
|
||||
) -> Result<(), PageReconstructError> {
|
||||
let timer = self.timeline.metrics.create_images_time_histo.start_timer();
|
||||
|
||||
@@ -360,7 +361,17 @@ impl TimelineAdaptor {
|
||||
}
|
||||
}
|
||||
|
||||
impl CompactionRequestContext for crate::context::RequestContext {}
|
||||
pub struct RequestContextAdaptor(pub RequestContext);
|
||||
|
||||
impl std::ops::Deref for RequestContextAdaptor {
|
||||
type Target = RequestContext;
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
&self.0
|
||||
}
|
||||
}
|
||||
|
||||
impl CompactionRequestContext for RequestContextAdaptor {}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct OwnArc<T>(pub Arc<T>);
|
||||
@@ -438,7 +449,10 @@ impl CompactionLayer<Key> for ResidentDeltaLayer {
|
||||
impl CompactionDeltaLayer<TimelineAdaptor> for ResidentDeltaLayer {
|
||||
type DeltaEntry<'a> = DeltaEntry<'a>;
|
||||
|
||||
async fn load_keys<'a>(&self, ctx: &RequestContext) -> anyhow::Result<Vec<DeltaEntry<'_>>> {
|
||||
async fn load_keys<'a>(
|
||||
&self,
|
||||
ctx: &RequestContextAdaptor,
|
||||
) -> anyhow::Result<Vec<DeltaEntry<'_>>> {
|
||||
self.0.load_keys(ctx).await
|
||||
}
|
||||
}
|
||||
|
||||
@@ -204,7 +204,6 @@ impl Timeline {
|
||||
evicted: usize,
|
||||
errors: usize,
|
||||
not_evictable: usize,
|
||||
timeouts: usize,
|
||||
#[allow(dead_code)]
|
||||
skipped_for_shutdown: usize,
|
||||
}
|
||||
@@ -268,11 +267,7 @@ impl Timeline {
|
||||
let layer = guard.drop_eviction_guard();
|
||||
if no_activity_for > p.threshold {
|
||||
// this could cause a lot of allocations in some cases
|
||||
js.spawn(async move {
|
||||
layer
|
||||
.evict_and_wait(std::time::Duration::from_secs(5))
|
||||
.await
|
||||
});
|
||||
js.spawn(async move { layer.evict_and_wait().await });
|
||||
stats.candidates += 1;
|
||||
}
|
||||
}
|
||||
@@ -285,9 +280,6 @@ impl Timeline {
|
||||
Ok(Err(EvictionError::NotFound | EvictionError::Downloaded)) => {
|
||||
stats.not_evictable += 1;
|
||||
}
|
||||
Ok(Err(EvictionError::Timeout)) => {
|
||||
stats.timeouts += 1;
|
||||
}
|
||||
Err(je) if je.is_cancelled() => unreachable!("not used"),
|
||||
Err(je) if je.is_panic() => {
|
||||
/* already logged */
|
||||
@@ -303,8 +295,7 @@ impl Timeline {
|
||||
stats = join_all => {
|
||||
if stats.candidates == stats.not_evictable {
|
||||
debug!(stats=?stats, "eviction iteration complete");
|
||||
} else if stats.errors > 0 || stats.not_evictable > 0 || stats.timeouts > 0 {
|
||||
// reminder: timeouts are not eviction cancellations
|
||||
} else if stats.errors > 0 || stats.not_evictable > 0 {
|
||||
warn!(stats=?stats, "eviction iteration complete");
|
||||
} else {
|
||||
info!(stats=?stats, "eviction iteration complete");
|
||||
|
||||
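The hunk above spawns one eviction per candidate layer into a `JoinSet` and tallies the outcomes into the stats struct. A stripped-down sketch of that spawn-then-drain shape, where the eviction closure and the counters are stand-ins rather than the pageserver's real types:

```rust
use tokio::task::JoinSet;

#[derive(Debug, Default)]
struct Stats {
    candidates: usize,
    evicted: usize,
    not_evictable: usize,
}

#[tokio::main]
async fn main() {
    let mut js = JoinSet::new();
    let mut stats = Stats::default();

    for layer_id in 0..8u32 {
        stats.candidates += 1;
        // Stand-in for `layer.evict_and_wait().await`: pretend odd layers are busy.
        js.spawn(async move { if layer_id % 2 == 0 { Ok(()) } else { Err("busy") } });
    }

    while let Some(joined) = js.join_next().await {
        match joined {
            Ok(Ok(())) => stats.evicted += 1,
            Ok(Err(_)) => stats.not_evictable += 1,
            Err(join_error) if join_error.is_panic() => { /* already logged elsewhere */ }
            Err(_) => unreachable!("tasks are never cancelled in this sketch"),
        }
    }

    println!("{stats:?}");
}
```
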
@@ -1667,6 +1667,8 @@ mod tests {
|
||||
use super::*;
|
||||
use crate::tenant::harness::*;
|
||||
use crate::tenant::remote_timeline_client::{remote_initdb_archive_path, INITDB_PATH};
|
||||
use crate::tenant::Timeline;
|
||||
use postgres_ffi::v14::xlog_utils::SIZEOF_CHECKPOINT;
|
||||
use postgres_ffi::RELSEG_SIZE;
|
||||
|
||||
use crate::DEFAULT_PG_VERSION;
|
||||
|
||||
@@ -262,7 +262,7 @@ impl PostgresRedoManager {
|
||||
// next request will launch a new one.
|
||||
if let Err(e) = result.as_ref() {
|
||||
error!(
|
||||
"error applying {} WAL records {}..{} ({} bytes) to key {key}, from base image with LSN {} to reconstruct page image at LSN {} n_attempts={}: {:?}",
|
||||
"error applying {} WAL records {}..{} ({} bytes) to base image with LSN {} to reconstruct page image at LSN {} n_attempts={}: {:?}",
|
||||
records.len(),
|
||||
records.first().map(|p| p.0).unwrap_or(Lsn(0)),
|
||||
records.last().map(|p| p.0).unwrap_or(Lsn(0)),
|
||||
|
||||
@@ -252,6 +252,8 @@ mod test {
|
||||
use super::*;
|
||||
use std::collections::HashMap;
|
||||
|
||||
use crate::{pgdatadir_mapping::AuxFilesDirectory, walrecord::NeonWalRecord};
|
||||
|
||||
/// Test [`apply_in_neon`]'s handling of NeonWalRecord::AuxFile
|
||||
#[test]
|
||||
fn apply_aux_file_deltas() -> anyhow::Result<()> {
|
||||
|
||||
@@ -1,5 +1,7 @@
|
||||
use tracing;
|
||||
use tracing::error;
|
||||
use tracing::info;
|
||||
use tracing::instrument;
|
||||
use tracing::{error, info};
|
||||
|
||||
use crate::metrics::WalRedoKillCause;
|
||||
use crate::metrics::WAL_REDO_PROCESS_COUNTERS;
|
||||
|
||||
@@ -21,7 +21,7 @@ SHLIB_LINK_INTERNAL = $(libpq)
|
||||
SHLIB_LINK = -lcurl
|
||||
|
||||
EXTENSION = neon
|
||||
DATA = neon--1.0.sql neon--1.0--1.1.sql neon--1.1--1.2.sql neon--1.2--1.3.sql
|
||||
DATA = neon--1.0.sql neon--1.0--1.1.sql neon--1.1--1.2.sql
|
||||
PGFILEDESC = "neon - cloud storage for PostgreSQL"
|
||||
|
||||
EXTRA_CLEAN = \
|
||||
|
||||
@@ -25,8 +25,6 @@
|
||||
#include "funcapi.h"
|
||||
#include "miscadmin.h"
|
||||
#include "pagestore_client.h"
|
||||
#include "common/hashfn.h"
|
||||
#include "lib/hyperloglog.h"
|
||||
#include "pgstat.h"
|
||||
#include "postmaster/bgworker.h"
|
||||
#include RELFILEINFO_HDR
|
||||
@@ -62,7 +60,6 @@
|
||||
#define BLOCKS_PER_CHUNK 128 /* 1Mb chunk */
|
||||
#define MB ((uint64)1024*1024)
|
||||
|
||||
#define HYPER_LOG_LOG_BIT_WIDTH 10
|
||||
#define SIZE_MB_TO_CHUNKS(size) ((uint32)((size) * MB / BLCKSZ / BLOCKS_PER_CHUNK))
|
||||
|
||||
typedef struct FileCacheEntry
|
||||
@@ -87,8 +84,6 @@ typedef struct FileCacheControl
|
||||
uint64 writes;
|
||||
dlist_head lru; /* double linked list for LRU replacement
|
||||
* algorithm */
|
||||
hyperLogLogState wss_estimation; /* estimation of working set size */
|
||||
uint8_t hyperloglog_hashes[(1 << HYPER_LOG_LOG_BIT_WIDTH) + 1];
|
||||
} FileCacheControl;
|
||||
|
||||
static HTAB *lfc_hash;
|
||||
@@ -237,14 +232,6 @@ lfc_shmem_startup(void)
|
||||
lfc_ctl->writes = 0;
|
||||
dlist_init(&lfc_ctl->lru);
|
||||
|
||||
/* Initialize hyper-log-log structure for estimating working set size */
|
||||
initHyperLogLog(&lfc_ctl->wss_estimation, HYPER_LOG_LOG_BIT_WIDTH);
|
||||
|
||||
/* We need hashes in shared memory */
|
||||
pfree(lfc_ctl->wss_estimation.hashesArr);
|
||||
memset(lfc_ctl->hyperloglog_hashes, 0, sizeof lfc_ctl->hyperloglog_hashes);
|
||||
lfc_ctl->wss_estimation.hashesArr = lfc_ctl->hyperloglog_hashes;
|
||||
|
||||
/* Recreate file cache on restart */
|
||||
fd = BasicOpenFile(lfc_path, O_RDWR | O_CREAT | O_TRUNC);
|
||||
if (fd < 0)
|
||||
@@ -542,11 +529,6 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
|
||||
}
|
||||
|
||||
entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL);
|
||||
|
||||
/* Approximate working set */
|
||||
tag.blockNum = blkno;
|
||||
addHyperLogLog(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag)));
|
||||
|
||||
if (entry == NULL || (entry->bitmap[chunk_offs >> 5] & (1 << (chunk_offs & 31))) == 0)
|
||||
{
|
||||
/* Page is not cached */
|
||||
@@ -985,21 +967,3 @@ local_cache_pages(PG_FUNCTION_ARGS)
|
||||
else
|
||||
SRF_RETURN_DONE(funcctx);
|
||||
}
|
||||
|
||||
PG_FUNCTION_INFO_V1(approximate_working_set_size);
|
||||
|
||||
Datum
|
||||
approximate_working_set_size(PG_FUNCTION_ARGS)
|
||||
{
|
||||
int32 dc = -1;
|
||||
if (lfc_size_limit != 0)
|
||||
{
|
||||
bool reset = PG_GETARG_BOOL(0);
|
||||
LWLockAcquire(lfc_lock, reset ? LW_EXCLUSIVE : LW_SHARED);
|
||||
dc = (int32) estimateHyperLogLog(&lfc_ctl->wss_estimation);
|
||||
if (reset)
|
||||
memset(lfc_ctl->hyperloglog_hashes, 0, sizeof lfc_ctl->hyperloglog_hashes);
|
||||
LWLockRelease(lfc_lock);
|
||||
}
|
||||
PG_RETURN_INT32(dc);
|
||||
}
|
||||
|
||||
@@ -1,9 +0,0 @@
|
||||
\echo Use "ALTER EXTENSION neon UPDATE TO '1.3'" to load this file. \quit
|
||||
|
||||
CREATE FUNCTION approximate_working_set_size(reset bool)
|
||||
RETURNS integer
|
||||
AS 'MODULE_PATHNAME', 'approximate_working_set_size'
|
||||
LANGUAGE C PARALLEL SAFE;
|
||||
|
||||
GRANT EXECUTE ON FUNCTION approximate_working_set_size(bool) TO pg_monitor;
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
# neon extension
|
||||
comment = 'cloud storage for PostgreSQL'
|
||||
default_version = '1.3'
|
||||
default_version = '1.2'
|
||||
module_pathname = '$libdir/neon'
|
||||
relocatable = true
|
||||
trusted = true
|
||||
|
||||
@@ -13,7 +13,7 @@ use proxy::proxy::run_until_cancelled;
|
||||
use tokio::net::TcpListener;
|
||||
|
||||
use anyhow::{anyhow, bail, ensure, Context};
|
||||
use clap::Arg;
|
||||
use clap::{self, Arg};
|
||||
use futures::TryFutureExt;
|
||||
use proxy::console::messages::MetricsAuxInfo;
|
||||
use proxy::stream::{PqStream, Stream};
|
||||
|
||||
proxy/src/cache/project_info.rs
@@ -358,7 +358,8 @@ impl Cache for ProjectInfoCacheImpl {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::scram::ServerSecret;
|
||||
use crate::{console::AuthSecret, scram::ServerSecret};
|
||||
use std::{sync::Arc, time::Duration};
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_project_info_cache_settings() {
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde::Deserialize;
|
||||
use std::fmt;
|
||||
|
||||
use crate::auth::IpPattern;
|
||||
@@ -98,16 +98,7 @@ pub struct MetricsAuxInfo {
|
||||
pub endpoint_id: EndpointId,
|
||||
pub project_id: ProjectId,
|
||||
pub branch_id: BranchId,
|
||||
pub cold_start_info: Option<ColdStartInfo>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize, Clone)]
|
||||
#[serde(rename_all = "snake_case")]
|
||||
pub enum ColdStartInfo {
|
||||
Unknown = 0,
|
||||
Warm = 1,
|
||||
PoolHit = 2,
|
||||
PoolMiss = 3,
|
||||
pub is_cold_start: Option<bool>,
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
@@ -120,7 +111,6 @@ mod tests {
|
||||
"endpoint_id": "endpoint",
|
||||
"project_id": "project",
|
||||
"branch_id": "branch",
|
||||
"cold_start_info": "unknown",
|
||||
})
|
||||
}
|
||||
|
||||
|
||||
@@ -4,7 +4,7 @@ use crate::{
|
||||
};
|
||||
use anyhow::Context;
|
||||
use once_cell::sync::Lazy;
|
||||
use postgres_backend::{AuthType, PostgresBackend, PostgresBackendTCP, QueryError};
|
||||
use postgres_backend::{self, AuthType, PostgresBackend, PostgresBackendTCP, QueryError};
|
||||
use pq_proto::{BeMessage, SINGLE_COL_ROWDESC};
|
||||
use std::{convert::Infallible, future};
|
||||
use tokio::net::{TcpListener, TcpStream};
|
||||
|
||||
@@ -9,7 +9,7 @@ use tracing::{field::display, info_span, Span};
|
||||
use uuid::Uuid;
|
||||
|
||||
use crate::{
|
||||
console::messages::{ColdStartInfo, MetricsAuxInfo},
|
||||
console::messages::MetricsAuxInfo,
|
||||
error::ErrorKind,
|
||||
metrics::{LatencyTimer, ENDPOINT_ERRORS_BY_KIND, ERROR_BY_KIND},
|
||||
BranchId, DbName, EndpointId, ProjectId, RoleName,
|
||||
@@ -42,7 +42,7 @@ pub struct RequestMonitoring {
|
||||
error_kind: Option<ErrorKind>,
|
||||
pub(crate) auth_method: Option<AuthMethod>,
|
||||
success: bool,
|
||||
cold_start_info: Option<ColdStartInfo>,
|
||||
is_cold_start: Option<bool>,
|
||||
|
||||
// extra
|
||||
// This sender is here to keep the request monitoring channel open while requests are taking place.
|
||||
@@ -91,7 +91,7 @@ impl RequestMonitoring {
|
||||
error_kind: None,
|
||||
auth_method: None,
|
||||
success: false,
|
||||
cold_start_info: None,
|
||||
is_cold_start: None,
|
||||
|
||||
sender: LOG_CHAN.get().and_then(|tx| tx.upgrade()),
|
||||
latency_timer: LatencyTimer::new(protocol),
|
||||
@@ -115,7 +115,7 @@ impl RequestMonitoring {
|
||||
self.set_endpoint_id(x.endpoint_id);
|
||||
self.branch = Some(x.branch_id);
|
||||
self.project = Some(x.project_id);
|
||||
self.cold_start_info = x.cold_start_info;
|
||||
self.is_cold_start = x.is_cold_start;
|
||||
}
|
||||
|
||||
pub fn set_project_id(&mut self, project_id: ProjectId) {
|
||||
|
||||
@@ -93,7 +93,7 @@ struct RequestData {
|
||||
/// Or if we make it to proxy_pass
|
||||
success: bool,
|
||||
/// Indicates if the cplane started the new compute node for this request.
|
||||
cold_start_info: Option<String>,
|
||||
is_cold_start: Option<bool>,
|
||||
/// Tracks time from session start (HTTP request/libpq TCP handshake)
|
||||
/// Through to success/failure
|
||||
duration_us: u64,
|
||||
@@ -121,10 +121,7 @@ impl From<RequestMonitoring> for RequestData {
|
||||
region: value.region,
|
||||
error: value.error_kind.as_ref().map(|e| e.to_metric_label()),
|
||||
success: value.success,
|
||||
cold_start_info: value
|
||||
.cold_start_info
|
||||
.as_ref()
|
||||
.map(|x| serde_json::to_string(x).unwrap_or_default()),
|
||||
is_cold_start: value.is_cold_start,
|
||||
duration_us: SystemTime::from(value.first_packet)
|
||||
.elapsed()
|
||||
.unwrap_or_default()
|
||||
@@ -458,7 +455,7 @@ mod tests {
|
||||
region: "us-east-1",
|
||||
error: None,
|
||||
success: rng.gen(),
|
||||
cold_start_info: Some("no".into()),
|
||||
is_cold_start: Some(true),
|
||||
duration_us: rng.gen_range(0..30_000_000),
|
||||
}
|
||||
}
|
||||
@@ -528,16 +525,16 @@ mod tests {
|
||||
assert_eq!(
|
||||
file_stats,
|
||||
[
|
||||
(1314406, 3, 6000),
|
||||
(1314399, 3, 6000),
|
||||
(1314459, 3, 6000),
|
||||
(1314416, 3, 6000),
|
||||
(1314546, 3, 6000),
|
||||
(1314388, 3, 6000),
|
||||
(1314180, 3, 6000),
|
||||
(1314416, 3, 6000),
|
||||
(438359, 1, 2000)
|
||||
]
|
||||
(1315032, 3, 6000),
|
||||
(1315025, 3, 6000),
|
||||
(1315085, 3, 6000),
|
||||
(1315042, 3, 6000),
|
||||
(1315172, 3, 6000),
|
||||
(1315014, 3, 6000),
|
||||
(1314806, 3, 6000),
|
||||
(1315042, 3, 6000),
|
||||
(438563, 1, 2000)
|
||||
],
|
||||
);
|
||||
|
||||
tmpdir.close().unwrap();
|
||||
@@ -566,12 +563,12 @@ mod tests {
|
||||
assert_eq!(
|
||||
file_stats,
|
||||
[
|
||||
(1220668, 5, 10000),
|
||||
(1226818, 5, 10000),
|
||||
(1228612, 5, 10000),
|
||||
(1227974, 5, 10000),
|
||||
(1219252, 5, 10000)
|
||||
]
|
||||
(1220433, 5, 10000),
|
||||
(1226583, 5, 10000),
|
||||
(1228377, 5, 10000),
|
||||
(1227739, 5, 10000),
|
||||
(1219017, 5, 10000)
|
||||
],
|
||||
);
|
||||
|
||||
tmpdir.close().unwrap();
|
||||
@@ -602,12 +599,12 @@ mod tests {
|
||||
assert_eq!(
|
||||
file_stats,
|
||||
[
|
||||
(1206315, 5, 10000),
|
||||
(1206046, 5, 10000),
|
||||
(1206339, 5, 10000),
|
||||
(1206327, 5, 10000),
|
||||
(1206582, 5, 10000)
|
||||
]
|
||||
(1206080, 5, 10000),
|
||||
(1205811, 5, 10000),
|
||||
(1206104, 5, 10000),
|
||||
(1206092, 5, 10000),
|
||||
(1206347, 5, 10000)
|
||||
],
|
||||
);
|
||||
|
||||
tmpdir.close().unwrap();
|
||||
@@ -631,16 +628,16 @@ mod tests {
|
||||
assert_eq!(
|
||||
file_stats,
|
||||
[
|
||||
(1314406, 3, 6000),
|
||||
(1314399, 3, 6000),
|
||||
(1314459, 3, 6000),
|
||||
(1314416, 3, 6000),
|
||||
(1314546, 3, 6000),
|
||||
(1314388, 3, 6000),
|
||||
(1314180, 3, 6000),
|
||||
(1314416, 3, 6000),
|
||||
(438359, 1, 2000)
|
||||
]
|
||||
(1315032, 3, 6000),
|
||||
(1315025, 3, 6000),
|
||||
(1315085, 3, 6000),
|
||||
(1315042, 3, 6000),
|
||||
(1315172, 3, 6000),
|
||||
(1315014, 3, 6000),
|
||||
(1314806, 3, 6000),
|
||||
(1315042, 3, 6000),
|
||||
(438563, 1, 2000)
|
||||
],
|
||||
);
|
||||
|
||||
tmpdir.close().unwrap();
|
||||
@@ -676,7 +673,7 @@ mod tests {
|
||||
// files are smaller than the size threshold, but they took too long to fill so were flushed early
|
||||
assert_eq!(
|
||||
file_stats,
|
||||
[(658837, 2, 3001), (658551, 2, 3000), (658347, 2, 2999)]
|
||||
[(659129, 2, 3001), (658842, 2, 3000), (658638, 2, 2999)],
|
||||
);
|
||||
|
||||
tmpdir.close().unwrap();
|
||||
|
||||
@@ -16,7 +16,7 @@ use crate::console::provider::{CachedAllowedIps, CachedRoleSecret, ConsoleBacken
|
||||
use crate::console::{self, CachedNodeInfo, NodeInfo};
|
||||
use crate::error::ErrorKind;
|
||||
use crate::proxy::retry::{retry_after, NUM_RETRIES_CONNECT};
|
||||
use crate::{http, sasl, scram};
|
||||
use crate::{auth, http, sasl, scram};
|
||||
use anyhow::{bail, Context};
|
||||
use async_trait::async_trait;
|
||||
use rstest::rstest;
|
||||
|
||||
@@ -11,6 +11,7 @@ use bytes::{Bytes, BytesMut};
|
||||
use futures::{SinkExt, StreamExt};
|
||||
use postgres_protocol::message::frontend;
|
||||
use tokio::io::{AsyncReadExt, DuplexStream};
|
||||
use tokio_postgres::config::SslMode;
|
||||
use tokio_postgres::tls::TlsConnect;
|
||||
use tokio_util::codec::{Decoder, Encoder};
|
||||
|
||||
|
||||
@@ -667,6 +667,7 @@ impl<C: ClientInnerExt> Drop for Client<C> {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use env_logger;
|
||||
use std::{mem, sync::atomic::AtomicBool};
|
||||
|
||||
use super::*;
|
||||
|
||||
@@ -19,6 +19,8 @@ use utils::{bin_ser::LeSer, id::TenantTimelineId};
|
||||
|
||||
use crate::SafeKeeperConf;
|
||||
|
||||
use std::convert::TryInto;
|
||||
|
||||
pub const SK_MAGIC: u32 = 0xcafeceefu32;
|
||||
pub const SK_FORMAT_VERSION: u32 = 7;
|
||||
|
||||
@@ -217,9 +219,12 @@ impl Storage for FileStorage {
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use super::FileStorage;
|
||||
use super::*;
|
||||
use crate::SafeKeeperConf;
|
||||
use anyhow::Result;
|
||||
use tokio::fs;
|
||||
use utils::lsn::Lsn;
|
||||
use utils::{id::TenantTimelineId, lsn::Lsn};
|
||||
|
||||
fn stub_conf() -> SafeKeeperConf {
|
||||
let workdir = camino_tempfile::tempdir().unwrap().into_path();
|
||||
|
||||
@@ -2,7 +2,8 @@
|
||||
//! protocol commands.
|
||||
|
||||
use anyhow::Context;
|
||||
use std::str::{self, FromStr};
|
||||
use std::str::FromStr;
|
||||
use std::str::{self};
|
||||
use std::sync::Arc;
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
use tracing::{debug, info, info_span, Instrument};
|
||||
@@ -15,8 +16,8 @@ use crate::safekeeper::Term;
|
||||
use crate::timeline::TimelineError;
|
||||
use crate::wal_service::ConnectionId;
|
||||
use crate::{GlobalTimelines, SafeKeeperConf};
|
||||
use postgres_backend::PostgresBackend;
|
||||
use postgres_backend::QueryError;
|
||||
use postgres_backend::{self, PostgresBackend};
|
||||
use postgres_ffi::PG_TLI;
|
||||
use pq_proto::{BeMessage, FeStartupPacket, RowDescriptor, INT4_OID, TEXT_OID};
|
||||
use regex::Regex;
|
||||
|
||||
@@ -2180,11 +2180,6 @@ class NeonAttachmentService(MetricsGetter):
|
||||
self.stop(immediate=True)
|
||||
|
||||
|
||||
@dataclass
|
||||
class LogCursor:
|
||||
_line_no: int
|
||||
|
||||
|
||||
class NeonPageserver(PgProtocol):
|
||||
"""
|
||||
An object representing a running pageserver.
|
||||
@@ -2348,18 +2343,7 @@ class NeonPageserver(PgProtocol):
|
||||
value = self.http_client().get_metric_value(metric)
|
||||
assert value == 0, f"Nonzero {metric} == {value}"
|
||||
|
||||
def assert_log_contains(
|
||||
self, pattern: str, offset: None | LogCursor = None
|
||||
) -> Tuple[str, LogCursor]:
|
||||
"""Convenient for use inside wait_until()"""
|
||||
|
||||
res = self.log_contains(pattern, offset=offset)
|
||||
assert res is not None
|
||||
return res
|
||||
|
||||
def log_contains(
|
||||
self, pattern: str, offset: None | LogCursor = None
|
||||
) -> Optional[Tuple[str, LogCursor]]:
|
||||
def log_contains(self, pattern: str) -> Optional[str]:
|
||||
"""Check that the pageserver log contains a line that matches the given regex"""
|
||||
logfile = self.workdir / "pageserver.log"
|
||||
if not logfile.exists():
|
||||
@@ -2373,17 +2357,12 @@ class NeonPageserver(PgProtocol):
|
||||
# no guarantee it is already present in the log file. This hasn't
|
||||
# been a problem in practice, our python tests are not fast enough
|
||||
# to hit that race condition.
|
||||
skip_until_line_no = 0 if offset is None else offset._line_no
|
||||
cur_line_no = 0
|
||||
with logfile.open("r") as f:
|
||||
for line in f:
|
||||
if cur_line_no < skip_until_line_no:
|
||||
cur_line_no += 1
|
||||
continue
|
||||
if contains_re.search(line):
|
||||
# found it!
|
||||
cur_line_no += 1
|
||||
return (line, LogCursor(cur_line_no))
|
||||
return line
|
||||
|
||||
return None
|
||||
|
||||
def tenant_attach(
|
||||
|
||||
@@ -286,11 +286,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
|
||||
self.verbose_error(res)
|
||||
|
||||
def tenant_location_conf(
|
||||
self,
|
||||
tenant_id: Union[TenantId, TenantShardId],
|
||||
location_conf=dict[str, Any],
|
||||
flush_ms=None,
|
||||
lazy: Optional[bool] = None,
|
||||
self, tenant_id: Union[TenantId, TenantShardId], location_conf=dict[str, Any], flush_ms=None
|
||||
):
|
||||
body = location_conf.copy()
|
||||
body["tenant_id"] = str(tenant_id)
|
||||
@@ -299,9 +295,6 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
|
||||
if flush_ms is not None:
|
||||
params["flush_ms"] = str(flush_ms)
|
||||
|
||||
if lazy is not None:
|
||||
params["lazy"] = "true" if lazy else "false"
|
||||
|
||||
res = self.put(
|
||||
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/location_config",
|
||||
json=body,
|
||||
|
||||
@@ -20,7 +20,7 @@ def assert_tenant_state(
|
||||
tenant: TenantId,
|
||||
expected_state: str,
|
||||
message: Optional[str] = None,
|
||||
) -> None:
|
||||
):
|
||||
tenant_status = pageserver_http.tenant_status(tenant)
|
||||
log.info(f"tenant_status: {tenant_status}")
|
||||
assert tenant_status["state"]["slug"] == expected_state, message or tenant_status
|
||||
@@ -206,8 +206,8 @@ def wait_for_last_record_lsn(
|
||||
return current_lsn
|
||||
if i % 10 == 0:
|
||||
log.info(
|
||||
"{}/{} waiting for last_record_lsn to reach {}, now {}, iteration {}".format(
|
||||
tenant, timeline, lsn, current_lsn, i + 1
|
||||
"waiting for last_record_lsn to reach {}, now {}, iteration {}".format(
|
||||
lsn, current_lsn, i + 1
|
||||
)
|
||||
)
|
||||
time.sleep(0.1)
|
||||
@@ -292,7 +292,7 @@ def timeline_delete_wait_completed(
|
||||
iterations: int = 20,
|
||||
interval: Optional[float] = None,
|
||||
**delete_args,
|
||||
) -> None:
|
||||
):
|
||||
pageserver_http.timeline_delete(tenant_id=tenant_id, timeline_id=timeline_id, **delete_args)
|
||||
wait_timeline_detail_404(pageserver_http, tenant_id, timeline_id, iterations, interval)
|
||||
|
||||
@@ -302,7 +302,7 @@ def assert_prefix_empty(
|
||||
remote_storage: Optional[RemoteStorage],
|
||||
prefix: Optional[str] = None,
|
||||
allowed_postfix: Optional[str] = None,
|
||||
) -> None:
|
||||
):
|
||||
assert remote_storage is not None
|
||||
response = list_prefix(remote_storage, prefix)
|
||||
keys = response["KeyCount"]
|
||||
|
||||
@@ -252,16 +252,6 @@ class S3Storage:
|
||||
|
||||
log.info(f"deleted {cnt} objects from remote storage")
|
||||
|
||||
def tenant_path(self, tenant_id: TenantId) -> str:
|
||||
return f"{self.prefix_in_bucket}/tenants/{tenant_id}"
|
||||
|
||||
def heatmap_key(self, tenant_id: TenantId) -> str:
|
||||
return f"{self.tenant_path(tenant_id)}/{TENANT_HEATMAP_FILE_NAME}"
|
||||
|
||||
def heatmap_content(self, tenant_id: TenantId):
|
||||
r = self.client.get_object(Bucket=self.bucket_name, Key=self.heatmap_key(tenant_id))
|
||||
return json.loads(r["Body"].read().decode("utf-8"))
|
||||
|
||||
|
||||
RemoteStorage = Union[LocalFsStorage, S3Storage]
|
||||
|
||||
|
||||
@@ -369,12 +369,7 @@ def start_in_background(
|
||||
return spawned_process
|
||||
|
||||
|
||||
WaitUntilRet = TypeVar("WaitUntilRet")
|
||||
|
||||
|
||||
def wait_until(
|
||||
number_of_iterations: int, interval: float, func: Callable[[], WaitUntilRet]
|
||||
) -> WaitUntilRet:
|
||||
def wait_until(number_of_iterations: int, interval: float, func: Fn):
|
||||
"""
|
||||
Wait until 'func' returns successfully, without exception. Returns the
|
||||
last return value from the function.
|
||||
@@ -392,18 +387,6 @@ def wait_until(
|
||||
raise Exception("timed out while waiting for %s" % func) from last_exception
|
||||
|
||||
|
||||
def assert_eq(a, b) -> None:
|
||||
assert a == b
|
||||
|
||||
|
||||
def assert_gt(a, b) -> None:
|
||||
assert a > b
|
||||
|
||||
|
||||
def assert_ge(a, b) -> None:
|
||||
assert a >= b
|
||||
|
||||
|
||||
def run_pg_bench_small(pg_bin: "PgBin", connstr: str):
|
||||
"""
|
||||
Fast way to populate data.
|
||||
|
||||
@@ -63,11 +63,10 @@ def negative_env(neon_env_builder: NeonEnvBuilder) -> Generator[NegativeTests, N
|
||||
]
|
||||
)
|
||||
|
||||
wait_until(
|
||||
50,
|
||||
0.1,
|
||||
lambda: env.pageserver.assert_log_contains(".*Error processing HTTP request: Bad request"),
|
||||
)
|
||||
def log_contains_bad_request():
|
||||
env.pageserver.log_contains(".*Error processing HTTP request: Bad request")
|
||||
|
||||
wait_until(50, 0.1, log_contains_bad_request)
|
||||
|
||||
|
||||
def test_null_body(negative_env: NegativeTests):
|
||||
|
||||
@@ -200,7 +200,7 @@ class EvictionEnv:
|
||||
tenant_ps.http_client().timeline_wait_logical_size(tenant_id, timeline_id)
|
||||
|
||||
def statvfs_called():
|
||||
pageserver.assert_log_contains(".*running mocked statvfs.*")
|
||||
assert pageserver.log_contains(".*running mocked statvfs.*")
|
||||
|
||||
# we most likely have already completed multiple runs
|
||||
wait_until(10, 1, statvfs_called)
|
||||
@@ -533,7 +533,7 @@ def test_pageserver_falls_back_to_global_lru(eviction_env: EvictionEnv, order: E
|
||||
assert actual_change >= target, "eviction must always evict more than target"
|
||||
|
||||
time.sleep(1) # give log time to flush
|
||||
env.neon_env.pageserver.assert_log_contains(GLOBAL_LRU_LOG_LINE)
|
||||
assert env.neon_env.pageserver.log_contains(GLOBAL_LRU_LOG_LINE)
|
||||
env.neon_env.pageserver.allowed_errors.append(".*" + GLOBAL_LRU_LOG_LINE)
|
||||
|
||||
|
||||
@@ -767,7 +767,7 @@ def test_statvfs_error_handling(eviction_env: EvictionEnv):
|
||||
eviction_order=EvictionOrder.ABSOLUTE_ORDER,
|
||||
)
|
||||
|
||||
env.neon_env.pageserver.assert_log_contains(".*statvfs failed.*EIO")
|
||||
assert env.neon_env.pageserver.log_contains(".*statvfs failed.*EIO")
|
||||
env.neon_env.pageserver.allowed_errors.append(".*statvfs failed.*EIO")
|
||||
|
||||
|
||||
@@ -801,9 +801,10 @@ def test_statvfs_pressure_usage(eviction_env: EvictionEnv):
|
||||
eviction_order=EvictionOrder.ABSOLUTE_ORDER,
|
||||
)
|
||||
|
||||
wait_until(
|
||||
10, 1, lambda: env.neon_env.pageserver.assert_log_contains(".*disk usage pressure relieved")
|
||||
)
|
||||
def relieved_log_message():
|
||||
assert env.neon_env.pageserver.log_contains(".*disk usage pressure relieved")
|
||||
|
||||
wait_until(10, 1, relieved_log_message)
|
||||
|
||||
def less_than_max_usage_pct():
|
||||
post_eviction_total_size, _, _ = env.timelines_du(env.pageserver)
|
||||
@@ -844,9 +845,10 @@ def test_statvfs_pressure_min_avail_bytes(eviction_env: EvictionEnv):
|
||||
eviction_order=EvictionOrder.ABSOLUTE_ORDER,
|
||||
)
|
||||
|
||||
wait_until(
|
||||
10, 1, lambda: env.neon_env.pageserver.assert_log_contains(".*disk usage pressure relieved")
|
||||
)
|
||||
def relieved_log_message():
|
||||
assert env.neon_env.pageserver.log_contains(".*disk usage pressure relieved")
|
||||
|
||||
wait_until(10, 1, relieved_log_message)
|
||||
|
||||
def more_than_min_avail_bytes_freed():
|
||||
post_eviction_total_size, _, _ = env.timelines_du(env.pageserver)
|
||||
|
||||
@@ -36,7 +36,7 @@ def test_duplicate_layers(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
|
||||
pg_bin.run_capture(["pgbench", "-i", "-s1", connstr])
|
||||
|
||||
time.sleep(10) # let compaction to be performed
|
||||
env.pageserver.assert_log_contains("compact-level0-phase1-return-same")
|
||||
assert env.pageserver.log_contains("compact-level0-phase1-return-same")
|
||||
|
||||
|
||||
def test_actually_duplicated_l1(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
|
||||
|
||||
@@ -1,84 +0,0 @@
|
||||
from pathlib import Path
|
||||
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.neon_fixtures import NeonEnv
|
||||
|
||||
|
||||
def test_explain_with_lfc_stats(neon_simple_env: NeonEnv):
|
||||
env = neon_simple_env
|
||||
|
||||
cache_dir = Path(env.repo_dir) / "file_cache"
|
||||
cache_dir.mkdir(exist_ok=True)
|
||||
|
||||
branchname = "test_explain_with_lfc_stats"
|
||||
env.neon_cli.create_branch(branchname, "empty")
|
||||
log.info(f"Creating endopint with 1MB shared_buffers and 64 MB LFC for branch {branchname}")
|
||||
endpoint = env.endpoints.create_start(
|
||||
branchname,
|
||||
config_lines=[
|
||||
"shared_buffers='1MB'",
|
||||
f"neon.file_cache_path='{cache_dir}/file.cache'",
|
||||
"neon.max_file_cache_size='128MB'",
|
||||
"neon.file_cache_size_limit='64MB'",
|
||||
],
|
||||
)
|
||||
|
||||
cur = endpoint.connect().cursor()
|
||||
|
||||
log.info(f"preparing some data in {endpoint.connstr()}")
|
||||
|
||||
ddl = """
|
||||
CREATE TABLE pgbench_accounts (
|
||||
aid bigint NOT NULL,
|
||||
bid integer,
|
||||
abalance integer,
|
||||
filler character(84),
|
||||
-- more web-app like columns
|
||||
text_column_plain TEXT DEFAULT repeat('NeonIsCool', 5),
|
||||
jsonb_column_extended JSONB DEFAULT ('{ "tell everyone": [' || repeat('{"Neon": "IsCool"},',9) || ' {"Neon": "IsCool"}]}')::jsonb
|
||||
)
|
||||
WITH (fillfactor='100');
|
||||
"""
|
||||
|
||||
cur.execute(ddl)
|
||||
cur.execute(
|
||||
"insert into pgbench_accounts(aid,bid,abalance,filler) select aid, (aid - 1) / 100000 + 1, 0, '' from generate_series(1, 100000) as aid;"
|
||||
)
|
||||
|
||||
log.info(f"warming up caches with sequential scan in {endpoint.connstr()}")
|
||||
cur.execute("SELECT * FROM pgbench_accounts WHERE abalance > 0")
|
||||
|
||||
log.info("running explain analyze without LFC values to verify they do not show up in the plan")
|
||||
cur.execute("EXPLAIN (ANALYZE, BUFFERS) SELECT * FROM pgbench_accounts WHERE abalance > 0")
|
||||
rows = cur.fetchall()
|
||||
plan = "\n".join(r[0] for r in rows)
|
||||
log.debug(plan)
|
||||
assert "Seq Scan on pgbench_accounts" in plan
|
||||
assert "Buffers: shared hit" in plan
|
||||
assert "File cache: hits=" not in plan
|
||||
log.info("running explain analyze WITH LFC values to verify they do now show up")
|
||||
cur.execute(
|
||||
"EXPLAIN (ANALYZE, BUFFERS,FILECACHE) SELECT * FROM pgbench_accounts WHERE abalance > 0"
|
||||
)
|
||||
rows = cur.fetchall()
|
||||
plan = "\n".join(r[0] for r in rows)
|
||||
log.debug(plan)
|
||||
assert "Seq Scan on pgbench_accounts" in plan
|
||||
assert "Buffers: shared hit" in plan
|
||||
assert "File cache: hits=" in plan
|
||||
log.info("running explain analyze WITH LFC values to verify json output")
|
||||
cur.execute(
|
||||
"EXPLAIN (ANALYZE, BUFFERS,FILECACHE, FORMAT JSON) SELECT * FROM pgbench_accounts WHERE abalance > 0"
|
||||
)
|
||||
jsonplan = cur.fetchall()[0][0]
|
||||
log.debug(jsonplan)
|
||||
# Directly access the 'Plan' part of the first element of the JSON array
|
||||
plan_details = jsonplan[0]["Plan"]
|
||||
|
||||
# Extract "File Cache Hits" and "File Cache Misses"
|
||||
file_cache_hits = plan_details.get("File Cache Hits")
|
||||
file_cache_misses = plan_details.get("File Cache Misses")
|
||||
|
||||
# Now you can assert the values
|
||||
assert file_cache_hits >= 5000, f"Expected File Cache Hits to be > 5000, got {file_cache_hits}"
|
||||
assert file_cache_misses == 0, f"Expected File Cache Misses to be 0, got {file_cache_misses}"
|
||||
@@ -184,13 +184,10 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder):

    # NB: the layer file is unlinked index part now, but, because we made the delete
    # operation stuck, the layer file itself is still in the remote_storage
    wait_until(
        10,
        0.5,
        lambda: env.pageserver.assert_log_contains(
            f".*{tenant_id}.*at failpoint.*{failpoint_name}"
        ),
    )
    def delete_at_pause_point():
        assert env.pageserver.log_contains(f".*{tenant_id}.*at failpoint.*{failpoint_name}")

    wait_until(10, 0.5, delete_at_pause_point)
    future_layer_path = env.pageserver_remote_storage.remote_layer_path(
        tenant_id, timeline_id, future_layer.to_str(), generation=generation_before_detach
    )

@@ -1,74 +0,0 @@
from pathlib import Path

from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnv
from fixtures.utils import query_scalar


def test_lfc_working_set_approximation(neon_simple_env: NeonEnv):
    env = neon_simple_env

    cache_dir = Path(env.repo_dir) / "file_cache"
    cache_dir.mkdir(exist_ok=True)

    branchname = "test_approximate_working_set_size"
    env.neon_cli.create_branch(branchname, "empty")
    log.info(f"Creating endpoint with 1MB shared_buffers and 64 MB LFC for branch {branchname}")
    endpoint = env.endpoints.create_start(
        branchname,
        config_lines=[
            "shared_buffers='1MB'",
            f"neon.file_cache_path='{cache_dir}/file.cache'",
            "neon.max_file_cache_size='128MB'",
            "neon.file_cache_size_limit='64MB'",
        ],
    )

    cur = endpoint.connect().cursor()
    cur.execute("create extension neon")

    log.info(f"preparing some data in {endpoint.connstr()}")

    ddl = """
CREATE TABLE pgbench_accounts (
    aid bigint NOT NULL,
    bid integer,
    abalance integer,
    filler character(84),
    -- more web-app like columns
    text_column_plain TEXT DEFAULT repeat('NeonIsCool', 5),
    jsonb_column_extended JSONB DEFAULT ('{ "tell everyone": [' || repeat('{"Neon": "IsCool"},',9) || ' {"Neon": "IsCool"}]}')::jsonb
)
WITH (fillfactor='100');
    """

    cur.execute(ddl)
    # prepare index access below
    cur.execute(
        "ALTER TABLE ONLY pgbench_accounts ADD CONSTRAINT pgbench_accounts_pkey PRIMARY KEY (aid)"
    )
    cur.execute(
        "insert into pgbench_accounts(aid,bid,abalance,filler) select aid, (aid - 1) / 100000 + 1, 0, '' from generate_series(1, 100000) as aid;"
    )
    # ensure correct query plans and stats
    cur.execute("vacuum ANALYZE pgbench_accounts")
    # determine table size - working set should approximate table size after sequential scan
    pages = query_scalar(cur, "SELECT relpages FROM pg_class WHERE relname = 'pgbench_accounts'")
    log.info(f"pgbench_accounts has {pages} pages, resetting working set to zero")
    cur.execute("select approximate_working_set_size(true)")
    cur.execute(
        'SELECT count(*) FROM pgbench_accounts WHERE abalance > 0 or jsonb_column_extended @> \'{"tell everyone": [{"Neon": "IsCool"}]}\'::jsonb'
    )
    # verify working set size after sequential scan matches table size and reset working set for next test
    blocks = query_scalar(cur, "select approximate_working_set_size(true)")
    log.info(f"working set size after sequential scan on pgbench_accounts {blocks}")
    assert pages * 0.8 < blocks < pages * 1.2
    # run a few point queries with index lookup
    cur.execute("SELECT abalance FROM pgbench_accounts WHERE aid = 4242")
    cur.execute("SELECT abalance FROM pgbench_accounts WHERE aid = 54242")
    cur.execute("SELECT abalance FROM pgbench_accounts WHERE aid = 104242")
    cur.execute("SELECT abalance FROM pgbench_accounts WHERE aid = 204242")
    # verify working set size after some index access of a few select pages only
    blocks = query_scalar(cur, "select approximate_working_set_size(true)")
    log.info(f"working set size after some index access of a few select pages only {blocks}")
    assert blocks < 10
@@ -34,7 +34,7 @@ def test_logging_event_count(neon_env_builder: NeonEnvBuilder, level: str):
    def assert_logged():
        if not log_expected:
            return
        env.pageserver.assert_log_contains(f".*{msg_id}.*")
        assert env.pageserver.log_contains(f".*{msg_id}.*")

    wait_until(10, 0.5, assert_logged)


@@ -23,7 +23,7 @@ def test_neon_extension(neon_env_builder: NeonEnvBuilder):
        # IMPORTANT:
        # If the version has changed, the test should be updated.
        # Ensure that the default version is also updated in the neon.control file
        assert cur.fetchone() == ("1.3",)
        assert cur.fetchone() == ("1.2",)
        cur.execute("SELECT * from neon.NEON_STAT_FILE_CACHE")
        res = cur.fetchall()
        log.info(res)

@@ -1,12 +1,9 @@
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnv
from fixtures.pg_version import PgVersion, skip_on_postgres
from fixtures.pg_version import PgVersion
from fixtures.utils import wait_until


@skip_on_postgres(
    PgVersion.V15, reason="skip on pg15 due to https://github.com/neondatabase/neon/issues/6969"
)
def test_neon_superuser(neon_simple_env: NeonEnv, pg_version: PgVersion):
    env = neon_simple_env
    env.neon_cli.create_branch("test_neon_superuser_publisher", "empty")

@@ -432,7 +432,7 @@ def test_deletion_queue_recovery(

    main_pageserver.start()

    def assert_deletions_submitted(n: int) -> None:
    def assert_deletions_submitted(n: int):
        assert ps_http.get_metric_value("pageserver_deletion_queue_submitted_total") == n

    # After restart, issue a flush to kick the deletion frontend to do recovery.

@@ -1,4 +1,3 @@
import json
import random
from pathlib import Path
from typing import Any, Dict, Optional
@@ -11,7 +10,7 @@ from fixtures.pageserver.utils import (
    poll_for_remote_storage_iterations,
    tenant_delete_wait_completed,
)
from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind, S3Storage
from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind
from fixtures.types import TenantId, TimelineId
from fixtures.utils import wait_until
from fixtures.workload import Workload
@@ -437,7 +436,6 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder):
    )
    env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)
    assert env.attachment_service is not None
    assert isinstance(env.pageserver_remote_storage, S3Storage)  # Satisfy linter

    tenant_id = env.initial_tenant
    timeline_id = env.initial_timeline
@@ -493,35 +491,18 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder):

    # Do evictions on attached pageserver, check secondary follows along
    # ==================================================================
    try:
        log.info("Evicting a layer...")
        layer_to_evict = list_layers(ps_attached, tenant_id, timeline_id)[0]
        some_other_layer = list_layers(ps_attached, tenant_id, timeline_id)[1]
        log.info(f"Victim layer: {layer_to_evict.name}")
        ps_attached.http_client().evict_layer(
            tenant_id, timeline_id, layer_name=layer_to_evict.name
        )
    log.info("Evicting a layer...")
    layer_to_evict = list_layers(ps_attached, tenant_id, timeline_id)[0]
    ps_attached.http_client().evict_layer(tenant_id, timeline_id, layer_name=layer_to_evict.name)

        log.info("Synchronizing after eviction...")
        ps_attached.http_client().tenant_heatmap_upload(tenant_id)
        heatmap_after_eviction = env.pageserver_remote_storage.heatmap_content(tenant_id)
        heatmap_layers = set(
            layer["name"] for layer in heatmap_after_eviction["timelines"][0]["layers"]
        )
        assert layer_to_evict.name not in heatmap_layers
        assert some_other_layer.name in heatmap_layers
    log.info("Synchronizing after eviction...")
    ps_attached.http_client().tenant_heatmap_upload(tenant_id)
    ps_secondary.http_client().tenant_secondary_download(tenant_id)

        ps_secondary.http_client().tenant_secondary_download(tenant_id)

        assert layer_to_evict not in list_layers(ps_attached, tenant_id, timeline_id)
        assert list_layers(ps_attached, tenant_id, timeline_id) == list_layers(
            ps_secondary, tenant_id, timeline_id
        )
    except:
        # On assertion failures, log some details to help with debugging
        heatmap = env.pageserver_remote_storage.heatmap_content(tenant_id)
        log.warn(f"heatmap contents: {json.dumps(heatmap,indent=2)}")
        raise
    assert layer_to_evict not in list_layers(ps_attached, tenant_id, timeline_id)
    assert list_layers(ps_attached, tenant_id, timeline_id) == list_layers(
        ps_secondary, tenant_id, timeline_id
    )

    # Scrub the remote storage
    # ========================

@@ -1,110 +0,0 @@
import asyncio
import time
from typing import Tuple

import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
    NeonEnv,
    NeonEnvBuilder,
    tenant_get_shards,
)
from fixtures.pageserver.http import PageserverHttpClient
from fixtures.pageserver.utils import wait_for_last_record_lsn
from fixtures.types import Lsn, TenantId, TimelineId
from fixtures.utils import wait_until

TIMELINE_COUNT = 10
ENTRIES_PER_TIMELINE = 10_000
CHECKPOINT_TIMEOUT_SECONDS = 60

TENANT_CONF = {
    # Large `checkpoint_distance` effectively disables size
    # based checkpointing.
    "checkpoint_distance": f"{2 * 1024 ** 3}",
    "checkpoint_timeout": f"{CHECKPOINT_TIMEOUT_SECONDS}s",
}


async def run_worker(env: NeonEnv, entries: int) -> Tuple[TenantId, TimelineId, Lsn]:
    tenant, timeline = env.neon_cli.create_tenant(conf=TENANT_CONF)
    with env.endpoints.create_start("main", tenant_id=tenant) as ep:
        conn = await ep.connect_async()
        try:
            await conn.execute("CREATE TABLE IF NOT EXISTS t(key serial primary key, value text)")
            await conn.execute(
                f"INSERT INTO t SELECT i, CONCAT('payload_', i) FROM generate_series(0,{entries}) as i"
            )
        finally:
            await conn.close(timeout=10)

        last_flush_lsn = Lsn(ep.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0])
    return tenant, timeline, last_flush_lsn


async def workload(
    env: NeonEnv, timelines: int, entries: int
) -> list[Tuple[TenantId, TimelineId, Lsn]]:
    workers = [asyncio.create_task(run_worker(env, entries)) for _ in range(timelines)]
    return await asyncio.gather(*workers)


def wait_until_pageserver_is_caught_up(
    env: NeonEnv, last_flush_lsns: list[Tuple[TenantId, TimelineId, Lsn]]
):
    for tenant, timeline, last_flush_lsn in last_flush_lsns:
        shards = tenant_get_shards(env, tenant)
        for tenant_shard_id, pageserver in shards:
            waited = wait_for_last_record_lsn(
                pageserver.http_client(), tenant_shard_id, timeline, last_flush_lsn
            )
            assert waited >= last_flush_lsn


def wait_for_wal_ingest_metric(pageserver_http: PageserverHttpClient) -> float:
    def query():
        value = pageserver_http.get_metric_value("pageserver_wal_ingest_records_received_total")
        assert value is not None
        return value

    # The metric gets initialised on the first update.
    # Retry a few times, but return 0 if it's stable.
    try:
        return float(wait_until(3, 0.5, query))
    except Exception:
        return 0


@pytest.mark.parametrize("immediate_shutdown", [True, False])
def test_pageserver_small_inmemory_layers(
    neon_env_builder: NeonEnvBuilder, immediate_shutdown: bool
):
    """
    Test that open layers get flushed after the `checkpoint_timeout` config
    and do not require WAL reingest upon restart.

    The workload creates a number of timelines and writes some data to each,
    but not enough to trigger flushes via the `checkpoint_distance` config.
    """
    env = neon_env_builder.init_configs()
    env.start()

    last_flush_lsns = asyncio.run(workload(env, TIMELINE_COUNT, ENTRIES_PER_TIMELINE))
    wait_until_pageserver_is_caught_up(env, last_flush_lsns)

    ps_http_client = env.pageserver.http_client()
    total_wal_ingested_before_restart = wait_for_wal_ingest_metric(ps_http_client)

    log.info("Sleeping for checkpoint timeout ...")
    time.sleep(CHECKPOINT_TIMEOUT_SECONDS + 5)

    env.pageserver.restart(immediate=immediate_shutdown)
    wait_until_pageserver_is_caught_up(env, last_flush_lsns)

    total_wal_ingested_after_restart = wait_for_wal_ingest_metric(ps_http_client)

    log.info(f"WAL ingested before restart: {total_wal_ingested_before_restart}")
    log.info(f"WAL ingested after restart: {total_wal_ingested_after_restart}")

    leeway = total_wal_ingested_before_restart * 5 / 100
    assert total_wal_ingested_after_restart <= leeway
@@ -28,14 +28,7 @@ from fixtures.remote_storage import (
    available_remote_storages,
)
from fixtures.types import Lsn, TenantId, TimelineId
from fixtures.utils import (
    assert_eq,
    assert_ge,
    assert_gt,
    print_gc_result,
    query_scalar,
    wait_until,
)
from fixtures.utils import print_gc_result, query_scalar, wait_until
from requests import ReadTimeout


@@ -127,10 +120,10 @@ def test_remote_storage_backup_and_restore(
    log.info(f"upload of checkpoint {checkpoint_number} is done")

    # Check that we had to retry the uploads
    env.pageserver.assert_log_contains(
    assert env.pageserver.log_contains(
        ".*failed to perform remote task UploadLayer.*, will retry.*"
    )
    env.pageserver.assert_log_contains(
    assert env.pageserver.log_contains(
        ".*failed to perform remote task UploadMetadata.*, will retry.*"
    )

@@ -299,9 +292,9 @@ def test_remote_storage_upload_queue_retries(
    print_gc_result(gc_result)
    assert gc_result["layers_removed"] > 0

    wait_until(2, 1, lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="upload"), 0))
    wait_until(2, 1, lambda: assert_eq(get_queued_count(file_kind="index", op_kind="upload"), 0))
    wait_until(2, 1, lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="delete"), 0))
    wait_until(2, 1, lambda: get_queued_count(file_kind="layer", op_kind="upload") == 0)
    wait_until(2, 1, lambda: get_queued_count(file_kind="index", op_kind="upload") == 0)
    wait_until(2, 1, lambda: get_queued_count(file_kind="layer", op_kind="delete") == 0)

    # let all future operations queue up
    configure_storage_sync_failpoints("return")
@@ -329,17 +322,17 @@ def test_remote_storage_upload_queue_retries(
    churn_while_failpoints_active_thread.start()

    # wait for churn thread's data to get stuck in the upload queue
    wait_until(10, 0.5, lambda: assert_gt(get_queued_count(file_kind="layer", op_kind="upload"), 0))
    wait_until(10, 0.5, lambda: assert_ge(get_queued_count(file_kind="index", op_kind="upload"), 2))
    wait_until(10, 0.5, lambda: assert_gt(get_queued_count(file_kind="layer", op_kind="delete"), 0))
    wait_until(10, 0.1, lambda: get_queued_count(file_kind="layer", op_kind="upload") > 0)
    wait_until(10, 0.1, lambda: get_queued_count(file_kind="index", op_kind="upload") >= 2)
    wait_until(10, 0.1, lambda: get_queued_count(file_kind="layer", op_kind="delete") > 0)

    # unblock churn operations
    configure_storage_sync_failpoints("off")

    # ... and wait for them to finish. Exponential back-off in upload queue, so, gracious timeouts.
    wait_until(30, 1, lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="upload"), 0))
    wait_until(30, 1, lambda: assert_eq(get_queued_count(file_kind="index", op_kind="upload"), 0))
    wait_until(30, 1, lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="delete"), 0))
    wait_until(30, 1, lambda: get_queued_count(file_kind="layer", op_kind="upload") == 0)
    wait_until(30, 1, lambda: get_queued_count(file_kind="index", op_kind="upload") == 0)
    wait_until(30, 1, lambda: get_queued_count(file_kind="layer", op_kind="delete") == 0)

    # The churn thread doesn't make progress once it blocks on the first wait_completion() call,
    # so, give it some time to wrap up.
@@ -891,23 +884,26 @@ def wait_upload_queue_empty(
    wait_until(
        2,
        1,
        lambda: assert_eq(
            get_queued_count(client, tenant_id, timeline_id, file_kind="layer", op_kind="upload"), 0
        ),
        lambda: get_queued_count(
            client, tenant_id, timeline_id, file_kind="layer", op_kind="upload"
        )
        == 0,
    )
    wait_until(
        2,
        1,
        lambda: assert_eq(
            get_queued_count(client, tenant_id, timeline_id, file_kind="index", op_kind="upload"), 0
        ),
        lambda: get_queued_count(
            client, tenant_id, timeline_id, file_kind="index", op_kind="upload"
        )
        == 0,
    )
    wait_until(
        2,
        1,
        lambda: assert_eq(
            get_queued_count(client, tenant_id, timeline_id, file_kind="layer", op_kind="delete"), 0
        ),
        lambda: get_queued_count(
            client, tenant_id, timeline_id, file_kind="layer", op_kind="delete"
        )
        == 0,
    )


@@ -116,7 +116,7 @@ def test_sharding_service_smoke(
    # Marking a pageserver offline should migrate tenants away from it.
    env.attachment_service.node_configure(env.pageservers[0].id, {"availability": "Offline"})

    def node_evacuated(node_id: int) -> None:
    def node_evacuated(node_id: int):
        counts = get_node_shard_counts(env, tenant_ids)
        assert counts[node_id] == 0

@@ -146,8 +146,6 @@ def test_sharding_service_smoke(
    for tid in tenant_ids:
        tenant_delete_wait_completed(env.attachment_service.pageserver_api(), tid, 10)

    env.attachment_service.consistency_check()

    # Set a scheduling policy on one node, create all the tenants, observe
    # that the scheduling policy is respected.
    env.attachment_service.node_configure(env.pageservers[1].id, {"scheduling": "Draining"})
@@ -258,8 +256,9 @@ def test_sharding_service_restart(neon_env_builder: NeonEnvBuilder):
    env.attachment_service.consistency_check()


@pytest.mark.parametrize("warm_up", [True, False])
def test_sharding_service_onboarding(neon_env_builder: NeonEnvBuilder, warm_up: bool):
def test_sharding_service_onboarding(
    neon_env_builder: NeonEnvBuilder,
):
    """
    We onboard tenants to the sharding service by treating it as a 'virtual pageserver'
    which provides the /location_config API. This is similar to creating a tenant,
@@ -307,23 +306,6 @@ def test_sharding_service_onboarding(neon_env_builder: NeonEnvBuilder, warm_up:
        },
    )

    if warm_up:
        origin_ps.http_client().tenant_heatmap_upload(tenant_id)

        # We expect to be called via live migration code, which may try to configure the tenant into secondary
        # mode before attaching it.
        virtual_ps_http.tenant_location_conf(
            tenant_id,
            {
                "mode": "Secondary",
                "secondary_conf": {"warm": True},
                "tenant_conf": {},
                "generation": None,
            },
        )

        virtual_ps_http.tenant_secondary_download(tenant_id)

    # Call into attachment service to onboard the tenant
    generation += 1
    virtual_ps_http.tenant_location_conf(
@@ -369,9 +351,7 @@ def test_sharding_service_onboarding(neon_env_builder: NeonEnvBuilder, warm_up:
    assert len(dest_tenants) == 1
    assert TenantId(dest_tenants[0]["id"]) == tenant_id

    # sharding service advances generation by 1 when it first attaches. We started
    # with a nonzero generation so this equality also proves that the generation
    # was properly carried over during onboarding.
    # sharding service advances generation by 1 when it first attaches
    assert dest_tenants[0]["generation"] == generation + 1

    # The onboarded tenant should survive a restart of sharding service
@@ -382,31 +362,6 @@ def test_sharding_service_onboarding(neon_env_builder: NeonEnvBuilder, warm_up:
    dest_ps.stop()
    dest_ps.start()

    # Having onboarded via /location_config, we should also be able to update the
    # TenantConf part of LocationConf, without inadvertently resetting the generation
    modified_tenant_conf = {"max_lsn_wal_lag": 1024 * 1024 * 1024 * 100}
    dest_tenant_before_conf_change = dest_ps.http_client().tenant_status(tenant_id)

    # The generation has moved on since we onboarded
    assert generation != dest_tenant_before_conf_change["generation"]

    virtual_ps_http.tenant_location_conf(
        tenant_id,
        {
            "mode": "AttachedSingle",
            "secondary_conf": None,
            "tenant_conf": modified_tenant_conf,
            # This is intentionally a stale generation
            "generation": generation,
        },
    )
    dest_tenant_after_conf_change = dest_ps.http_client().tenant_status(tenant_id)
    assert (
        dest_tenant_after_conf_change["generation"] == dest_tenant_before_conf_change["generation"]
    )
    dest_tenant_conf_after = dest_ps.http_client().tenant_config(tenant_id)
    assert dest_tenant_conf_after.tenant_specific_overrides == modified_tenant_conf

    env.attachment_service.consistency_check()


@@ -450,7 +405,7 @@ def test_sharding_service_compute_hook(

    env.attachment_service.node_configure(env.pageservers[0].id, {"availability": "Offline"})

    def node_evacuated(node_id: int) -> None:
    def node_evacuated(node_id: int):
        counts = get_node_shard_counts(env, [env.initial_tenant])
        assert counts[node_id] == 0

@@ -712,41 +667,3 @@ def test_sharding_service_auth(neon_env_builder: NeonEnvBuilder):
    svc.request(
        "POST", f"{api}/upcall/v1/re-attach", headers=svc.headers(TokenScope.PAGE_SERVER_API)
    )


def test_sharding_service_tenant_conf(neon_env_builder: NeonEnvBuilder):
    """
    Validate the pageserver-compatible API endpoints for setting and getting tenant conf, without
    supplying the whole LocationConf.
    """

    env = neon_env_builder.init_start()
    tenant_id = env.initial_tenant

    http = env.attachment_service.pageserver_api()

    default_value = "7days"
    new_value = "1h"
    http.set_tenant_config(tenant_id, {"pitr_interval": new_value})

    # Ensure the change landed on the storage controller
    readback_controller = http.tenant_config(tenant_id)
    assert readback_controller.effective_config["pitr_interval"] == new_value
    assert readback_controller.tenant_specific_overrides["pitr_interval"] == new_value

    # Ensure the change made it down to the pageserver
    readback_ps = env.pageservers[0].http_client().tenant_config(tenant_id)
    assert readback_ps.effective_config["pitr_interval"] == new_value
    assert readback_ps.tenant_specific_overrides["pitr_interval"] == new_value

    # Omitting a value clears it. This looks different in storage controller
    # vs. pageserver API calls, because pageserver has defaults.
    http.set_tenant_config(tenant_id, {})
    readback_controller = http.tenant_config(tenant_id)
    assert readback_controller.effective_config["pitr_interval"] is None
    assert readback_controller.tenant_specific_overrides["pitr_interval"] is None
    readback_ps = env.pageservers[0].http_client().tenant_config(tenant_id)
    assert readback_ps.effective_config["pitr_interval"] == default_value
    assert "pitr_interval" not in readback_ps.tenant_specific_overrides

    env.attachment_service.consistency_check()

@@ -270,7 +270,7 @@ eviction_policy = { "kind" = "LayerAccessThreshold", period = "20s", threshold =
        "period": "20s",
        "threshold": "23h",
    }
    assert final_effective_config["max_lsn_wal_lag"] == 1024 * 1024 * 1024
    assert final_effective_config["max_lsn_wal_lag"] == 10 * 1024 * 1024

    # restart the pageserver and ensure that the config is still correct
    env.pageserver.stop()

@@ -505,10 +505,10 @@ def test_tenant_delete_concurrent(
        return ps_http.tenant_delete(tenant_id)

    def hit_remove_failpoint():
        env.pageserver.assert_log_contains(f"at failpoint {BEFORE_REMOVE_FAILPOINT}")
        assert env.pageserver.log_contains(f"at failpoint {BEFORE_REMOVE_FAILPOINT}")

    def hit_run_failpoint():
        env.pageserver.assert_log_contains(f"at failpoint {BEFORE_RUN_FAILPOINT}")
        assert env.pageserver.log_contains(f"at failpoint {BEFORE_RUN_FAILPOINT}")

    with concurrent.futures.ThreadPoolExecutor() as executor:
        background_200_req = executor.submit(delete_tenant)
@@ -612,12 +612,12 @@ def test_tenant_delete_races_timeline_creation(
    Thread(target=timeline_create).start()

    def hit_initdb_upload_failpoint():
        env.pageserver.assert_log_contains(f"at failpoint {BEFORE_INITDB_UPLOAD_FAILPOINT}")
        assert env.pageserver.log_contains(f"at failpoint {BEFORE_INITDB_UPLOAD_FAILPOINT}")

    wait_until(100, 0.1, hit_initdb_upload_failpoint)

    def creation_connection_timed_out():
        env.pageserver.assert_log_contains(
        assert env.pageserver.log_contains(
            "POST.*/timeline.* request was dropped before completing"
        )

@@ -636,7 +636,7 @@ def test_tenant_delete_races_timeline_creation(
    Thread(target=tenant_delete).start()

    def deletion_arrived():
        env.pageserver.assert_log_contains(
        assert env.pageserver.log_contains(
            f"cfg failpoint: {DELETE_BEFORE_CLEANUP_FAILPOINT} pause"
        )

@@ -663,7 +663,7 @@ def test_tenant_delete_races_timeline_creation(
    )

    # Ensure that creation cancelled and deletion didn't end up in broken state or encountered the leftover temp file
    env.pageserver.assert_log_contains(CANCELLED_ERROR)
    assert env.pageserver.log_contains(CANCELLED_ERROR)
    assert not env.pageserver.log_contains(
        ".*ERROR.*delete_tenant.*Timelines directory is not empty after all timelines deletion"
    )

Some files were not shown because too many files have changed in this diff.