Compare commits

29 Commits

Author SHA1 Message Date

John Spray  3107010f7c  pageserver: error type for collect_keyspace  2023-11-10 11:36:13 +00:00
John Spray  ebf151f53a  f http routes  2023-11-10 11:05:41 +00:00
John Spray  6901bf465e  Use secondary mode bits in neon_local migration  2023-11-10 11:05:12 +00:00
John Spray  9c52a30f7a  Merge branch 'jcsp/neon-local-migration' into jcsp/secondary-locations-pt3  2023-11-09 20:15:36 +00:00
John Spray  c4bd1006e8  tests: set a backoff_factor for test HTTP retries  2023-11-09 20:15:10 +00:00
    Previously we were rapid-retrying, which worked when doing local loads for
    activating tenants, but gives us very little time for tenants that are actually
    taking some time to activate, e.g. in tests that use fault injection for remote
    storage, or lots of layers.
John Spray  c5f7c6bd5a  pageserver: add debug_assert_current_span_has_tenant_id  2023-11-09 20:15:10 +00:00
John Spray  028b57a6e2  pageserver: include secondary tenants in disk usage eviction  2023-11-09 17:33:59 +00:00
John Spray  b55e295576  pageserver: pass TenantManager into disk usage eviction task  2023-11-09 17:33:59 +00:00
John Spray  d9e28076c1  pageserver: add Layer::for_secondary  2023-11-09 17:33:59 +00:00
John Spray  8506008d95  pageserver: create timelines/ dir when configuring secondary location  2023-11-09 17:33:59 +00:00
John Spray  8135cd755d  pageserver/http: add testing routes for secondary mode  2023-11-09 17:33:59 +00:00
John Spray  afc8315dad  pageserver: implement flush on location conf update  2023-11-09 17:33:59 +00:00
John Spray  33a66b963f  tests: add tests for secondary mode & multi attach  2023-11-09 17:33:59 +00:00
John Spray  ce29d1588b  tests: add helpers for location_conf  2023-11-09 17:33:59 +00:00
John Spray  30d98307cb  tests: refactor a path helper  2023-11-09 17:33:59 +00:00
John Spray  56fe8ceb97  Refactor Workload into shared location  2023-11-09 17:33:59 +00:00
John Spray  80e18acd8c  pageserver: launch tasks for secondary mode  2023-11-09 17:33:59 +00:00
John Spray  c7dd9d930a  Add Timeline::generate_heatmap, remote client heatmap upload  2023-11-09 17:33:59 +00:00
John Spray  06e0370786  pageserver: TenantManager support for SecondaryTenant  2023-11-09 17:33:58 +00:00
John Spray  c60ec31509  pageserver: add secondary downloader & heatmaps  2023-11-09 17:33:26 +00:00
John Spray  fc3c7aaa6c  pageserver: add TenantConf::enable_heatmap (default false)  2023-11-09 17:33:26 +00:00
John Spray  6f0a225776  pageserver: add secondary-mode metrics  2023-11-09 17:33:26 +00:00
John Spray  497b09ba0d  control_plane: use pageserver/ sub path for local_fs  2023-11-09 10:29:38 +00:00
John Spray  b698bc90e0  tests: explicitly disable remote storage if it is None  2023-11-09 10:29:38 +00:00
John Spray  11e9916e91  Use the same local_fs path as tests  2023-11-09 10:29:38 +00:00
John Spray  28f752ff46  neon_local: don't use default remote storage if user specified it on CLI  2023-11-09 10:29:38 +00:00
John Spray  c27c7d8b6a  control_plane: implement cargo neon migrate  2023-11-09 10:29:38 +00:00
John Spray  c7e66687f0  control_plane: add /inspect to attachment service  2023-11-09 10:29:38 +00:00
John Spray  9fd901b7b2  neon_local: remote storage by default & enable multiple pageservers in init  2023-11-09 09:21:43 +00:00
34 changed files with 3399 additions and 205 deletions

View File

@@ -24,6 +24,16 @@ pub struct AttachHookResponse {
pub gen: Option<u32>,
}
#[derive(Serialize, Deserialize)]
pub struct InspectRequest {
pub tenant_id: TenantId,
}
#[derive(Serialize, Deserialize)]
pub struct InspectResponse {
pub attachment: Option<(u32, NodeId)>,
}
impl AttachmentService {
pub fn from_env(env: &LocalEnv) -> Self {
let path = env.base_data_dir.join("attachments.json");
@@ -101,4 +111,29 @@ impl AttachmentService {
let response = response.json::<AttachHookResponse>()?;
Ok(response.gen)
}
pub fn inspect(&self, tenant_id: TenantId) -> anyhow::Result<Option<(u32, NodeId)>> {
use hyper::StatusCode;
let url = self
.env
.control_plane_api
.clone()
.unwrap()
.join("inspect")
.unwrap();
let client = reqwest::blocking::ClientBuilder::new()
.build()
.expect("Failed to construct http client");
let request = InspectRequest { tenant_id };
let response = client.post(url).json(&request).send()?;
if response.status() != StatusCode::OK {
return Err(anyhow!("Unexpected status {}", response.status()));
}
let response = response.json::<InspectResponse>()?;
Ok(response.attachment)
}
}
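The inspect() helper above gives neon_local a way to ask the attachment service which pageserver (if any) holds a tenant and in which generation. A minimal caller sketch, reusing the imports already present in this file (the function name and printed text are illustrative only):

// Illustration: query the attachment service for a tenant's current attachment.
fn print_attachment(env: &LocalEnv, tenant_id: TenantId) -> anyhow::Result<()> {
    let attachment_service = AttachmentService::from_env(env);
    match attachment_service.inspect(tenant_id)? {
        Some((generation, node_id)) => {
            println!("tenant {tenant_id} attached to node {node_id} in generation {generation}")
        }
        None => println!("tenant {tenant_id} is not attached"),
    }
    Ok(())
}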

View File

@@ -32,7 +32,9 @@ use pageserver_api::control_api::{
ValidateResponseTenant,
};
use control_plane::attachment_service::{AttachHookRequest, AttachHookResponse};
use control_plane::attachment_service::{
AttachHookRequest, AttachHookResponse, InspectRequest, InspectResponse,
};
#[derive(Parser)]
#[command(author, version, about, long_about = None)]
@@ -255,12 +257,28 @@ async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, Ap
)
}
async fn handle_inspect(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
let inspect_req = json_request::<InspectRequest>(&mut req).await?;
let state = get_state(&req).inner.clone();
let locked = state.write().await;
let tenant_state = locked.tenants.get(&inspect_req.tenant_id);
json_response(
StatusCode::OK,
InspectResponse {
attachment: tenant_state.and_then(|s| s.pageserver.map(|ps| (s.generation, ps))),
},
)
}
fn make_router(persistent_state: PersistentState) -> RouterBuilder<hyper::Body, ApiError> {
endpoint::make_router()
.data(Arc::new(State::new(persistent_state)))
.post("/re-attach", |r| request_span(r, handle_re_attach))
.post("/validate", |r| request_span(r, handle_validate))
.post("/attach-hook", |r| request_span(r, handle_attach_hook))
.post("/inspect", |r| request_span(r, handle_inspect))
}
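For reference, a rough sketch of the /inspect wire exchange implied by the Serialize/Deserialize derives above; the concrete JSON encodings (hex string for TenantId, plain integer for NodeId) are assumptions, not captured traffic:

// Hypothetical request/response bodies for POST /inspect.
let req = InspectRequest { tenant_id };
let body = serde_json::to_string(&req)?; // e.g. {"tenant_id":"<hex tenant id>"}
// The handler replies with an InspectResponse, e.g.
// {"attachment":[3,1]} for generation 3 on node 1, or {"attachment":null}.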
#[tokio::main]

View File

@@ -11,13 +11,14 @@ use compute_api::spec::ComputeMode;
use control_plane::attachment_service::AttachmentService;
use control_plane::endpoint::ComputeControlPlane;
use control_plane::local_env::LocalEnv;
use control_plane::pageserver::PageServerNode;
use control_plane::pageserver::{PageServerNode, PAGESERVER_REMOTE_STORAGE_DIR};
use control_plane::safekeeper::SafekeeperNode;
use control_plane::tenant_migration::migrate_tenant;
use control_plane::{broker, local_env};
use pageserver_api::models::TimelineInfo;
use pageserver_api::{
DEFAULT_HTTP_LISTEN_ADDR as DEFAULT_PAGESERVER_HTTP_ADDR,
DEFAULT_PG_LISTEN_ADDR as DEFAULT_PAGESERVER_PG_ADDR,
DEFAULT_HTTP_LISTEN_PORT as DEFAULT_PAGESERVER_HTTP_PORT,
DEFAULT_PG_LISTEN_PORT as DEFAULT_PAGESERVER_PG_PORT,
};
use postgres_backend::AuthType;
use safekeeper_api::{
@@ -46,8 +47,8 @@ const DEFAULT_PG_VERSION: &str = "15";
const DEFAULT_PAGESERVER_CONTROL_PLANE_API: &str = "http://127.0.0.1:1234/";
fn default_conf() -> String {
format!(
fn default_conf(num_pageservers: u16) -> String {
let mut template = format!(
r#"
# Default built-in configuration, defined in main.rs
control_plane_api = '{DEFAULT_PAGESERVER_CONTROL_PLANE_API}'
@@ -55,21 +56,33 @@ control_plane_api = '{DEFAULT_PAGESERVER_CONTROL_PLANE_API}'
[broker]
listen_addr = '{DEFAULT_BROKER_ADDR}'
[[pageservers]]
id = {DEFAULT_PAGESERVER_ID}
listen_pg_addr = '{DEFAULT_PAGESERVER_PG_ADDR}'
listen_http_addr = '{DEFAULT_PAGESERVER_HTTP_ADDR}'
pg_auth_type = '{trust_auth}'
http_auth_type = '{trust_auth}'
[[safekeepers]]
id = {DEFAULT_SAFEKEEPER_ID}
pg_port = {DEFAULT_SAFEKEEPER_PG_PORT}
http_port = {DEFAULT_SAFEKEEPER_HTTP_PORT}
"#,
trust_auth = AuthType::Trust,
)
);
for i in 0..num_pageservers {
let pageserver_id = NodeId(DEFAULT_PAGESERVER_ID.0 + i as u64);
let pg_port = DEFAULT_PAGESERVER_PG_PORT + i;
let http_port = DEFAULT_PAGESERVER_HTTP_PORT + i;
template += &format!(
r#"
[[pageservers]]
id = {pageserver_id}
listen_pg_addr = '127.0.0.1:{pg_port}'
listen_http_addr = '127.0.0.1:{http_port}'
pg_auth_type = '{trust_auth}'
http_auth_type = '{trust_auth}'
"#,
trust_auth = AuthType::Trust,
)
}
template
}
///
@@ -295,6 +308,9 @@ fn parse_timeline_id(sub_match: &ArgMatches) -> anyhow::Result<Option<TimelineId
}
fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> {
let num_pageservers = init_match
.get_one::<u16>("num-pageservers")
.expect("num-pageservers arg has a default");
// Create config file
let toml_file: String = if let Some(config_path) = init_match.get_one::<PathBuf>("config") {
// load and parse the file
@@ -306,7 +322,7 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> {
})?
} else {
// Built-in default config
default_conf()
default_conf(*num_pageservers)
};
let pg_version = init_match
@@ -320,6 +336,9 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> {
env.init(pg_version, force)
.context("Failed to initialize neon repository")?;
// Create remote storage location for default LocalFs remote storage
std::fs::create_dir_all(env.base_data_dir.join(PAGESERVER_REMOTE_STORAGE_DIR))?;
// Initialize pageserver, create initial tenant and timeline.
for ps_conf in &env.pageservers {
PageServerNode::from_env(&env, ps_conf)
@@ -433,6 +452,15 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an
.with_context(|| format!("Tenant config failed for tenant with id {tenant_id}"))?;
println!("tenant {tenant_id} successfully configured on the pageserver");
}
Some(("migrate", matches)) => {
let tenant_id = get_tenant_id(matches, env)?;
let new_pageserver = get_pageserver(env, matches)?;
let new_pageserver_id = new_pageserver.conf.id;
migrate_tenant(env, tenant_id, new_pageserver)?;
println!("tenant {tenant_id} migrated to {}", new_pageserver_id);
}
Some((sub_name, _)) => bail!("Unexpected tenant subcommand '{}'", sub_name),
None => bail!("no tenant subcommand provided"),
}
@@ -867,20 +895,20 @@ fn handle_mappings(sub_match: &ArgMatches, env: &mut local_env::LocalEnv) -> Res
}
}
fn get_pageserver(env: &local_env::LocalEnv, args: &ArgMatches) -> Result<PageServerNode> {
let node_id = if let Some(id_str) = args.get_one::<String>("pageserver-id") {
NodeId(id_str.parse().context("while parsing pageserver id")?)
} else {
DEFAULT_PAGESERVER_ID
};
Ok(PageServerNode::from_env(
env,
env.get_pageserver_conf(node_id)?,
))
}
fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
fn get_pageserver(env: &local_env::LocalEnv, args: &ArgMatches) -> Result<PageServerNode> {
let node_id = if let Some(id_str) = args.get_one::<String>("pageserver-id") {
NodeId(id_str.parse().context("while parsing pageserver id")?)
} else {
DEFAULT_PAGESERVER_ID
};
Ok(PageServerNode::from_env(
env,
env.get_pageserver_conf(node_id)?,
))
}
match sub_match.subcommand() {
Some(("start", subcommand_args)) => {
if let Err(e) = get_pageserver(env, subcommand_args)?
@@ -917,6 +945,20 @@ fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul
}
}
Some(("migrate", subcommand_args)) => {
let pageserver = get_pageserver(env, subcommand_args)?;
//TODO what shutdown strategy should we use here?
if let Err(e) = pageserver.stop(false) {
eprintln!("pageserver stop failed: {}", e);
exit(1);
}
if let Err(e) = pageserver.start(&pageserver_config_overrides(subcommand_args)) {
eprintln!("pageserver start failed: {e}");
exit(1);
}
}
Some(("status", subcommand_args)) => {
match get_pageserver(env, subcommand_args)?.check_status() {
Ok(_) => println!("Page server is up and running"),
@@ -1224,6 +1266,13 @@ fn cli() -> Command {
.help("Force initialization even if the repository is not empty")
.required(false);
let num_pageservers_arg = Arg::new("num-pageservers")
.value_parser(value_parser!(u16))
.long("num-pageservers")
.help("How many pageservers to create (default 1)")
.required(false)
.default_value("1");
Command::new("Neon CLI")
.arg_required_else_help(true)
.version(GIT_VERSION)
@@ -1231,6 +1280,7 @@ fn cli() -> Command {
Command::new("init")
.about("Initialize a new Neon repository, preparing configs for services to start with")
.arg(pageserver_config_args.clone())
.arg(num_pageservers_arg.clone())
.arg(
Arg::new("config")
.long("config")
@@ -1301,6 +1351,10 @@ fn cli() -> Command {
.subcommand(Command::new("config")
.arg(tenant_id_arg.clone())
.arg(Arg::new("config").short('c').num_args(1).action(ArgAction::Append).required(false)))
.subcommand(Command::new("migrate")
.about("Migrate a tenant from one pageserver to another")
.arg(tenant_id_arg.clone())
.arg(pageserver_id_arg.clone()))
)
.subcommand(
Command::new("pageserver")

View File

@@ -14,3 +14,4 @@ pub mod local_env;
pub mod pageserver;
pub mod postgresql_conf;
pub mod safekeeper;
pub mod tenant_migration;

View File

@@ -11,11 +11,14 @@ use std::io::{BufReader, Write};
use std::num::NonZeroU64;
use std::path::PathBuf;
use std::process::{Child, Command};
use std::time::Duration;
use std::{io, result};
use anyhow::{bail, Context};
use camino::Utf8PathBuf;
use pageserver_api::models::{self, TenantInfo, TimelineInfo};
use pageserver_api::models::{
self, LocationConfig, TenantInfo, TenantLocationConfigRequest, TimelineInfo,
};
use postgres_backend::AuthType;
use postgres_connection::{parse_host_port, PgConnectionConfig};
use reqwest::blocking::{Client, RequestBuilder, Response};
@@ -31,6 +34,9 @@ use utils::{
use crate::local_env::PageServerConf;
use crate::{background_process, local_env::LocalEnv};
/// Directory within .neon which will be used by default for LocalFs remote storage.
pub const PAGESERVER_REMOTE_STORAGE_DIR: &str = "local_fs_remote_storage/pageserver";
#[derive(Error, Debug)]
pub enum PageserverHttpError {
#[error("Reqwest error: {0}")]
@@ -98,8 +104,10 @@ impl PageServerNode {
}
}
// pageserver conf overrides defined by neon_local configuration.
fn neon_local_overrides(&self) -> Vec<String> {
/// Merge overrides provided by the user on the command line with our default overrides derived from neon_local configuration.
///
/// These all end up on the command line of the `pageserver` binary.
fn neon_local_overrides(&self, cli_overrides: &[&str]) -> Vec<String> {
let id = format!("id={}", self.conf.id);
// FIXME: the paths should be shell-escaped to handle paths with spaces, quotes etc.
let pg_distrib_dir_param = format!(
@@ -132,12 +140,25 @@ impl PageServerNode {
));
}
if !cli_overrides
.iter()
.any(|c| c.starts_with("remote_storage"))
{
overrides.push(format!(
"remote_storage={{local_path='../{PAGESERVER_REMOTE_STORAGE_DIR}'}}"
));
}
if self.conf.http_auth_type != AuthType::Trust || self.conf.pg_auth_type != AuthType::Trust
{
// Keys are generated in the toplevel repo dir, pageservers' workdirs
// are one level below that, so refer to keys with ../
overrides.push("auth_validation_public_key_path='../auth_public_key.pem'".to_owned());
}
// Apply the user-provided overrides
overrides.extend(cli_overrides.iter().map(|&c| c.to_owned()));
overrides
}
@@ -203,9 +224,6 @@ impl PageServerNode {
}
fn start_node(&self, config_overrides: &[&str], update_config: bool) -> anyhow::Result<Child> {
let mut overrides = self.neon_local_overrides();
overrides.extend(config_overrides.iter().map(|&c| c.to_owned()));
let datadir = self.repo_path();
print!(
"Starting pageserver node {} at '{}' in {:?}",
@@ -248,8 +266,7 @@ impl PageServerNode {
) -> Vec<Cow<'a, str>> {
let mut args = vec![Cow::Borrowed("-D"), Cow::Borrowed(datadir_path_str)];
let mut overrides = self.neon_local_overrides();
overrides.extend(config_overrides.iter().map(|&c| c.to_owned()));
let overrides = self.neon_local_overrides(config_overrides);
for config_override in overrides {
args.push(Cow::Borrowed("-c"));
args.push(Cow::Owned(config_override));
@@ -389,6 +406,11 @@ impl PageServerNode {
.map(|x| x.parse::<bool>())
.transpose()
.context("Failed to parse 'gc_feedback' as bool")?,
enable_heatmap: settings
.remove("enable_heatmap")
.map(|x| x.parse::<bool>())
.transpose()
.context("Failed to parse 'enable_heatmap' as bool")?,
};
let request = models::TenantCreateRequest {
@@ -486,6 +508,11 @@ impl PageServerNode {
.map(|x| x.parse::<bool>())
.transpose()
.context("Failed to parse 'gc_feedback' as bool")?,
enable_heatmap: settings
.remove("enable_heatmap")
.map(|x| x.parse::<bool>())
.transpose()
.context("Failed to parse 'enable_heatmap' as bool")?,
}
};
@@ -501,6 +528,43 @@ impl PageServerNode {
Ok(())
}
pub fn location_config(
&self,
tenant_id: TenantId,
config: LocationConfig,
flush_ms: Option<Duration>,
) -> anyhow::Result<()> {
let req_body = TenantLocationConfigRequest { tenant_id, config };
let path = format!(
"{}/tenant/{}/location_config",
self.http_base_url, tenant_id
);
let path = if let Some(flush_ms) = flush_ms {
format!("{}?flush_ms={}", path, flush_ms.as_millis())
} else {
path
};
self.http_request(Method::PUT, path)?
.json(&req_body)
.send()?
.error_from_body()?;
Ok(())
}
pub fn secondary_download(&self, tenant_id: TenantId) -> anyhow::Result<()> {
self.http_request(
Method::POST,
format!("{}/secondary/{}/download", self.http_base_url, tenant_id),
)?
.send()?
.error_from_body()?;
Ok(())
}
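Together with location_config() above, this gives neon_local everything it needs to stage a warm secondary location before an attach. A minimal sketch, assuming `ps` is a PageServerNode built via from_env() and `tenant_id` is in scope; the model types are the ones that appear later in this compare:

use pageserver_api::models::{
    LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig,
};

// Illustration: configure this pageserver as a warm secondary for the tenant,
// then ask it to download the layers named in the tenant's heatmap.
let conf = LocationConfig {
    mode: LocationConfigMode::Secondary,
    generation: None,
    secondary_conf: Some(LocationConfigSecondary { warm: true }),
    tenant_conf: TenantConfig::default(),
};
// flush_ms (the third argument) only matters when downgrading an attached
// location to stale, so a plain secondary transition passes None.
ps.location_config(tenant_id, conf, None)?;
ps.secondary_download(tenant_id)?;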
pub fn timeline_list(&self, tenant_id: &TenantId) -> anyhow::Result<Vec<TimelineInfo>> {
let timeline_infos: Vec<TimelineInfo> = self
.http_request(

View File

@@ -0,0 +1,214 @@
//!
//! Functionality for migrating tenants across pageservers: unlike most of neon_local, this code
//! isn't scoped to a particular physical service, as it needs to update compute endpoints to
//! point to the new pageserver.
//!
use crate::local_env::LocalEnv;
use crate::{
attachment_service::AttachmentService, endpoint::ComputeControlPlane,
pageserver::PageServerNode,
};
use pageserver_api::models::{
LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig,
};
use std::collections::HashMap;
use std::time::Duration;
use utils::{
generation::Generation,
id::{TenantId, TimelineId},
lsn::Lsn,
};
/// Given an attached pageserver, retrieve the LSN for all timelines
fn get_lsns(
tenant_id: TenantId,
pageserver: &PageServerNode,
) -> anyhow::Result<HashMap<TimelineId, Lsn>> {
let timelines = pageserver.timeline_list(&tenant_id)?;
Ok(timelines
.into_iter()
.map(|t| (t.timeline_id, t.last_record_lsn))
.collect())
}
/// Wait for the timeline LSNs on `pageserver` to catch up with or overtake
/// `baseline`.
fn await_lsn(
tenant_id: TenantId,
pageserver: &PageServerNode,
baseline: HashMap<TimelineId, Lsn>,
) -> anyhow::Result<()> {
loop {
let latest = match get_lsns(tenant_id, pageserver) {
Ok(l) => l,
Err(_e) => {
println!(
"🕑 Waiting for pageserver {} to activate...",
pageserver.conf.id
);
std::thread::sleep(Duration::from_millis(500));
continue;
}
};
let mut any_behind: bool = false;
for (timeline_id, baseline_lsn) in &baseline {
match latest.get(timeline_id) {
Some(latest_lsn) => {
println!("🕑 LSN origin {baseline_lsn} vs destination {latest_lsn}");
if latest_lsn < baseline_lsn {
any_behind = true;
}
}
None => {
// Expected timeline isn't yet visible on migration destination.
// (IRL we would have to account for timeline deletion, but this
// is just a test helper)
any_behind = true;
}
}
}
if !any_behind {
println!("✅ LSN caught up. Proceeding...");
break;
} else {
std::thread::sleep(Duration::from_millis(500));
}
}
Ok(())
}
/// This function spans multiple services, to demonstrate live migration of a tenant
/// between pageservers:
/// - Coordinate attach/secondary/detach on pageservers
/// - call into attachment_service for generations
/// - reconfigure compute endpoints to point to new attached pageserver
pub fn migrate_tenant(
env: &LocalEnv,
tenant_id: TenantId,
dest_ps: PageServerNode,
) -> anyhow::Result<()> {
println!("🤔 Checking existing status...");
let attachment_service = AttachmentService::from_env(env);
let previous = attachment_service.inspect(tenant_id)?;
let mut baseline_lsns = None;
if let Some((generation, origin_ps_id)) = &previous {
let origin_ps = PageServerNode::from_env(env, env.get_pageserver_conf(*origin_ps_id)?);
if origin_ps_id == &dest_ps.conf.id {
println!("🔁 Already attached to {origin_ps_id}, freshening...");
let gen = attachment_service.attach_hook(tenant_id, dest_ps.conf.id)?;
let dest_conf = LocationConfig {
mode: LocationConfigMode::AttachedSingle,
generation: gen.map(Generation::new),
secondary_conf: None,
tenant_conf: TenantConfig::default(),
};
dest_ps.location_config(tenant_id, dest_conf, None)?;
println!("✅ Migration complete");
return Ok(());
}
println!("🔁 Switching origin pageserver {origin_ps_id} to stale mode");
let stale_conf = LocationConfig {
mode: LocationConfigMode::AttachedStale,
generation: Some(Generation::new(*generation)),
secondary_conf: None,
tenant_conf: TenantConfig::default(),
};
origin_ps.location_config(tenant_id, stale_conf, Some(Duration::from_secs(10)))?;
baseline_lsns = Some(get_lsns(tenant_id, &origin_ps)?);
}
println!(
"🔁 Downloading latest layers to destination pageserver {}",
dest_ps.conf.id
);
match dest_ps.secondary_download(tenant_id) {
Ok(()) => {}
Err(_) => {
println!(" (skipping, destination wasn't in secondary mode)")
}
}
// Get a new generation
let gen = attachment_service.attach_hook(tenant_id, dest_ps.conf.id)?;
let dest_conf = LocationConfig {
mode: LocationConfigMode::AttachedMulti,
generation: gen.map(Generation::new),
secondary_conf: None,
tenant_conf: TenantConfig::default(),
};
println!("🔁 Attaching to pageserver {}", dest_ps.conf.id);
dest_ps.location_config(tenant_id, dest_conf, None)?;
if let Some(baseline) = baseline_lsns {
println!("🕑 Waiting for LSN to catch up...");
await_lsn(tenant_id, &dest_ps, baseline)?;
}
let cplane = ComputeControlPlane::load(env.clone())?;
for (endpoint_name, endpoint) in &cplane.endpoints {
if endpoint.tenant_id == tenant_id {
println!(
"🔁 Reconfiguring endpoint {} to use pageserver {}",
endpoint_name, dest_ps.conf.id
);
endpoint.reconfigure(Some(dest_ps.conf.id))?;
}
}
for other_ps_conf in &env.pageservers {
if other_ps_conf.id == dest_ps.conf.id {
continue;
}
let other_ps = PageServerNode::from_env(env, other_ps_conf);
let other_ps_tenants = other_ps.tenant_list()?;
// Check if this tenant is attached
let found = other_ps_tenants
.into_iter()
.map(|t| t.id)
.any(|i| i == tenant_id);
if !found {
continue;
}
// Downgrade to a secondary location
let secondary_conf = LocationConfig {
mode: LocationConfigMode::Secondary,
generation: None,
secondary_conf: Some(LocationConfigSecondary { warm: true }),
tenant_conf: TenantConfig::default(),
};
println!(
"💤 Switching to secondary mode on pageserver {}",
other_ps.conf.id
);
other_ps.location_config(tenant_id, secondary_conf, None)?;
}
println!(
"🔁 Switching to AttachedSingle mode on pageserver {}",
dest_ps.conf.id
);
let dest_conf = LocationConfig {
mode: LocationConfigMode::AttachedSingle,
generation: gen.map(Generation::new),
secondary_conf: None,
tenant_conf: TenantConfig::default(),
};
dest_ps.location_config(tenant_id, dest_conf, None)?;
println!("✅ Migration complete");
Ok(())
}
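The `tenant migrate` subcommand added earlier in this compare is the intended entry point for this function; a minimal programmatic sketch (the wrapper function and the NodeId value are illustrative, not part of this change):

use control_plane::local_env::LocalEnv;
use control_plane::pageserver::PageServerNode;
use control_plane::tenant_migration::migrate_tenant;
use utils::id::{NodeId, TenantId};

// Illustration: migrate a tenant to the pageserver with node id 2.
fn migrate_to_node_2(env: &LocalEnv, tenant_id: TenantId) -> anyhow::Result<()> {
    let dest = PageServerNode::from_env(env, env.get_pageserver_conf(NodeId(2))?);
    migrate_tenant(env, tenant_id, dest)
}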

View File

@@ -236,6 +236,7 @@ pub struct TenantConfig {
pub min_resident_size_override: Option<u64>,
pub evictions_low_residence_duration_metric_threshold: Option<String>,
pub gc_feedback: Option<bool>,
pub enable_heatmap: Option<bool>,
}
/// A flattened analog of a `pageserver::tenant::LocationMode`, which
@@ -324,6 +325,7 @@ impl TenantConfigRequest {
min_resident_size_override: None,
evictions_low_residence_duration_metric_threshold: None,
gc_feedback: None,
enable_heatmap: None,
};
TenantConfigRequest { tenant_id, config }
}
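A small sketch of what opting a tenant into heatmap uploads might look like through these models; `TenantConfigRequest::new` and public field access are assumptions based on the impl block above, not confirmed by this diff:

// Hypothetical: build a tenant config request that enables heatmap uploads.
let mut config_request = TenantConfigRequest::new(tenant_id);
config_request.config.enable_heatmap = Some(true);
// ...then send it to the pageserver's tenant config endpoint as usual.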

View File

@@ -5,6 +5,7 @@ use std::{sync::Arc, time::Duration};
/// Users of a resource call `enter()` to acquire a GateGuard, and the owner of
/// the resource calls `close()` when they want to ensure that all holders of guards
/// have released them, and that no future guards will be issued.
#[derive(Debug)]
pub struct Gate {
/// Each caller of enter() takes one unit from the semaphore. In close(), we
/// take all the units to ensure all GateGuards are destroyed.

View File

@@ -14,7 +14,7 @@ use pageserver::control_plane_client::ControlPlaneClient;
use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task};
use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING};
use pageserver::task_mgr::WALRECEIVER_RUNTIME;
use pageserver::tenant::TenantSharedResources;
use pageserver::tenant::{secondary, TenantSharedResources};
use remote_storage::GenericRemoteStorage;
use tokio::time::Instant;
use tracing::*;
@@ -528,6 +528,18 @@ fn start_pageserver(
}
});
let secondary_controller = if let Some(remote_storage) = &remote_storage {
secondary::spawn_tasks(
conf,
tenant_manager.clone(),
remote_storage.clone(),
background_jobs_barrier.clone(),
shutdown_pageserver.clone(),
)
} else {
secondary::null_controller()
};
// shared state between the disk-usage backed eviction background task and the http endpoint
// that allows triggering disk-usage based eviction manually. note that the http endpoint
// is still accessible even if background task is not configured as long as remote storage has
@@ -539,6 +551,7 @@ fn start_pageserver(
conf,
remote_storage.clone(),
disk_usage_eviction_state.clone(),
tenant_manager.clone(),
background_jobs_barrier.clone(),
)?;
}
@@ -557,6 +570,7 @@ fn start_pageserver(
broker_client.clone(),
disk_usage_eviction_state,
deletion_queue.new_client(),
secondary_controller,
)
.context("Failed to initialize router state")?,
);

View File

@@ -48,20 +48,23 @@ use std::{
};
use anyhow::Context;
use camino::Utf8Path;
use remote_storage::GenericRemoteStorage;
use serde::{Deserialize, Serialize};
use tokio::time::Instant;
use tokio_util::sync::CancellationToken;
use tracing::{debug, error, info, instrument, warn, Instrument};
use utils::completion;
use utils::serde_percent::Percent;
use utils::{
completion,
id::{TenantId, TenantTimelineId},
};
use utils::{id::TimelineId, serde_percent::Percent};
use crate::{
config::PageServerConf,
task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
tenant::{
self,
mgr::TenantManager,
secondary::SecondaryTenant,
storage_layer::{AsLayerDesc, EvictionError, Layer},
Timeline,
},
@@ -87,6 +90,7 @@ pub fn launch_disk_usage_global_eviction_task(
conf: &'static PageServerConf,
storage: GenericRemoteStorage,
state: Arc<State>,
tenant_manager: Arc<TenantManager>,
background_jobs_barrier: completion::Barrier,
) -> anyhow::Result<()> {
let Some(task_config) = &conf.disk_usage_based_eviction else {
@@ -112,8 +116,7 @@ pub fn launch_disk_usage_global_eviction_task(
_ = background_jobs_barrier.wait() => { }
};
disk_usage_eviction_task(&state, task_config, &storage, &conf.tenants_path(), cancel)
.await;
disk_usage_eviction_task(&state, task_config, &storage, tenant_manager, cancel).await;
Ok(())
},
);
@@ -126,7 +129,7 @@ async fn disk_usage_eviction_task(
state: &State,
task_config: &DiskUsageEvictionTaskConfig,
_storage: &GenericRemoteStorage,
tenants_dir: &Utf8Path,
tenant_manager: Arc<TenantManager>,
cancel: CancellationToken,
) {
scopeguard::defer! {
@@ -150,7 +153,8 @@ async fn disk_usage_eviction_task(
async {
let res =
disk_usage_eviction_task_iteration(state, task_config, tenants_dir, &cancel).await;
disk_usage_eviction_task_iteration(state, task_config, &tenant_manager, &cancel)
.await;
match res {
Ok(()) => {}
@@ -181,12 +185,14 @@ pub trait Usage: Clone + Copy + std::fmt::Debug {
async fn disk_usage_eviction_task_iteration(
state: &State,
task_config: &DiskUsageEvictionTaskConfig,
tenants_dir: &Utf8Path,
tenant_manager: &Arc<TenantManager>,
cancel: &CancellationToken,
) -> anyhow::Result<()> {
let usage_pre = filesystem_level_usage::get(tenants_dir, task_config)
let tenants_dir = tenant_manager.get_conf().tenants_path();
let usage_pre = filesystem_level_usage::get(&tenants_dir, task_config)
.context("get filesystem-level disk usage before evictions")?;
let res = disk_usage_eviction_task_iteration_impl(state, usage_pre, cancel).await;
let res =
disk_usage_eviction_task_iteration_impl(state, usage_pre, tenant_manager, cancel).await;
match res {
Ok(outcome) => {
debug!(?outcome, "disk_usage_eviction_iteration finished");
@@ -196,7 +202,7 @@ async fn disk_usage_eviction_task_iteration(
}
IterationOutcome::Finished(outcome) => {
// Verify with statvfs whether we made any real progress
let after = filesystem_level_usage::get(tenants_dir, task_config)
let after = filesystem_level_usage::get(&tenants_dir, task_config)
// It's quite unlikely to hit the error here. Keep the code simple and bail out.
.context("get filesystem-level disk usage after evictions")?;
@@ -271,6 +277,7 @@ struct LayerCount {
pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
state: &State,
usage_pre: U,
tenant_manager: &Arc<TenantManager>,
cancel: &CancellationToken,
) -> anyhow::Result<IterationOutcome<U>> {
// use tokio's mutex to get a Sync guard (instead of std::sync::Mutex)
@@ -290,7 +297,7 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
"running disk usage based eviction due to pressure"
);
let candidates = match collect_eviction_candidates(cancel).await? {
let candidates = match collect_eviction_candidates(tenant_manager, cancel).await? {
EvictionCandidates::Cancelled => {
return Ok(IterationOutcome::Cancelled);
}
@@ -326,7 +333,13 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
// If we get far enough in the list that we start to evict layers that are below
// the tenant's min-resident-size threshold, print a warning, and memorize the disk
// usage at that point, in 'usage_planned_min_resident_size_respecting'.
// Evictions for attached tenants, batched by timeline
let mut batched: HashMap<_, Vec<_>> = HashMap::new();
// Evictions for secondary locations, batched by tenant
let mut secondary_by_tenant: HashMap<TenantId, Vec<(TimelineId, Layer)>> = HashMap::new();
let mut warned = None;
let mut usage_planned = usage_pre;
let mut max_batch_size = 0;
@@ -349,14 +362,22 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
// FIXME: batching makes no sense anymore because of no layermap locking, should just spawn
// tasks to evict all seen layers until we have evicted enough
let batch = batched.entry(TimelineKey(candidate.timeline)).or_default();
match candidate.source {
EvictionCandidateSource::Attached(timeline) => {
let batch = batched.entry(TimelineKey(timeline)).or_default();
// semaphore will later be used to limit eviction concurrency, and we can express at
// most u32 number of permits. unlikely we would have u32::MAX layers to be evicted,
// but fail gracefully by not making batches larger.
if batch.len() < u32::MAX as usize {
batch.push(candidate.layer);
max_batch_size = max_batch_size.max(batch.len());
// semaphore will later be used to limit eviction concurrency, and we can express at
// most u32 number of permits. unlikely we would have u32::MAX layers to be evicted,
// but fail gracefully by not making batches larger.
if batch.len() < u32::MAX as usize {
batch.push(candidate.layer);
max_batch_size = max_batch_size.max(batch.len());
}
}
EvictionCandidateSource::Secondary(ttid) => {
let batch = secondary_by_tenant.entry(ttid.tenant_id).or_default();
batch.push((ttid.timeline_id, candidate.layer));
}
}
}
@@ -372,7 +393,20 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
};
debug!(?usage_planned, "usage planned");
// phase2: evict victims batched by timeline
// phase2 (secondary tenants): evict victims batched by tenant
for (tenant_id, timeline_layers) in secondary_by_tenant {
// Q: Why do we go via TenantManager again rather than just deleting files, or keeping
// an Arc ref to the secondary state?
// A: It's because a given tenant's local storage **belongs** to whoever is currently
// live in the TenantManager. We must avoid a race where we might plan an eviction
// for secondary, and then execute it when the tenant is actually in an attached state.
tenant_manager
.evict_tenant_layers(&tenant_id, timeline_layers)
.instrument(tracing::info_span!("evict_batch", %tenant_id))
.await;
}
// phase2 (attached tenants): evict victims batched by timeline
let mut js = tokio::task::JoinSet::new();
@@ -480,9 +514,18 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
}))
}
// An eviction candidate might originate from either an attached tenant
// with a [`Tenant`] and [`Timeline`] object, or from a secondary tenant
// location. These differ in how we will execute the eviction.
#[derive(Clone)]
enum EvictionCandidateSource {
Attached(Arc<Timeline>),
Secondary(TenantTimelineId),
}
#[derive(Clone)]
struct EvictionCandidate {
timeline: Arc<Timeline>,
source: EvictionCandidateSource,
layer: Layer,
last_activity_ts: SystemTime,
}
@@ -532,30 +575,25 @@ enum EvictionCandidates {
/// after exhausting the `Above` partition.
/// So, we did not respect each tenant's min_resident_size.
async fn collect_eviction_candidates(
tenant_manager: &Arc<TenantManager>,
cancel: &CancellationToken,
) -> anyhow::Result<EvictionCandidates> {
// get a snapshot of the list of tenants
let tenants = tenant::mgr::list_tenants()
.await
.context("get list of tenants")?;
let mut candidates = Vec::new();
for (tenant_id, _state) in &tenants {
let tenants = tenant_manager.get_attached_tenants();
for tenant in tenants {
if cancel.is_cancelled() {
return Ok(EvictionCandidates::Cancelled);
}
let tenant = match tenant::mgr::get_tenant(*tenant_id, true) {
Ok(tenant) => tenant,
Err(e) => {
// this can happen if tenant has lifecycle transition after we fetched it
debug!("failed to get tenant: {e:#}");
continue;
}
};
if tenant.cancel.is_cancelled() {
info!(%tenant_id, "Skipping tenant for eviction, it is shutting down");
info!(tenant_id=%tenant.get_tenant_id(), "Skipping tenant for eviction, it is shutting down");
}
if !tenant.is_active() {
debug!(tenant_id=%tenant.get_tenant_id(), "Ignoring non-active tenant for eviction");
continue;
}
@@ -620,7 +658,7 @@ async fn collect_eviction_candidates(
for (timeline, layer_info) in tenant_candidates.into_iter() {
let file_size = layer_info.file_size();
let candidate = EvictionCandidate {
timeline,
source: EvictionCandidateSource::Attached(timeline),
last_activity_ts: layer_info.last_activity_ts,
layer: layer_info.layer,
};
@@ -634,6 +672,60 @@ async fn collect_eviction_candidates(
}
}
// FIXME: this is a long loop over all secondary locations. At the least, respect
// cancellation here, but really we need to break up the loop. We could extract the
// Arc<SecondaryTenant>s and iterate over them with some tokio yields in there. Ideally
// though we should just reduce the total amount of work: our eviction goals do not require
// listing absolutely every layer in every tenant: we could sample this.
let mut secondary_tenants = Vec::new();
tenant_manager.foreach_secondary_tenants(
|tenant_id: &TenantId, state: &Arc<SecondaryTenant>| {
secondary_tenants.push((*tenant_id, state.clone()));
},
);
for (tenant_id, secondary_tenant) in secondary_tenants {
for (timeline_id, layer_info) in secondary_tenant
.get_layers_for_eviction(tenant_manager.get_conf(), tenant_id)
.instrument(tracing::info_span!("get_layers_for_eviction", %tenant_id))
.await
{
let mut tenant_candidates = Vec::new();
debug!(tenant_id=%tenant_id, timeline_id=%timeline_id, "timeline resident layers (secondary) count: {}", layer_info.resident_layers.len());
tenant_candidates.extend(
layer_info
.resident_layers
.into_iter()
.map(|layer_infos| (timeline_id, layer_infos)),
);
tenant_candidates.sort_unstable_by_key(|(_, layer_info)| {
std::cmp::Reverse(layer_info.last_activity_ts)
});
candidates.extend(
tenant_candidates
.into_iter()
.map(|(timeline_id, candidate)| {
(
// Secondary locations' layers are always considered above the min resident size,
// i.e. secondary locations are permitted to be trimmed to zero layers if all
// the layers have sufficiently old access times.
MinResidentSizePartition::Above,
EvictionCandidate {
source: EvictionCandidateSource::Secondary(TenantTimelineId {
tenant_id,
timeline_id,
}),
last_activity_ts: candidate.last_activity_ts,
layer: candidate.layer,
},
)
}),
);
}
}
debug_assert!(MinResidentSizePartition::Above < MinResidentSizePartition::Below,
"as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first");
candidates

View File

@@ -1303,6 +1303,8 @@ components:
type: integer
trace_read_requests:
type: boolean
enable_heatmap:
type: boolean
TenantConfigResponse:
type: object
properties:

View File

@@ -4,6 +4,7 @@
use std::collections::HashMap;
use std::str::FromStr;
use std::sync::Arc;
use std::time::Duration;
use anyhow::{anyhow, Context, Result};
use futures::TryFutureExt;
@@ -39,6 +40,7 @@ use crate::tenant::mgr::{
GetTenantError, SetNewTenantConfigError, TenantManager, TenantMapError, TenantMapInsertError,
TenantSlotError, TenantSlotUpsertError, TenantStateError,
};
use crate::tenant::secondary::SecondaryController;
use crate::tenant::size::ModelInputs;
use crate::tenant::storage_layer::LayerAccessStatsReset;
use crate::tenant::timeline::Timeline;
@@ -71,9 +73,11 @@ pub struct State {
broker_client: storage_broker::BrokerClientChannel,
disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
deletion_queue_client: DeletionQueueClient,
secondary_controller: SecondaryController,
}
impl State {
#[allow(clippy::too_many_arguments)]
pub fn new(
conf: &'static PageServerConf,
tenant_manager: Arc<TenantManager>,
@@ -82,6 +86,7 @@ impl State {
broker_client: storage_broker::BrokerClientChannel,
disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
deletion_queue_client: DeletionQueueClient,
secondary_controller: SecondaryController,
) -> anyhow::Result<Self> {
let allowlist_routes = ["/v1/status", "/v1/doc", "/swagger.yml", "/metrics"]
.iter()
@@ -96,6 +101,7 @@ impl State {
broker_client,
disk_usage_eviction_state,
deletion_queue_client,
secondary_controller,
})
}
@@ -1139,6 +1145,9 @@ async fn put_tenant_location_config_handler(
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let request_data: TenantLocationConfigRequest = json_request(&mut request).await?;
let flush = parse_query_param(&request, "flush_ms")?.map(Duration::from_millis);
let tenant_id = request_data.tenant_id;
check_permission(&request, Some(tenant_id))?;
@@ -1168,13 +1177,26 @@ async fn put_tenant_location_config_handler(
state
.tenant_manager
.upsert_location(tenant_id, location_conf, &ctx)
.upsert_location(tenant_id, location_conf, flush, &ctx)
.await
// TODO: badrequest assumes the caller was asking for something unreasonable, but in
// principle we might have hit something like concurrent API calls to the same tenant,
// which is not a 400 but a 409.
.map_err(ApiError::BadRequest)?;
if let Some(_flush_ms) = flush {
match state.secondary_controller.upload_tenant(tenant_id).await {
Ok(()) => {
tracing::info!("Uploaded heatmap during flush");
}
Err(e) => {
tracing::warn!("Failed to flush heatmap: {e}");
}
}
} else {
tracing::info!("No flush requested when configuring");
}
json_response(StatusCode::OK, ())
}
@@ -1478,7 +1500,7 @@ async fn timeline_collect_keyspace(
let keys = timeline
.collect_keyspace(at_lsn, &ctx)
.await
.map_err(ApiError::InternalServerError)?;
.map_err(|e| ApiError::InternalServerError(e.into()))?;
json_response(StatusCode::OK, Partitioning { keys, at_lsn })
}
@@ -1559,11 +1581,12 @@ async fn disk_usage_eviction_run(
)));
}
let state = state.disk_usage_eviction_state.clone();
let eviction_state = state.disk_usage_eviction_state.clone();
let cancel = CancellationToken::new();
let child_cancel = cancel.clone();
let _g = cancel.drop_guard();
let tenant_manager = state.tenant_manager.clone();
crate::task_mgr::spawn(
crate::task_mgr::BACKGROUND_RUNTIME.handle(),
@@ -1574,8 +1597,9 @@ async fn disk_usage_eviction_run(
false,
async move {
let res = crate::disk_usage_eviction_task::disk_usage_eviction_task_iteration_impl(
&state,
&eviction_state,
usage,
&tenant_manager,
&child_cancel,
)
.await;
@@ -1593,6 +1617,36 @@ async fn disk_usage_eviction_run(
json_response(StatusCode::OK, response)
}
async fn secondary_download_handler(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let state = get_state(&request);
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
state
.secondary_controller
.download_tenant(tenant_id)
.await
.map_err(ApiError::InternalServerError)?;
json_response(StatusCode::OK, ())
}
async fn secondary_upload_handler(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let state = get_state(&request);
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
state
.secondary_controller
.upload_tenant(tenant_id)
.await
.map_err(ApiError::InternalServerError)?;
json_response(StatusCode::OK, ())
}
async fn handler_404(_: Request<Body>) -> Result<Response<Body>, ApiError> {
json_response(
StatusCode::NOT_FOUND,
@@ -1834,6 +1888,12 @@ pub fn make_router(
.put("/v1/deletion_queue/flush", |r| {
api_handler(r, deletion_queue_flush)
})
.post("/v1/secondary/:tenant_id/upload", |r| {
api_handler(r, secondary_upload_handler)
})
.post("/v1/secondary/:tenant_id/download", |r| {
api_handler(r, secondary_download_handler)
})
.put("/v1/tenant/:tenant_id/break", |r| {
testing_api_handler("set tenant state to broken", r, handle_tenant_break)
})

View File

@@ -1043,6 +1043,29 @@ pub(crate) static DELETION_QUEUE: Lazy<DeletionQueueMetrics> = Lazy::new(|| {
}
});
pub(crate) struct SecondaryModeMetrics {
pub(crate) upload_heatmap: IntCounter,
pub(crate) download_heatmap: IntCounter,
pub(crate) download_layer: IntCounter,
}
pub(crate) static SECONDARY_MODE: Lazy<SecondaryModeMetrics> = Lazy::new(|| SecondaryModeMetrics {
upload_heatmap: register_int_counter!(
"pageserver_secondary_upload_heatmap",
"Number of heatmaps written to remote storage by attached tenants"
)
.expect("failed to define a metric"),
download_heatmap: register_int_counter!(
"pageserver_secondary_download_heatmap",
"Number of downloads of heatmaps by secondary mode locations"
)
.expect("failed to define a metric"),
download_layer: register_int_counter!(
"pageserver_secondary_download_layer",
"Number of downloads of layers by secondary mode locations"
)
.expect("failed to define a metric"),
});
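These are ordinary IntCounters, so the secondary-mode tasks are expected to bump them with plain increments at their upload/download call sites; the call sites themselves are not in this hunk, but the usage would be along these lines:

// Illustration: increment the new counters from the secondary-mode code paths.
crate::metrics::SECONDARY_MODE.upload_heatmap.inc();
crate::metrics::SECONDARY_MODE.download_heatmap.inc();
crate::metrics::SECONDARY_MODE.download_layer.inc();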
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum RemoteOpKind {
Upload,

View File

@@ -22,6 +22,7 @@ use std::collections::{hash_map, HashMap, HashSet};
use std::ops::ControlFlow;
use std::ops::Range;
use tracing::{debug, trace, warn};
use utils::bin_ser::DeserializeError;
use utils::{bin_ser::BeSer, lsn::Lsn};
/// Block number within a relation or SLRU. This matches PostgreSQL's BlockNumber type.
@@ -43,6 +44,25 @@ pub enum CalculateLogicalSizeError {
Other(#[from] anyhow::Error),
}
#[derive(Debug, thiserror::Error)]
pub(crate) enum CollectKeySpaceError {
#[error(transparent)]
Decode(#[from] DeserializeError),
#[error(transparent)]
PageRead(PageReconstructError),
#[error("cancelled")]
Cancelled,
}
impl From<PageReconstructError> for CollectKeySpaceError {
fn from(err: PageReconstructError) -> Self {
match err {
PageReconstructError::Cancelled => Self::Cancelled,
err => Self::PageRead(err),
}
}
}
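Callers of collect_keyspace now receive a typed error instead of a bare anyhow::Error; a sketch of handling it at a call site (the reaction to each variant is illustrative):

// Illustration: distinguish shutdown from genuine failures when collecting keyspace.
match timeline.collect_keyspace(lsn, ctx).await {
    Ok(_keyspace) => { /* partition and use the keyspace */ }
    Err(CollectKeySpaceError::Cancelled) => { /* timeline shutting down: bail quietly */ }
    Err(e) => return Err(anyhow::anyhow!(e)),
}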
impl From<PageReconstructError> for CalculateLogicalSizeError {
fn from(pre: PageReconstructError) -> Self {
match pre {
@@ -605,11 +625,11 @@ impl Timeline {
/// Get a KeySpace that covers all the Keys that are in use at the given LSN.
/// Anything that's not listed may be removed from the underlying storage (from
/// that LSN forwards).
pub async fn collect_keyspace(
pub(crate) async fn collect_keyspace(
&self,
lsn: Lsn,
ctx: &RequestContext,
) -> anyhow::Result<KeySpace> {
) -> Result<KeySpace, CollectKeySpaceError> {
// Iterate through key ranges, greedily packing them into partitions
let mut result = KeySpaceAccum::new();
@@ -618,7 +638,7 @@ impl Timeline {
// Fetch list of database dirs and iterate them
let buf = self.get(DBDIR_KEY, lsn, ctx).await?;
let dbdir = DbDirectory::des(&buf).context("deserialization failure")?;
let dbdir = DbDirectory::des(&buf)?;
let mut dbs: Vec<(Oid, Oid)> = dbdir.dbdirs.keys().cloned().collect();
dbs.sort_unstable();
@@ -651,7 +671,7 @@ impl Timeline {
let slrudir_key = slru_dir_to_key(kind);
result.add_key(slrudir_key);
let buf = self.get(slrudir_key, lsn, ctx).await?;
let dir = SlruSegmentDirectory::des(&buf).context("deserialization failure")?;
let dir = SlruSegmentDirectory::des(&buf)?;
let mut segments: Vec<u32> = dir.segments.iter().cloned().collect();
segments.sort_unstable();
for segno in segments {
@@ -669,7 +689,7 @@ impl Timeline {
// Then pg_twophase
result.add_key(TWOPHASEDIR_KEY);
let buf = self.get(TWOPHASEDIR_KEY, lsn, ctx).await?;
let twophase_dir = TwoPhaseDirectory::des(&buf).context("deserialization failure")?;
let twophase_dir = TwoPhaseDirectory::des(&buf)?;
let mut xids: Vec<TransactionId> = twophase_dir.xids.iter().cloned().collect();
xids.sort_unstable();
for xid in xids {

View File

@@ -257,6 +257,12 @@ pub enum TaskKind {
/// See [`crate::disk_usage_eviction_task`].
DiskUsageEviction,
/// See [`crate::tenant::secondary`].
SecondaryDownloads,
/// See [`crate::tenant::secondary`].
SecondaryUploads,
// Initial logical size calculation
InitialLogicalSizeCalculation,

View File

@@ -66,6 +66,7 @@ use self::timeline::TimelineResources;
use crate::config::PageServerConf;
use crate::context::{DownloadBehavior, RequestContext};
use crate::deletion_queue::DeletionQueueClient;
use crate::deletion_queue::DeletionQueueError;
use crate::import_datadir;
use crate::is_uninit_mark;
use crate::metrics::TENANT_ACTIVATION;
@@ -132,6 +133,7 @@ pub mod storage_layer;
pub mod config;
pub mod delete;
pub mod mgr;
pub mod secondary;
pub mod tasks;
pub mod upload_queue;
@@ -139,7 +141,9 @@ pub(crate) mod timeline;
pub mod size;
pub(crate) use timeline::span::debug_assert_current_span_has_tenant_and_timeline_id;
pub(crate) use timeline::span::{
debug_assert_current_span_has_tenant_and_timeline_id, debug_assert_current_span_has_tenant_id,
};
pub(crate) use timeline::{LogicalSizeCalculationCause, PageReconstructError, Timeline};
// re-export for use in remote_timeline_client.rs
@@ -2208,6 +2212,13 @@ impl Tenant {
.or(self.conf.default_tenant_conf.min_resident_size_override)
}
pub fn get_enable_heatmap(&self) -> bool {
let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf;
tenant_conf
.enable_heatmap
.unwrap_or(self.conf.default_tenant_conf.enable_heatmap)
}
pub fn set_new_tenant_config(&self, new_tenant_conf: TenantConfOpt) {
self.tenant_conf.write().unwrap().tenant_conf = new_tenant_conf;
// Don't hold self.timelines.lock() during the notifies.
@@ -3180,6 +3191,30 @@ impl Tenant {
pub fn cached_synthetic_size(&self) -> u64 {
self.cached_synthetic_tenant_size.load(Ordering::Relaxed)
}
/// Flush any in-progress layers, schedule uploads, and wait for uploads to complete.
///
/// This function can take a long time: callers should wrap it in a timeout if calling
/// from an external API handler.
pub async fn flush_remote(&self) -> anyhow::Result<()> {
let timelines = self.timelines.lock().unwrap().clone();
for (timeline_id, timeline) in timelines {
tracing::info!(%timeline_id, "Flushing...");
timeline.freeze_and_flush().await?;
tracing::info!(%timeline_id, "Waiting for uploads...");
if let Some(client) = &timeline.remote_client {
client.wait_completion().await?;
}
}
match self.deletion_queue_client.flush_execute().await {
Ok(_) => {}
Err(DeletionQueueError::ShuttingDown) => {}
}
Ok(())
}
}
fn remove_timeline_and_uninit_mark(
@@ -3472,6 +3507,7 @@ pub(crate) mod harness {
tenant_conf.evictions_low_residence_duration_metric_threshold,
),
gc_feedback: Some(tenant_conf.gc_feedback),
enable_heatmap: Some(tenant_conf.enable_heatmap),
}
}
}

View File

@@ -305,6 +305,11 @@ pub struct TenantConf {
#[serde(with = "humantime_serde")]
pub evictions_low_residence_duration_metric_threshold: Duration,
pub gc_feedback: bool,
/// Whether to upload a heatmap to remote storage, for use by secondary mode.
/// This may be left disabled if a Tenant will only ever be attached to
/// one node.
pub enable_heatmap: bool,
}
/// Same as TenantConf, but this struct preserves the information about
@@ -385,6 +390,10 @@ pub struct TenantConfOpt {
#[serde(skip_serializing_if = "Option::is_none")]
#[serde(default)]
pub gc_feedback: Option<bool>,
#[serde(skip_serializing_if = "Option::is_none")]
#[serde(default)]
pub enable_heatmap: Option<bool>,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
@@ -453,6 +462,7 @@ impl TenantConfOpt {
.evictions_low_residence_duration_metric_threshold
.unwrap_or(global_conf.evictions_low_residence_duration_metric_threshold),
gc_feedback: self.gc_feedback.unwrap_or(global_conf.gc_feedback),
enable_heatmap: self.enable_heatmap.unwrap_or(global_conf.enable_heatmap),
}
}
}
@@ -490,6 +500,7 @@ impl Default for TenantConf {
)
.expect("cannot parse default evictions_low_residence_duration_metric_threshold"),
gc_feedback: false,
enable_heatmap: false,
}
}
}
@@ -586,6 +597,8 @@ impl TryFrom<&'_ models::TenantConfig> for TenantConfOpt {
}
tenant_conf.gc_feedback = request_data.gc_feedback;
tenant_conf.enable_heatmap = request_data.enable_heatmap;
Ok(tenant_conf)
}
}

View File

@@ -28,7 +28,9 @@ use crate::control_plane_client::{
use crate::deletion_queue::DeletionQueueClient;
use crate::metrics::TENANT_MANAGER as METRICS;
use crate::task_mgr::{self, TaskKind};
use crate::tenant::config::{AttachmentMode, LocationConf, LocationMode, TenantConfOpt};
use crate::tenant::config::{
AttachedLocationConfig, AttachmentMode, LocationConf, LocationMode, TenantConfOpt,
};
use crate::tenant::delete::DeleteTenantFlow;
use crate::tenant::{create_tenant_files, AttachedTenantConf, SpawnMode, Tenant, TenantState};
use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, TEMP_FILE_SUFFIX};
@@ -39,6 +41,8 @@ use utils::generation::Generation;
use utils::id::{TenantId, TimelineId};
use super::delete::DeleteTenantError;
use super::secondary::SecondaryTenant;
use super::storage_layer::Layer;
use super::timeline::delete::DeleteTimelineFlow;
use super::TenantSharedResources;
@@ -53,7 +57,7 @@ use super::TenantSharedResources;
/// having a properly acquired generation (Secondary doesn't need a generation)
pub(crate) enum TenantSlot {
Attached(Arc<Tenant>),
Secondary,
Secondary(Arc<SecondaryTenant>),
/// In this state, other administrative operations acting on the TenantId should
/// block, or return a retry indicator equivalent to HTTP 503.
InProgress(utils::completion::Barrier),
@@ -63,7 +67,7 @@ impl std::fmt::Debug for TenantSlot {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
Self::Attached(tenant) => write!(f, "Attached({})", tenant.current_state()),
Self::Secondary => write!(f, "Secondary"),
Self::Secondary(_) => write!(f, "Secondary"),
Self::InProgress(_) => write!(f, "InProgress"),
}
}
@@ -74,7 +78,7 @@ impl TenantSlot {
fn get_attached(&self) -> Option<&Arc<Tenant>> {
match self {
Self::Attached(t) => Some(t),
Self::Secondary => None,
Self::Secondary(_) => None,
Self::InProgress(_) => None,
}
}
@@ -427,7 +431,10 @@ pub async fn init_tenant_mgr(
// tenants, because they do no remote writes and hence require no
// generation number
info!(%tenant_id, "Loaded tenant in secondary mode");
tenants.insert(tenant_id, TenantSlot::Secondary);
tenants.insert(
tenant_id,
TenantSlot::Secondary(SecondaryTenant::new(tenant_id)),
);
}
LocationMode::Attached(_) => {
// TODO: augment re-attach API to enable the control plane to
@@ -586,8 +593,14 @@ async fn shutdown_all_tenants0(tenants: &std::sync::RwLock<TenantsMap>) {
tenants_to_shut_down.push(t.clone());
shutdown_state.insert(k, TenantSlot::Attached(t));
}
TenantSlot::Secondary => {
shutdown_state.insert(k, TenantSlot::Secondary);
TenantSlot::Secondary(state) => {
// We don't need to wait for this individually per-tenant: the
// downloader task will be waited on eventually; this cancel
// is just to encourage it to drop out if it is doing work
// for this tenant right now.
state.cancel.cancel();
shutdown_state.insert(k, TenantSlot::Secondary(state));
}
TenantSlot::InProgress(notify) => {
// InProgress tenants are not visible in TenantsMap::ShuttingDown: we will
@@ -764,11 +777,18 @@ pub(crate) async fn set_new_tenant_config(
}
impl TenantManager {
/// As a convenience to avoid carrying a configuration reference separately, anyone who
/// has a reference to the TenantManager can access configuration this way.
pub(crate) fn get_conf(&self) -> &'static PageServerConf {
self.conf
}
#[instrument(skip_all, fields(%tenant_id))]
pub(crate) async fn upsert_location(
&self,
tenant_id: TenantId,
new_location_config: LocationConf,
flush: Option<Duration>,
ctx: &RequestContext,
) -> Result<(), anyhow::Error> {
info!("configuring tenant location {tenant_id} to state {new_location_config:?}");
@@ -776,7 +796,7 @@ impl TenantManager {
// Special case fast-path for updates to Tenant: if our upsert is only updating configuration,
// then we do not need to set the slot to InProgress, we can just call into the
// existing tenant.
{
let modify_tenant = {
let locked = self.tenants.read().unwrap();
let peek_slot = tenant_map_peek_slot(&locked, &tenant_id, TenantSlotPeekMode::Write)?;
match (&new_location_config.mode, peek_slot) {
@@ -786,22 +806,58 @@ impl TenantManager {
// take our fast path and just provide the updated configuration
// to the tenant.
tenant.set_new_location_config(AttachedTenantConf::try_from(
new_location_config,
new_location_config.clone(),
)?);
// Persist the new config in the background, to avoid holding up any
// locks while we do so.
// TODO
return Ok(());
Some(tenant.clone())
} else {
// Different generations, fall through to general case
None
}
}
_ => {
// Not an Attached->Attached transition, fall through to general case
None
}
}
};
// Fast-path continued: having dropped out of the self.tenants lock, do the async
// phase of waiting for flush, before returning.
if let Some(tenant) = modify_tenant {
// Transition to AttachedStale means we may well hold a valid generation
// still, and have been requested to go stale as part of a migration. If
// the caller set `flush`, then flush to remote storage.
if let LocationMode::Attached(AttachedLocationConfig {
generation: _,
attach_mode: AttachmentMode::Stale,
}) = &new_location_config.mode
{
if let Some(flush_timeout) = flush {
info!(timeout_ms = flush_timeout.as_millis(), "Flushing");
match tokio::time::timeout(
flush_timeout,
tenant.flush_remote().instrument(info_span!("flush_remote")),
)
.await
{
Ok(r) => {
if let Err(e) = r {
tracing::error!("Failed to flush to remote storage: {e}");
return Err(e);
}
}
Err(_) => {
tracing::warn!(
timeout_ms = flush_timeout.as_millis(),
"Timed out waiting for flush to remote storage, proceeding anyway."
)
}
}
}
}
return Ok(());
}
// General case for upserts to TenantsMap, excluding the case above: we will substitute an
@@ -811,55 +867,67 @@ impl TenantManager {
// not do significant I/O, and shutdowns should be prompt via cancellation tokens.
let mut slot_guard = tenant_map_acquire_slot(&tenant_id, TenantSlotAcquireMode::Any)?;
if let Some(TenantSlot::Attached(tenant)) = slot_guard.get_old_value() {
// The case where we keep a Tenant alive was covered above in the special case
// for Attached->Attached transitions in the same generation. By this point,
// if we see an attached tenant we know it will be discarded and should be
// shut down.
let (_guard, progress) = utils::completion::channel();
match slot_guard.get_old_value() {
Some(TenantSlot::Attached(tenant)) => {
// The case where we keep a Tenant alive was covered above in the special case
// for Attached->Attached transitions in the same generation. By this point,
// if we see an attached tenant we know it will be discarded and should be
// shut down.
let (_guard, progress) = utils::completion::channel();
match tenant.get_attach_mode() {
AttachmentMode::Single | AttachmentMode::Multi => {
// Before we leave our state as the presumed holder of the latest generation,
// flush any outstanding deletions to reduce the risk of leaking objects.
self.resources.deletion_queue_client.flush_advisory()
}
AttachmentMode::Stale => {
// If we're stale there's no point trying to flush deletions
}
};
match tenant.get_attach_mode() {
AttachmentMode::Single | AttachmentMode::Multi => {
// Before we leave our state as the presumed holder of the latest generation,
// flush any outstanding deletions to reduce the risk of leaking objects.
self.resources.deletion_queue_client.flush_advisory()
}
AttachmentMode::Stale => {
// If we're stale there's no point trying to flush deletions
}
};
info!("Shutting down attached tenant");
match tenant.shutdown(progress, false).await {
Ok(()) => {}
Err(barrier) => {
info!("Shutdown already in progress, waiting for it to complete");
barrier.wait().await;
info!("Shutting down attached tenant");
match tenant.shutdown(progress, false).await {
Ok(()) => {}
Err(barrier) => {
info!("Shutdown already in progress, waiting for it to complete");
barrier.wait().await;
}
}
slot_guard.drop_old_value().expect("We just shut it down");
}
Some(TenantSlot::Secondary(state)) => {
info!("Shutting down secondary tenant");
state.shutdown().await;
}
Some(TenantSlot::InProgress(_)) => {
// This should never happen: acquire_slot should error out
// if the contents of a slot were InProgress.
anyhow::bail!("Acquired an InProgress slot, this is a bug.")
}
None => {
// Slot was vacant, nothing needs shutting down.
}
slot_guard.drop_old_value().expect("We just shut it down");
}
let tenant_path = self.conf.tenant_path(&tenant_id);
let timelines_path = self.conf.timelines_path(&tenant_id);
let new_slot = match &new_location_config.mode {
LocationMode::Secondary(_) => {
let tenant_path = self.conf.tenant_path(&tenant_id);
// Directory doesn't need to be fsync'd because if we crash it can
// safely be recreated next time this tenant location is configured.
unsafe_create_dir_all(&tenant_path)
unsafe_create_dir_all(&timelines_path)
.await
.with_context(|| format!("Creating {tenant_path}"))?;
.with_context(|| format!("Creating {timelines_path}"))?;
Tenant::persist_tenant_config(self.conf, &tenant_id, &new_location_config)
.await
.map_err(SetNewTenantConfigError::Persist)?;
TenantSlot::Secondary
TenantSlot::Secondary(SecondaryTenant::new(tenant_id))
}
LocationMode::Attached(_attach_config) => {
let timelines_path = self.conf.timelines_path(&tenant_id);
// Directory doesn't need to be fsync'd because we do not depend on
// it to exist after crashes: it may be recreated when tenant is
// re-attached, see https://github.com/neondatabase/neon/issues/5550
@@ -891,6 +959,97 @@ impl TenantManager {
Ok(())
}
// Do some synchronous work for all tenant slots in Secondary state. The provided
// callback should be small and fast, as it will be called inside the global
// TenantsMap lock.
pub(crate) fn foreach_secondary_tenants<F>(&self, mut func: F)
where
// TODO: let the callback return a hint to drop out of the loop early
F: FnMut(&TenantId, &Arc<SecondaryTenant>),
{
let locked = self.tenants.read().unwrap();
let map = match &*locked {
TenantsMap::Initializing | TenantsMap::ShuttingDown(_) => return,
TenantsMap::Open(m) => m,
};
for (tenant_id, slot) in map {
if let TenantSlot::Secondary(state) = slot {
// Only expose secondary tenants that are not currently shutting down
if !state.cancel.is_cancelled() {
func(tenant_id, state)
}
}
}
}
pub(crate) fn get_attached_tenants(&self) -> Vec<Arc<Tenant>> {
let locked = self.tenants.read().unwrap();
let map = match &*locked {
TenantsMap::Initializing | TenantsMap::ShuttingDown(_) => return Vec::new(),
TenantsMap::Open(m) => m,
};
map.iter()
.filter_map(|(_k, v)| match v {
TenantSlot::Attached(t) => Some(t.clone()),
_ => None,
})
.collect()
}
/// Having planned some evictions for a tenant, attempt to execute them.
///
/// Execution will not occur if the TenantSlot for this tenant is not in
/// a state suitable to execute.
// TODO: is Layer really needed here? Maybe we should have reduced to a LayerFileName by this point.
pub(crate) async fn evict_tenant_layers(
&self,
tenant_id: &TenantId,
timeline_layers: Vec<(TimelineId, Layer)>,
) {
// TODO: unify with how we evict for attached tenants. They should also
// pass through here, to avoid attached tenant evictions racing with
// the lifetime of secondary locations for the same tenant ID.
let state = {
let locked = self.tenants.read().unwrap();
let map = match &*locked {
TenantsMap::Initializing | TenantsMap::ShuttingDown(_) => return,
TenantsMap::Open(m) => m,
};
match map.get(tenant_id) {
Some(TenantSlot::Secondary(secondary_state)) => {
// Found a secondary as expected
secondary_state.clone()
}
_ => {
// A location configuration change raced with this eviction
tracing::info!(
"Dropping {} layer evictions, tenant not in suitable state",
timeline_layers.len()
);
return;
}
}
};
// Concurrency: downloads might have been going on while we deleted layers. However,
// we are only deleting layers that the SecondaryTenant already thought were on disk,
// so we won't be deleting anything that it is _currently_ downloading. All deletions
// of SecondaryTenant layers flow through this function, so there is no risk that the
// layer we're evicting is no longer present in-memory.
state
.evict_layers(self.conf, tenant_id, timeline_layers)
.instrument(tracing::info_span!("evict_layers",
%tenant_id
))
.await;
}
}
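Not part of the change itself: a minimal sketch of how a disk-usage eviction pass might drive these APIs together, assuming it sits in this module so the existing imports (including tracing::Instrument) apply; the "evict everything resident" plan is purely illustrative.
async fn evict_from_secondaries(conf: &'static PageServerConf, tenant_manager: &TenantManager) {
    // Collect candidates cheaply while foreach_secondary_tenants holds the TenantsMap lock.
    let mut secondaries: Vec<(TenantId, Arc<SecondaryTenant>)> = Vec::new();
    tenant_manager.foreach_secondary_tenants(|tenant_id, state| {
        secondaries.push((*tenant_id, state.clone()));
    });
    for (tenant_id, state) in secondaries {
        // Ask the secondary location what it currently has on disk. The span carries
        // tenant_id so the debug span assertions inside are satisfied.
        let timelines = state
            .get_layers_for_eviction(conf, tenant_id)
            .instrument(tracing::info_span!("secondary_eviction", %tenant_id))
            .await;
        // Illustrative plan: evict every resident layer. A real pass would pick
        // victims by last_activity_ts and the amount of space it needs to free.
        let planned: Vec<(TimelineId, Layer)> = timelines
            .into_iter()
            .flat_map(|(timeline_id, info)| {
                info.resident_layers.into_iter().map(move |l| (timeline_id, l.layer))
            })
            .collect();
        // Execute through the TenantManager so the eviction cannot race with
        // location configuration changes for the same tenant ID.
        tenant_manager.evict_tenant_layers(&tenant_id, planned).await;
    }
}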
#[derive(Debug, thiserror::Error)]
@@ -937,7 +1096,7 @@ pub(crate) fn get_tenant(
}
},
Some(TenantSlot::InProgress(_)) => Err(GetTenantError::NotActive(tenant_id)),
None | Some(TenantSlot::Secondary) => Err(GetTenantError::NotFound(tenant_id)),
None | Some(TenantSlot::Secondary(_)) => Err(GetTenantError::NotFound(tenant_id)),
}
}
@@ -996,7 +1155,7 @@ pub(crate) async fn get_active_tenant_with_timeout(
_ => WaitFor::Tenant(tenant.clone()),
}
}
Some(TenantSlot::Secondary) => {
Some(TenantSlot::Secondary(_)) => {
return Err(GetActiveTenantError::NotFound(GetTenantError::NotActive(
tenant_id,
)))
@@ -1303,7 +1462,7 @@ pub(crate) async fn list_tenants() -> Result<Vec<(TenantId, TenantState)>, Tenan
Ok(m.iter()
.filter_map(|(id, tenant)| match tenant {
TenantSlot::Attached(tenant) => Some((*id, tenant.current_state())),
TenantSlot::Secondary => None,
TenantSlot::Secondary(_) => None,
TenantSlot::InProgress(_) => None,
})
.collect())
@@ -1553,11 +1712,7 @@ impl SlotGuard {
fn old_value_is_shutdown(&self) -> bool {
match self.old_value.as_ref() {
Some(TenantSlot::Attached(tenant)) => tenant.gate.close_complete(),
Some(TenantSlot::Secondary) => {
// TODO: when adding secondary mode tenants, this will check for shutdown
// in the same way that we do for `Tenant` above
true
}
Some(TenantSlot::Secondary(secondary_tenant)) => secondary_tenant.gate.close_complete(),
Some(TenantSlot::InProgress(_)) => {
// A SlotGuard cannot be constructed for a slot that was already InProgress
unreachable!()
@@ -1761,26 +1916,19 @@ where
let mut slot_guard =
tenant_map_acquire_slot_impl(&tenant_id, tenants, TenantSlotAcquireMode::MustExist)?;
// The SlotGuard allows us to manipulate the Tenant object without fear of some
// concurrent API request doing something else for the same tenant ID.
let attached_tenant = match slot_guard.get_old_value() {
Some(TenantSlot::Attached(t)) => Some(t),
_ => None,
};
// allow pageserver shutdown to await for our completion
let (_guard, progress) = completion::channel();
// If the tenant was attached, shut it down gracefully. For secondary
// locations this part is not necessary
match &attached_tenant {
Some(attached_tenant) => {
// The SlotGuard allows us to manipulate the Tenant object without fear of some
// concurrent API request doing something else for the same tenant ID.
let attached_tenant = match slot_guard.get_old_value() {
Some(TenantSlot::Attached(tenant)) => {
// whenever we remove a tenant from memory, we don't want to flush and wait for upload
let freeze_and_flush = false;
// shutdown is sure to transition tenant to stopping, and wait for all tasks to complete, so
// that we can continue safely to cleanup.
match attached_tenant.shutdown(progress, freeze_and_flush).await {
match tenant.shutdown(progress, freeze_and_flush).await {
Ok(()) => {}
Err(_other) => {
// if pageserver shutdown or other detach/ignore is already ongoing, we don't want to
@@ -1789,11 +1937,19 @@ where
return Err(TenantStateError::IsStopping(tenant_id));
}
}
Some(tenant)
}
None => {
// Nothing to wait on when not attached, proceed.
Some(TenantSlot::Secondary(secondary_state)) => {
tracing::info!("Shutting down in secondary mode");
secondary_state.shutdown().await;
None
}
}
Some(TenantSlot::InProgress(_)) => {
// Acquiring a slot guarantees its old value was not InProgress
unreachable!();
}
None => None,
};
match tenant_cleanup
.await

View File

@@ -180,7 +180,7 @@
//! [`Tenant::timeline_init_and_sync`]: super::Tenant::timeline_init_and_sync
//! [`Timeline::load_layer_map`]: super::Timeline::load_layer_map
mod download;
pub(crate) mod download;
pub mod index;
mod upload;
@@ -1501,6 +1501,23 @@ impl RemoteTimelineClient {
}
}
}
pub(crate) fn get_layers_metadata(
&self,
layers: Vec<LayerFileName>,
) -> anyhow::Result<Vec<Option<LayerFileMetadata>>> {
let q = self.upload_queue.lock().unwrap();
let q = match &*q {
UploadQueue::Stopped(_) | UploadQueue::Uninitialized => {
anyhow::bail!("queue is in state {}", q.as_str())
}
UploadQueue::Initialized(inner) => inner,
};
let decorated = layers.into_iter().map(|l| q.latest_files.get(&l).cloned());
Ok(decorated.collect())
}
}
pub fn remote_timelines_path(tenant_id: &TenantId) -> RemotePath {
@@ -1541,6 +1558,13 @@ pub fn remote_index_path(
.expect("Failed to construct path")
}
pub const HEATMAP_BASENAME: &str = "heatmap-v1";
pub fn remote_heatmap_path(tenant_id: &TenantId) -> RemotePath {
RemotePath::from_string(&format!("tenants/{tenant_id}/{HEATMAP_BASENAME}"))
.expect("Failed to construct path")
}
/// Given the key of an index, parse out the generation part of the name
pub fn parse_remote_index_path(path: RemotePath) -> Option<Generation> {
let file_name = match path.get_path().file_name() {

View File

@@ -0,0 +1,283 @@
pub mod downloader;
pub mod heatmap;
pub mod heatmap_writer;
use std::{sync::Arc, time::SystemTime};
use crate::{
config::PageServerConf,
task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
};
use self::{
downloader::{downloader_task, SecondaryDetail},
heatmap_writer::heatmap_writer_task,
};
use super::{
mgr::TenantManager,
storage_layer::{AsLayerDesc, Layer},
timeline::DiskUsageEvictionInfo,
};
use remote_storage::GenericRemoteStorage;
use tokio_util::sync::CancellationToken;
use utils::{
completion::Barrier,
fs_ext,
id::{TenantId, TimelineId},
sync::gate::Gate,
};
enum DownloadCommand {
Download(TenantId),
}
enum UploadCommand {
Upload(TenantId),
}
struct CommandRequest<T> {
payload: T,
response_tx: tokio::sync::oneshot::Sender<CommandResponse>,
}
struct CommandResponse {
result: anyhow::Result<()>,
}
// Whereas [`Tenant`] represents an attached tenant, this type represents the work
// we do for secondary tenant locations: where we are not serving clients or
// ingesting WAL, but we are maintaining a warm cache of layer files.
//
// This type is all about the _download_ path for secondary mode. The upload path
// runs while a regular attached `Tenant` exists.
//
// This structure coordinates TenantManager and SecondaryDownloader,
// so that the downloader can indicate which tenants it is currently
// operating on, and the manager can indicate when a particular
// secondary tenant should cancel any work in flight.
#[derive(Debug)]
pub(crate) struct SecondaryTenant {
/// Cancellation token indicates to SecondaryDownloader that it should stop doing
/// any work for this tenant at the next opportunity.
pub(crate) cancel: CancellationToken,
pub(crate) gate: Gate,
detail: std::sync::Mutex<SecondaryDetail>,
// TODO: propagate the `warm` from LocationConf into here, and respect it when doing downloads
}
impl SecondaryTenant {
pub(crate) fn new(tenant_id: TenantId) -> Arc<Self> {
// TODO: consider whether we really need to Arc this
Arc::new(Self {
// TODO: shall we make this a descendant of the
// main cancellation token, or is it sufficient that
// on shutdown we walk the tenants and fire their
// individual cancellations?
cancel: CancellationToken::new(),
gate: Gate::new(format!("SecondaryTenant {tenant_id}")),
detail: std::sync::Mutex::default(),
})
}
pub(crate) async fn shutdown(&self) {
self.cancel.cancel();
// Wait for any secondary downloader work to complete
self.gate.close().await;
}
pub(crate) async fn get_layers_for_eviction(
&self,
conf: &'static PageServerConf,
tenant_id: TenantId,
) -> Vec<(TimelineId, DiskUsageEvictionInfo)> {
super::debug_assert_current_span_has_tenant_id();
{
let detail = self.detail.lock().unwrap();
if !detail.is_uninit() {
return detail.get_layers_for_eviction();
} else {
// In case we didn't freshen yet in this process lifetime, we will need to scan local storage
// to find all our layers.
}
}
tracing::debug!("Scanning local layers for secondary tenant to service eviction",);
// Fall through: we need to initialize Detail
let timelines = SecondaryDetail::init_detail(conf, tenant_id).await;
let mut detail = self.detail.lock().unwrap();
if detail.is_uninit() {
detail.timelines = timelines;
}
detail.get_layers_for_eviction()
}
pub(crate) async fn evict_layers(
&self,
conf: &PageServerConf,
tenant_id: &TenantId,
layers: Vec<(TimelineId, Layer)>,
) {
crate::tenant::debug_assert_current_span_has_tenant_id();
let _guard = match self.gate.enter() {
Ok(g) => g,
Err(_) => {
tracing::info!(
"Dropping {} layer evictions, secondary tenant shutting down",
layers.len()
);
return;
}
};
let now = SystemTime::now();
for (timeline_id, layer) in layers {
let layer_name = layer.layer_desc().filename();
let path = conf
.timeline_path(tenant_id, &timeline_id)
.join(&layer_name.file_name());
// We tolerate ENOENT, because between planning eviction and executing
// it, the secondary downloader could have seen an updated heatmap that
// resulted in a layer being deleted.
tokio::fs::remove_file(path)
.await
.or_else(fs_ext::ignore_not_found)
.expect("TODO: terminate process on local I/O errors");
// TODO: batch up updates instead of acquiring lock in inner loop
let mut detail = self.detail.lock().unwrap();
// If there is no timeline detail for what we just deleted, that indicates that
// the secondary downloader did some work in the meantime (perhaps removing all
// of this timeline's layers)
if let Some(timeline_detail) = detail.timelines.get_mut(&timeline_id) {
timeline_detail.on_disk_layers.remove(&layer_name);
timeline_detail.evicted_at.insert(layer_name, now);
}
}
}
}
/// The SecondaryController is a pseudo-rpc client for administrative control of secondary mode downloads,
/// and heatmap uploads. This is not a hot data path: it's primarily a hook for tests,
/// where we want to immediately upload/download for a particular tenant. In normal operation
/// uploads & downloads are autonomous and not driven by this interface.
pub struct SecondaryController {
upload_req_tx: tokio::sync::mpsc::Sender<CommandRequest<UploadCommand>>,
download_req_tx: tokio::sync::mpsc::Sender<CommandRequest<DownloadCommand>>,
}
impl SecondaryController {
async fn dispatch<T>(
&self,
queue: &tokio::sync::mpsc::Sender<CommandRequest<T>>,
payload: T,
) -> anyhow::Result<()> {
let (response_tx, response_rx) = tokio::sync::oneshot::channel();
queue
.send(CommandRequest {
payload,
response_tx,
})
.await
.map_err(|_| anyhow::anyhow!("Receiver shut down"))?;
let response = response_rx
.await
.map_err(|_| anyhow::anyhow!("Request dropped"))?;
response.result
}
pub async fn download_tenant(&self, tenant_id: TenantId) -> anyhow::Result<()> {
self.dispatch(&self.download_req_tx, DownloadCommand::Download(tenant_id))
.await
}
pub async fn upload_tenant(&self, tenant_id: TenantId) -> anyhow::Result<()> {
self.dispatch(&self.upload_req_tx, UploadCommand::Upload(tenant_id))
.await
}
}
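As a usage illustration, a hedged sketch of an on-demand round trip, mirroring the /v1/secondary/{tenant_id}/upload and /download testing routes added elsewhere in this change; the two controller handles are assumptions standing in for the controllers of the attached and the secondary pageserver respectively.
// Sketch only: force one heatmap upload followed by a secondary download.
async fn force_secondary_refresh(
    attached_controller: &SecondaryController,  // controller on the attached pageserver
    secondary_controller: &SecondaryController, // controller on the secondary pageserver
    tenant_id: TenantId,
) -> anyhow::Result<()> {
    // Publish a fresh heatmap from the attached location...
    attached_controller.upload_tenant(tenant_id).await?;
    // ...then pull layers on the secondary location against that heatmap.
    secondary_controller.download_tenant(tenant_id).await?;
    Ok(())
}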
pub fn spawn_tasks(
conf: &'static PageServerConf,
tenant_manager: Arc<TenantManager>,
remote_storage: GenericRemoteStorage,
background_jobs_can_start: Barrier,
cancel: CancellationToken,
) -> SecondaryController {
let mgr_clone = tenant_manager.clone();
let storage_clone = remote_storage.clone();
let cancel_clone = cancel.clone();
let bg_jobs_clone = background_jobs_can_start.clone();
let (download_req_tx, download_req_rx) =
tokio::sync::mpsc::channel::<CommandRequest<DownloadCommand>>(16);
let (upload_req_tx, upload_req_rx) =
tokio::sync::mpsc::channel::<CommandRequest<UploadCommand>>(16);
task_mgr::spawn(
BACKGROUND_RUNTIME.handle(),
TaskKind::SecondaryDownloads,
None,
None,
"secondary tenant downloads",
false,
async move {
downloader_task(
conf,
mgr_clone,
storage_clone,
download_req_rx,
bg_jobs_clone,
cancel_clone,
)
.await
},
);
task_mgr::spawn(
BACKGROUND_RUNTIME.handle(),
TaskKind::SecondaryDownloads,
None,
None,
"heatmap uploads",
false,
async move {
heatmap_writer_task(
tenant_manager,
remote_storage,
upload_req_rx,
background_jobs_can_start,
cancel,
)
.await
},
);
SecondaryController {
download_req_tx,
upload_req_tx,
}
}
/// For running with remote storage disabled: a SecondaryController that is connected to nothing.
pub fn null_controller() -> SecondaryController {
let (download_req_tx, _download_req_rx) =
tokio::sync::mpsc::channel::<CommandRequest<DownloadCommand>>(16);
let (upload_req_tx, _upload_req_rx) =
tokio::sync::mpsc::channel::<CommandRequest<UploadCommand>>(16);
SecondaryController {
upload_req_tx,
download_req_tx,
}
}
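For context, a sketch of how startup code might choose between the two constructors; the function shape and the Option-typed remote storage are assumptions, not the literal pageserver binary.
// Sketch: spawn real secondary tasks only when remote storage is configured.
fn build_secondary_controller(
    conf: &'static PageServerConf,
    tenant_manager: Arc<TenantManager>,
    remote_storage: Option<GenericRemoteStorage>,
    background_jobs_can_start: Barrier,
    cancel: CancellationToken,
) -> SecondaryController {
    match remote_storage {
        Some(storage) => spawn_tasks(conf, tenant_manager, storage, background_jobs_can_start, cancel),
        None => null_controller(),
    }
}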

View File

@@ -0,0 +1,671 @@
use std::{
collections::{HashMap, HashSet},
str::FromStr,
sync::Arc,
time::{Duration, Instant, SystemTime},
};
use crate::{
config::PageServerConf,
metrics::SECONDARY_MODE,
tenant::{
remote_timeline_client::{index::LayerFileMetadata, HEATMAP_BASENAME},
secondary::CommandResponse,
storage_layer::{Layer, LayerFileName},
timeline::{DiskUsageEvictionInfo, LocalLayerInfoForDiskUsageEviction},
},
METADATA_FILE_NAME,
};
use super::SecondaryTenant;
use crate::tenant::{
mgr::TenantManager,
remote_timeline_client::{download::download_layer_file, remote_heatmap_path},
};
use anyhow::Context;
use chrono::format::{DelayedFormat, StrftimeItems};
use remote_storage::GenericRemoteStorage;
use tokio_util::sync::CancellationToken;
use tracing::Instrument;
use utils::{
completion::Barrier,
fs_ext,
id::{TenantId, TimelineId},
sync::gate::GateGuard,
};
use super::{
heatmap::{HeatMapTenant, HeatMapTimeline},
CommandRequest, DownloadCommand,
};
/// Interval between checking if any Secondary tenants have download work to do:
/// note that this is _not_ the frequency with which we actually freshen the tenants,
/// just the frequency with which we wake up to decide whether anyone needs freshening.
///
/// Making this somewhat infrequent reduces the load on mutexes inside TenantManager
/// and SecondaryTenant for reads when checking for work to do.
const DOWNLOAD_CHECK_INTERVAL: Duration = Duration::from_millis(10000);
/// For each tenant, how long must have passed since the last download_tenant call before
/// calling it again. This is approximately the time by which local data is allowed
/// to fall behind remote data.
///
/// TODO: this should be an upper bound, and tenants that are uploading regularly
/// should adaptively freshen more often (e.g. a tenant writing 1 layer per second
/// should not wait a minute between freshens)
const DOWNLOAD_FRESHEN_INTERVAL: Duration = Duration::from_millis(60000);
#[derive(Debug, Clone)]
pub(super) struct OnDiskState {
layer: Layer,
access_time: SystemTime,
}
impl OnDiskState {
fn new(
conf: &'static PageServerConf,
tenant_id: &TenantId,
timeline_id: &TimelineId,
name: LayerFileName,
metadata: LayerFileMetadata,
access_time: SystemTime,
) -> Self {
Self {
layer: Layer::for_secondary(conf, tenant_id, timeline_id, name, metadata),
access_time,
}
}
}
#[derive(Debug, Clone, Default)]
pub(super) struct SecondaryDetailTimeline {
pub(super) on_disk_layers: HashMap<LayerFileName, OnDiskState>,
/// We remember when layers were evicted, to prevent re-downloading them.
/// TODO: persist this, so that we don't try and re-download everything on restart.
pub(super) evicted_at: HashMap<LayerFileName, SystemTime>,
}
/// This state is written by the secondary downloader; it is opaque
/// to TenantManager
#[derive(Default, Debug)]
pub(super) struct SecondaryDetail {
freshened_at: Option<Instant>,
pub(super) timelines: HashMap<TimelineId, SecondaryDetailTimeline>,
}
/// Helper for logging SystemTime
fn strftime(t: &'_ SystemTime) -> DelayedFormat<StrftimeItems<'_>> {
let datetime: chrono::DateTime<chrono::Utc> = (*t).into();
datetime.format("%d/%m/%Y %T")
}
impl SecondaryDetail {
pub(super) fn is_uninit(&self) -> bool {
// FIXME: empty timelines is not synonymous with not initialized, as it is legal for
// a tenant to exist with no timelines.
self.timelines.is_empty()
}
pub(super) async fn init_detail(
conf: &'static PageServerConf,
tenant_id: TenantId,
) -> HashMap<TimelineId, SecondaryDetailTimeline> {
tracing::info!("init_detail");
// Load heatmap from local storage
let heatmap_path = conf.tenant_path(&tenant_id).join(HEATMAP_BASENAME);
let heatmap = match tokio::fs::read(heatmap_path).await {
Ok(bytes) => serde_json::from_slice::<HeatMapTenant>(&bytes).unwrap(),
Err(e) => {
if e.kind() == std::io::ErrorKind::NotFound {
return HashMap::new();
} else {
// TODO: use local IO fatal helpers
panic!("Unexpected error");
}
}
};
let mut timelines = HashMap::new();
for heatmap_timeline in heatmap.timelines {
// TODO: use fatal IO helpers
let detail = init_timeline_state(conf, &tenant_id, &heatmap_timeline)
.await
.expect("Failed reading local disk");
timelines.insert(heatmap_timeline.timeline_id, detail);
}
timelines
}
pub(super) fn get_layers_for_eviction(&self) -> Vec<(TimelineId, DiskUsageEvictionInfo)> {
let mut result = Vec::new();
for (timeline_id, timeline_detail) in &self.timelines {
let layers: Vec<_> = timeline_detail
.on_disk_layers
.values()
.map(|ods| LocalLayerInfoForDiskUsageEviction {
layer: ods.layer.clone(),
last_activity_ts: ods.access_time,
})
.collect();
let max_layer_size = layers.iter().map(|l| l.layer.metadata().file_size()).max();
result.push((
*timeline_id,
DiskUsageEvictionInfo {
resident_layers: layers,
max_layer_size,
},
))
}
tracing::debug!(
"Found {} timelines, {} layers",
self.timelines.len(),
result
.iter()
.map(|(_, info)| info.resident_layers.len())
.sum::<usize>()
);
result
}
}
/// Keep trying to do downloads until the cancellation token is fired. Remote storage
/// errors are handled internally: any error returned by this function is an unexpected
/// internal error of some kind.
pub(super) async fn downloader_task(
conf: &'static PageServerConf,
tenant_manager: Arc<TenantManager>,
remote_storage: GenericRemoteStorage,
mut command_queue: tokio::sync::mpsc::Receiver<CommandRequest<DownloadCommand>>,
background_jobs_can_start: Barrier,
cancel: CancellationToken,
) -> anyhow::Result<()> {
let downloader = SecondaryDownloader {
conf,
tenant_manager,
remote_storage,
cancel: cancel.clone(),
};
tracing::info!("Waiting for background_jobs_can start...");
background_jobs_can_start.wait().await;
tracing::info!("background_jobs_can is ready, proceeding.");
while !cancel.is_cancelled() {
downloader.iteration().await?;
tokio::select! {
_ = cancel.cancelled() => {
tracing::info!("Heatmap writer terminating");
break;
},
_ = tokio::time::sleep(DOWNLOAD_CHECK_INTERVAL) => {},
cmd = command_queue.recv() => {
let cmd = match cmd {
Some(c) => c,
None => {
// SecondaryController was destroyed, and this has raced with
// our CancellationToken
tracing::info!("Heatmap writer terminating");
break;
}
};
let CommandRequest{
response_tx,
payload
} = cmd;
let result = downloader.handle_command(payload).await;
if response_tx.send(CommandResponse{result}).is_err() {
// Caller went away, e.g. because an HTTP request timed out
tracing::info!("Dropping response to administrative command")
}
}
}
}
Ok(())
}
struct SecondaryDownloader {
conf: &'static PageServerConf,
tenant_manager: Arc<TenantManager>,
remote_storage: GenericRemoteStorage,
cancel: CancellationToken,
}
struct TenantJob {
tenant_id: TenantId,
secondary_state: Arc<SecondaryTenant>,
// This gate guard conveys the right to write to the tenant's local directory: it must
// be taken before doing downloads, and TenantManager must ensure it has been released
// before it considers shutdown complete for the secondary state -- [`SecondaryDownloader`]
// will thereby never be racing with [`Tenant`] for access to local files.
_guard: GateGuard,
}
impl SecondaryDownloader {
async fn iteration(&self) -> anyhow::Result<()> {
// Step 1: identify some tenants that we may work on
let mut candidates: Vec<TenantJob> = Vec::new();
self.tenant_manager
.foreach_secondary_tenants(|tenant_id, secondary_state| {
let guard = match secondary_state.gate.enter() {
Ok(guard) => guard,
// Tenant is shutting down, do no downloads for it
Err(_) => return,
};
candidates.push(TenantJob {
tenant_id: *tenant_id,
secondary_state: secondary_state.clone(),
_guard: guard,
});
});
// Step 2: prioritized selection of next batch of tenants to freshen
let now = Instant::now();
let candidates = candidates.into_iter().filter(|c| {
let detail = c.secondary_state.detail.lock().unwrap();
match detail.freshened_at {
None => true, // Not yet freshened, therefore eligible to run
Some(t) => {
let since = now.duration_since(t);
since > DOWNLOAD_FRESHEN_INTERVAL
}
}
});
// TODO: don't just cut down the list, prioritize it to freshen the stalest tenants first
// TODO: bounded parallelism
// Step 3: spawn download_tenant tasks
for job in candidates {
if job.secondary_state.cancel.is_cancelled() {
continue;
}
async {
// The enclosing async block below already instruments with an identical
// download_tenant span, so no inner instrument is needed here.
if let Err(e) = self.download_tenant(&job).await {
tracing::info!("Failed to freshen secondary content: {e:#}")
};
// Update freshened_at even if there was an error: we don't want errored tenants to implicitly
// take priority to run again.
let mut detail = job.secondary_state.detail.lock().unwrap();
detail.freshened_at = Some(Instant::now());
}
.instrument(tracing::info_span!(
"download_tenant",
tenant_id = %job.tenant_id
))
.await;
}
Ok(())
}
async fn handle_command(&self, command: DownloadCommand) -> anyhow::Result<()> {
match command {
DownloadCommand::Download(req_tenant_id) => {
let mut candidates: Vec<TenantJob> = Vec::new();
self.tenant_manager
.foreach_secondary_tenants(|tenant_id, secondary_state| {
tracing::info!("foreach_secondary: {tenant_id} ({req_tenant_id})");
if tenant_id == &req_tenant_id {
let guard = match secondary_state.gate.enter() {
Ok(guard) => guard,
// Shutting down
Err(_) => return,
};
candidates.push(TenantJob {
tenant_id: *tenant_id,
secondary_state: secondary_state.clone(),
_guard: guard,
});
}
});
let tenant_job = if candidates.len() != 1 {
anyhow::bail!("Tenant not found in secondary mode");
} else {
candidates.pop().unwrap()
};
self.download_tenant(&tenant_job).await
}
}
}
async fn download_heatmap(&self, tenant_id: &TenantId) -> anyhow::Result<HeatMapTenant> {
// TODO: make download conditional on ETag having changed since last download
let heatmap_path = remote_heatmap_path(tenant_id);
// TODO: wrap this download in a select! that checks self.cancel
let mut download = self.remote_storage.download(&heatmap_path).await?;
let mut heatmap_bytes = Vec::new();
let _size = tokio::io::copy(&mut download.download_stream, &mut heatmap_bytes)
.await
.with_context(|| format!("download heatmap {heatmap_path:?}"))?;
SECONDARY_MODE.download_heatmap.inc();
Ok(serde_json::from_slice::<HeatMapTenant>(&heatmap_bytes)?)
}
async fn download_timeline(
&self,
job: &TenantJob,
timeline: HeatMapTimeline,
) -> anyhow::Result<()> {
let timeline_path = self
.conf
.timeline_path(&job.tenant_id, &timeline.timeline_id);
// Accumulate updates to the state
let mut touched = Vec::new();
// Clone a view of what layers already exist on disk
let timeline_state = job
.secondary_state
.detail
.lock()
.unwrap()
.timelines
.get(&timeline.timeline_id)
.cloned();
let timeline_state = match timeline_state {
Some(t) => t,
None => {
// We have no existing state: need to scan local disk for layers first.
let timeline_state =
init_timeline_state(self.conf, &job.tenant_id, &timeline).await?;
// Re-acquire detail lock now that we're done with async load from local FS
job.secondary_state
.detail
.lock()
.unwrap()
.timelines
.insert(timeline.timeline_id, timeline_state.clone());
timeline_state
}
};
let layers_in_heatmap = timeline
.layers
.iter()
.map(|l| &l.name)
.collect::<HashSet<_>>();
let layers_on_disk = timeline_state
.on_disk_layers
.iter()
.map(|l| l.0)
.collect::<HashSet<_>>();
// Remove on-disk layers that are no longer present in heatmap
for layer in layers_on_disk.difference(&layers_in_heatmap) {
let local_path = timeline_path.join(layer.to_string());
tracing::info!("Removing secondary local layer {layer} because it's absent in heatmap",);
tokio::fs::remove_file(&local_path)
.await
.or_else(fs_ext::ignore_not_found)?;
}
// Download heatmap layers that are not present on local disk, or update their
// access time if they are already present.
for layer in timeline.layers {
if self.cancel.is_cancelled() {
return Ok(());
}
// Existing on-disk layers: just update their access time.
if let Some(on_disk) = timeline_state.on_disk_layers.get(&layer.name) {
tracing::debug!("Layer {} is already on disk", layer.name);
if on_disk.layer.metadata() != LayerFileMetadata::from(&layer.metadata)
|| on_disk.access_time != layer.access_time
{
// We already have this layer on disk. Update its access time.
tracing::debug!(
"Access time updated for layer {}: {} -> {}",
layer.name,
strftime(&on_disk.access_time),
strftime(&layer.access_time)
);
touched.push(layer);
}
continue;
} else {
tracing::debug!("Layer {} not present on disk yet", layer.name);
}
// Eviction: if we evicted a layer, then do not re-download it unless it was accessed more
// recently than it was evicted.
if let Some(evicted_at) = timeline_state.evicted_at.get(&layer.name) {
if &layer.access_time > evicted_at {
tracing::info!(
"Re-downloading evicted layer {}, accessed at {}, evicted at {}",
layer.name,
strftime(&layer.access_time),
strftime(evicted_at)
);
} else {
tracing::trace!(
"Not re-downloading evicted layer {}, accessed at {}, evicted at {}",
layer.name,
strftime(&layer.access_time),
strftime(evicted_at)
);
continue;
}
}
match download_layer_file(
self.conf,
&self.remote_storage,
job.tenant_id,
timeline.timeline_id,
&layer.name,
&LayerFileMetadata::from(&layer.metadata),
)
.await
{
Ok(downloaded_bytes) => {
if downloaded_bytes != layer.metadata.file_size {
let local_path = timeline_path.join(layer.name.to_string());
tracing::error!(
"Downloaded layer {} with unexpected size {} != {}",
layer.name,
downloaded_bytes,
layer.metadata.file_size
);
tokio::fs::remove_file(&local_path)
.await
.or_else(fs_ext::ignore_not_found)?;
}
SECONDARY_MODE.download_layer.inc();
touched.push(layer)
}
Err(e) => {
// No retries here: secondary downloads don't have to succeed: if they fail we just proceed and expect
// that on some future call to freshen the download will work.
// TODO: refine this behavior.
tracing::info!("Failed to download layer {}: {}", layer.name, e);
}
}
}
// Write updates to state to record layers we just downloaded or touched.
{
let mut detail = job.secondary_state.detail.lock().unwrap();
let timeline_detail = detail.timelines.entry(timeline.timeline_id).or_default();
tracing::info!("Wrote timeline_detail for {} touched layers", touched.len());
for t in touched {
use std::collections::hash_map::Entry;
match timeline_detail.on_disk_layers.entry(t.name.clone()) {
Entry::Occupied(mut v) => {
v.get_mut().access_time = t.access_time;
}
Entry::Vacant(e) => {
e.insert(OnDiskState::new(
self.conf,
&job.tenant_id,
&timeline.timeline_id,
t.name,
LayerFileMetadata::from(&t.metadata),
t.access_time,
));
}
}
}
}
Ok(())
}
async fn download_tenant(&self, job: &TenantJob) -> anyhow::Result<()> {
tracing::debug!("Downloading heatmap for secondary tenant {}", job.tenant_id);
// Download the tenant's heatmap
let heatmap = self.download_heatmap(&job.tenant_id).await?;
// Save the heatmap: this will be useful on restart, allowing us to reconstruct
// layer metadata without having to re-download it.
let heatmap_path = self.conf.tenant_path(&job.tenant_id).join(HEATMAP_BASENAME);
// TODO: use crashsafe overwrite
// TODO: use die-on-io-error helpers
tokio::fs::write(
&heatmap_path,
&serde_json::to_vec(&heatmap).expect("We just deserialized this"),
)
.await
.unwrap();
tracing::debug!("Wrote heatmap to {}", heatmap_path);
// Download the layers in the heatmap
for timeline in heatmap.timelines {
if self.cancel.is_cancelled() {
return Ok(());
}
let timeline_id = timeline.timeline_id;
self.download_timeline(job, timeline)
.instrument(tracing::info_span!(
"download_timeline",
tenant_id=%job.tenant_id,
%timeline_id
))
.await?;
}
// TODO: remove local disk content for any timelines that don't exist in remote storage
Ok(())
}
}
/// Scan local storage and build up Layer objects based on the metadata in a HeatMapTimeline
async fn init_timeline_state(
conf: &'static PageServerConf,
tenant_id: &TenantId,
heatmap: &HeatMapTimeline,
) -> anyhow::Result<SecondaryDetailTimeline> {
let timeline_path = conf.timeline_path(tenant_id, &heatmap.timeline_id);
let mut detail = SecondaryDetailTimeline::default();
let mut dir = match tokio::fs::read_dir(&timeline_path).await {
Ok(d) => d,
Err(e) => {
if e.kind() == std::io::ErrorKind::NotFound {
tracing::info!("Creating timeline directory {timeline_path}");
tokio::fs::create_dir_all(&timeline_path).await?;
// No entries to report: drop out.
return Ok(detail);
} else {
return Err(e.into());
}
}
};
let heatmap_metadata: HashMap<_, _> = heatmap.layers.iter().map(|l| (&l.name, l)).collect();
while let Some(dentry) = dir.next_entry().await? {
let dentry_file_name = dentry.file_name();
let file_name = dentry_file_name.to_string_lossy();
let local_meta = dentry.metadata().await?;
// Secondary mode doesn't use local metadata files, but they might have been left behind by an attached tenant.
if file_name == METADATA_FILE_NAME {
continue;
}
match LayerFileName::from_str(&file_name) {
Ok(name) => {
let remote_meta = heatmap_metadata.get(&name);
match remote_meta {
Some(remote_meta) => {
// TODO: checksums for layers (https://github.com/neondatabase/neon/issues/2784)
if local_meta.len() != remote_meta.metadata.file_size {
// This should not happen, because we do crashsafe write-then-rename when downloading
// layers, and layers in remote storage are immutable. Remove the local file because
// we cannot trust it.
tracing::warn!(
"Removing local layer {name} with unexpected local size {} != {}",
local_meta.len(),
remote_meta.metadata.file_size
);
tokio::fs::remove_file(dentry.path())
.await
.or_else(fs_ext::ignore_not_found)?;
} else {
// We expect the access time to be initialized immediately afterwards, when
// the latest heatmap is applied to the state.
detail.on_disk_layers.insert(
name.clone(),
OnDiskState::new(
conf,
tenant_id,
&heatmap.timeline_id,
name,
LayerFileMetadata::from(&remote_meta.metadata),
remote_meta.access_time,
),
);
}
}
None => {
// FIXME: consider some optimization when transitioning from attached to secondary: maybe
// wait until we have seen a heatmap that is more recent than the most recent on-disk state? Otherwise
// we will end up deleting any layers which were created+uploaded more recently than the heatmap.
tracing::info!(
"Removing secondary local layer {} because it's absent in heatmap",
name
);
tokio::fs::remove_file(dentry.path()).await?;
}
}
}
Err(_) => {
// Ignore it.
tracing::warn!("Unexpected file in timeline directory: {file_name}");
}
}
}
Ok(detail)
}

View File

@@ -0,0 +1,57 @@
use std::time::SystemTime;
use crate::tenant::{
remote_timeline_client::index::IndexLayerMetadata, storage_layer::LayerFileName,
};
use serde::{Deserialize, Serialize};
use serde_with::{serde_as, DisplayFromStr};
use utils::id::TimelineId;
#[derive(Serialize, Deserialize)]
pub(super) struct HeatMapTenant {
pub(super) timelines: Vec<HeatMapTimeline>,
}
#[derive(Serialize, Deserialize)]
pub(crate) struct HeatMapLayer {
pub(super) name: LayerFileName,
pub(super) metadata: IndexLayerMetadata,
pub(super) access_time: SystemTime,
// TODO: an actual 'heat' score that would let secondary locations prioritize downloading
// the hottest layers, rather than trying to simply mirror whatever layers are on-disk on the primary.
}
impl HeatMapLayer {
pub(crate) fn new(
name: LayerFileName,
metadata: IndexLayerMetadata,
access_time: SystemTime,
) -> Self {
Self {
name,
metadata,
access_time,
}
}
}
#[serde_as]
#[derive(Serialize, Deserialize)]
pub(crate) struct HeatMapTimeline {
#[serde_as(as = "DisplayFromStr")]
pub(super) timeline_id: TimelineId,
pub(super) layers: Vec<HeatMapLayer>,
}
impl HeatMapTimeline {
pub(crate) fn new(timeline_id: TimelineId, layers: Vec<HeatMapLayer>) -> Self {
Self {
timeline_id,
layers,
}
}
}
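To make the serialized artifact concrete, a minimal sketch (assumed to live in this module, with the argument values supplied by the caller) of assembling and encoding a one-timeline heatmap the same way the writer does before uploading it.
// Sketch only: build and JSON-encode a heatmap with a single timeline and layer.
fn example_heatmap_bytes(
    timeline_id: TimelineId,
    layer_name: LayerFileName,
    metadata: IndexLayerMetadata,
) -> serde_json::Result<Vec<u8>> {
    let heatmap = HeatMapTenant {
        timelines: vec![HeatMapTimeline::new(
            timeline_id,
            vec![HeatMapLayer::new(layer_name, metadata, SystemTime::now())],
        )],
    };
    // This is the document uploaded to remote_heatmap_path(&tenant_id) and read
    // back by the secondary downloader.
    serde_json::to_vec(&heatmap)
}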

View File

@@ -0,0 +1,380 @@
use std::{
collections::HashMap,
sync::{Arc, Weak},
time::{Duration, Instant},
};
use crate::{
metrics::SECONDARY_MODE,
tenant::{
mgr::{self, TenantManager},
remote_timeline_client::remote_heatmap_path,
secondary::CommandResponse,
Tenant,
},
};
use pageserver_api::models::TenantState;
use remote_storage::GenericRemoteStorage;
use tokio::task::JoinSet;
use tokio_util::sync::CancellationToken;
use tracing::Instrument;
use utils::{backoff, completion::Barrier, id::TenantId};
use super::{heatmap::HeatMapTenant, CommandRequest, UploadCommand};
/// Period between heatmap writer walking Tenants to look for work to do
const HEATMAP_WAKE_INTERVAL: Duration = Duration::from_millis(1000);
/// Period between heatmap writes for each Tenant
const HEATMAP_UPLOAD_INTERVAL: Duration = Duration::from_millis(60000);
/// While we take a CancellationToken here, it is subordinate to the CancellationTokens
/// of tenants: i.e. we expect all Tenants to have been shut down before we are shut down, otherwise
/// we might block waiting on a Tenant.
pub(super) async fn heatmap_writer_task(
tenant_manager: Arc<TenantManager>,
remote_storage: GenericRemoteStorage,
mut command_queue: tokio::sync::mpsc::Receiver<CommandRequest<UploadCommand>>,
background_jobs_can_start: Barrier,
cancel: CancellationToken,
) -> anyhow::Result<()> {
let mut writer = HeatmapWriter {
tenant_manager,
remote_storage,
cancel: cancel.clone(),
tasks: JoinSet::new(),
tenants: HashMap::new(),
tenants_writing: HashMap::new(),
concurrent_writes: 8,
};
tracing::info!("Waiting for background_jobs_can start...");
background_jobs_can_start.wait().await;
tracing::info!("background_jobs_can is ready, proceeding.");
while !cancel.is_cancelled() {
writer.iteration().await?;
tokio::select! {
_ = cancel.cancelled() => {
tracing::info!("Heatmap writer joining tasks");
tracing::info!("Heatmap writer terminating");
break;
},
_ = tokio::time::sleep(HEATMAP_WAKE_INTERVAL) => {},
cmd = command_queue.recv() => {
let cmd = match cmd {
Some(c) => c,
None => {
// SecondaryController was destroyed, and this has raced with
// our CancellationToken
tracing::info!("Heatmap writer terminating");
break;
}
};
let CommandRequest{
response_tx,
payload
} = cmd;
let result = writer.handle_command(payload).await;
if response_tx.send(CommandResponse{result}).is_err() {
// Caller went away, e.g. because an HTTP request timed out
tracing::info!("Dropping response to administrative command")
}
}
}
}
Ok(())
}
struct WriteInProgress {
barrier: Barrier,
}
struct WriteComplete {
tenant_id: TenantId,
completed_at: Instant,
}
/// The heatmap writer keeps a little bit of per-tenant state, mainly to remember
/// when we last did a write. We only populate this after doing at least one
/// write for a tenant -- this avoids holding state for tenants that have
/// uploads disabled.
struct WriterTenantState {
// This Weak only exists to enable culling WriterTenantState entries
// when the Tenant has been deallocated.
tenant: Weak<Tenant>,
last_write: Option<Instant>,
}
struct HeatmapWriter {
tenant_manager: Arc<TenantManager>,
remote_storage: GenericRemoteStorage,
cancel: CancellationToken,
tenants: HashMap<TenantId, WriterTenantState>,
tenants_writing: HashMap<TenantId, WriteInProgress>,
tasks: JoinSet<WriteComplete>,
concurrent_writes: usize,
}
impl HeatmapWriter {
/// Periodic execution phase: check for new work to do, and run it with `spawn_write`
async fn iteration(&mut self) -> anyhow::Result<()> {
self.drain().await;
// Cull any entries in self.tenants whose Arc<Tenant> is gone
self.tenants.retain(|_k, v| v.tenant.upgrade().is_some());
// Cannot spawn more work right now
if self.tenants_writing.len() >= self.concurrent_writes {
return Ok(());
}
// Iterate over tenants looking for work to do.
let tenants = self.tenant_manager.get_attached_tenants();
for tenant in tenants {
// Can't spawn any more work, drop out
if self.tenants_writing.len() >= self.concurrent_writes {
return Ok(());
}
// Process is shutting down, drop out
if self.cancel.is_cancelled() {
return Ok(());
}
// Skip tenants that don't have heatmaps enabled
if !tenant.get_enable_heatmap() {
continue;
}
// Skip tenants that aren't in a stable active state
if tenant.current_state() != TenantState::Active {
continue;
}
// Skip tenants that already have a write in flight
if self.tenants_writing.contains_key(&tenant.get_tenant_id()) {
continue;
}
// TODO: add a TenantConf for whether to upload at all. This is useful for
// a single-location mode for cheap tenants that don't require HA.
// TODO: add a mechanism to check whether the active layer set has
// changed since our last write
self.maybe_spawn_write(tenant);
}
Ok(())
}
async fn drain(&mut self) {
// Drain any complete background operations
loop {
tokio::select!(
biased;
Some(r) = self.tasks.join_next() => {
match r {
Ok(r) => {
self.on_completion(r);
},
Err(e) => {
// This should not happen, but needn't be fatal.
tracing::error!("Join error on heatmap writer JoinSet! {e}");
}
}
}
else => {
break;
}
)
}
}
fn maybe_spawn_write(&mut self, tenant: Arc<Tenant>) {
// Create an entry in self.tenants if one doesn't already exist: this will later be updated
// with the completion time in on_completion.
let state = self
.tenants
.entry(tenant.get_tenant_id())
.or_insert_with(|| WriterTenantState {
tenant: Arc::downgrade(&tenant),
last_write: None,
});
// Decline to do the upload if insufficient time has passed
if let Some(last_write) = state.last_write {
if Instant::now().duration_since(last_write) < HEATMAP_UPLOAD_INTERVAL {
return;
}
}
self.spawn_write(tenant)
}
fn spawn_write(&mut self, tenant: Arc<Tenant>) {
let remote_storage = self.remote_storage.clone();
let tenant_id = tenant.get_tenant_id();
let (completion, barrier) = utils::completion::channel();
self.tasks.spawn(async move {
// Guard for the barrier in [`WriteInProgress`]
let _completion = completion;
match write_tenant(remote_storage, &tenant)
.instrument(tracing::info_span!(
"write_tenant",
tenant_id = %tenant.get_tenant_id()
))
.await
{
Ok(()) => {}
Err(e) => {
tracing::warn!(
"Failed to upload heatmap for tenant {}: {e:#}",
tenant.get_tenant_id(),
)
}
}
WriteComplete {
tenant_id: tenant.get_tenant_id(),
completed_at: Instant::now(),
}
});
self.tenants_writing
.insert(tenant_id, WriteInProgress { barrier });
}
fn on_completion(&mut self, completion: WriteComplete) {
tracing::debug!(tenant_id=%completion.tenant_id, "Heatmap write task complete");
self.tenants_writing.remove(&completion.tenant_id);
tracing::debug!("Task completed for tenant {}", completion.tenant_id);
use std::collections::hash_map::Entry;
match self.tenants.entry(completion.tenant_id) {
Entry::Vacant(_) => {
// Tenant state was dropped, nothing to update.
}
Entry::Occupied(mut entry) => {
entry.get_mut().last_write = Some(completion.completed_at)
}
}
}
async fn handle_command(&mut self, command: UploadCommand) -> anyhow::Result<()> {
match command {
UploadCommand::Upload(tenant_id) => {
// If an upload was ongoing for this tenant, let it finish first.
if let Some(writing_state) = self.tenants_writing.get(&tenant_id) {
tracing::info!(%tenant_id, "Waiting for heatmap write to complete");
writing_state.barrier.clone().wait().await;
}
// Spawn the upload then immediately wait for it. This will block processing of other commands and
// starting of other background work.
tracing::info!(%tenant_id, "Starting heatmap write on command");
let tenant = mgr::get_tenant(tenant_id, true)?;
self.spawn_write(tenant);
let writing_state = self
.tenants_writing
.get(&tenant_id)
.expect("We just inserted this");
tracing::info!(%tenant_id, "Waiting for heatmap write to complete");
writing_state.barrier.clone().wait().await;
tracing::info!(%tenant_id, "Heatmap write complete");
// This drain is not necessary for correctness, but it is polite to avoid intentionally leaving
// our completed task in self.tenants_writing.
self.drain().await;
Ok(())
}
}
}
}
async fn write_tenant(
remote_storage: GenericRemoteStorage,
tenant: &Arc<Tenant>,
) -> anyhow::Result<()> {
let mut heatmap = HeatMapTenant {
timelines: Vec::new(),
};
let timelines = tenant.timelines.lock().unwrap().clone();
let tenant_cancel = tenant.cancel.clone();
// Ensure that Tenant::shutdown waits for any upload in flight: this is needed because otherwise
// when we delete a tenant, we might race with an upload in flight and end up leaving a heatmap behind
// in remote storage.
let _guard = match tenant.gate.enter() {
Ok(g) => g,
Err(_) => {
tracing::info!("Skipping heatmap upload for tenant which is shutting down");
return Ok(());
}
};
for (timeline_id, timeline) in timelines {
let heatmap_timeline = timeline.generate_heatmap().await;
match heatmap_timeline {
None => {
tracing::debug!(
"Skipping heatmap upload because timeline {timeline_id} is not ready"
);
return Ok(());
}
Some(heatmap_timeline) => {
heatmap.timelines.push(heatmap_timeline);
}
}
}
// Serialize the heatmap
let bytes = serde_json::to_vec(&heatmap)?;
let size = bytes.len();
let path = remote_heatmap_path(&tenant.get_tenant_id());
// Write the heatmap.
tracing::debug!("Uploading {size} byte heatmap to {path}");
if let Err(e) = backoff::retry(
|| async {
let bytes = tokio::io::BufReader::new(std::io::Cursor::new(bytes.clone()));
let bytes = Box::new(bytes);
remote_storage
.upload_storage_object(bytes, size, &path)
.await
},
|_| false,
3,
u32::MAX,
"Uploading heatmap",
backoff::Cancel::new(tenant_cancel.clone(), || anyhow::anyhow!("Shutting down")),
)
.await
{
if tenant_cancel.is_cancelled() {
return Ok(());
} else {
return Err(e);
}
}
SECONDARY_MODE.upload_heatmap.inc();
tracing::info!("Successfully uploaded {size} byte heatmap to {path}");
Ok(())
}

View File

@@ -8,6 +8,7 @@ use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
use std::sync::{Arc, Weak};
use std::time::SystemTime;
use tracing::Instrument;
use utils::id::{TenantId, TimelineId};
use utils::lsn::Lsn;
use utils::sync::heavier_once_cell;
@@ -91,7 +92,7 @@ impl Layer {
let owner = Layer(Arc::new(LayerInner::new(
conf,
timeline,
Arc::downgrade(timeline),
access_stats,
desc,
None,
@@ -103,6 +104,33 @@ impl Layer {
owner
}
/// A layer which is resident locally in a secondary location: not associated with a live Timeline object.
pub(crate) fn for_secondary(
conf: &'static PageServerConf,
tenant_id: &TenantId,
timeline_id: &TimelineId,
file_name: LayerFileName,
metadata: LayerFileMetadata,
) -> Self {
let desc = PersistentLayerDesc::from_filename(
*tenant_id,
*timeline_id,
file_name,
metadata.file_size(),
);
let access_stats = LayerAccessStats::for_loading_layer(LayerResidenceStatus::Evicted);
Layer(Arc::new(LayerInner::new(
conf,
Weak::default(),
access_stats,
desc,
None,
metadata.generation,
)))
}
/// Creates a Layer value for a file we know to be resident in timeline directory.
pub(crate) fn for_resident(
conf: &'static PageServerConf,
@@ -131,7 +159,7 @@ impl Layer {
LayerInner::new(
conf,
timeline,
Arc::downgrade(timeline),
access_stats,
desc,
Some(inner),
@@ -174,7 +202,7 @@ impl Layer {
);
LayerInner::new(
conf,
timeline,
Arc::downgrade(timeline),
access_stats,
desc,
Some(inner),
@@ -517,14 +545,14 @@ impl Drop for LayerInner {
impl LayerInner {
fn new(
conf: &'static PageServerConf,
timeline: &Arc<Timeline>,
timeline: Weak<Timeline>,
access_stats: LayerAccessStats,
desc: PersistentLayerDesc,
downloaded: Option<Arc<DownloadedLayer>>,
generation: Generation,
) -> Self {
let path = conf
.timeline_path(&timeline.tenant_id, &timeline.timeline_id)
.timeline_path(&desc.tenant_id, &desc.timeline_id)
.join(desc.filename().to_string());
let (inner, version) = if let Some(inner) = downloaded {
@@ -535,12 +563,17 @@ impl LayerInner {
(heavier_once_cell::OnceCell::default(), 0)
};
let have_remote_client = timeline
.upgrade()
.map(|t| t.remote_client.is_some())
.unwrap_or(false);
LayerInner {
conf,
path,
desc,
timeline: Arc::downgrade(timeline),
have_remote_client: timeline.remote_client.is_some(),
timeline,
have_remote_client,
access_stats,
wanted_garbage_collected: AtomicBool::new(false),
wanted_evicted: AtomicBool::new(false),

View File

@@ -88,8 +88,9 @@ use self::logical_size::LogicalSize;
use self::walreceiver::{WalReceiver, WalReceiverConf};
use super::config::TenantConf;
use super::remote_timeline_client::index::IndexPart;
use super::remote_timeline_client::index::{IndexLayerMetadata, IndexPart};
use super::remote_timeline_client::RemoteTimelineClient;
use super::secondary::heatmap::{HeatMapLayer, HeatMapTimeline};
use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf};
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
@@ -2015,6 +2016,55 @@ impl Timeline {
None
}
/// The timeline heatmap is a hint to secondary locations from the primary location,
/// indicating which layers are currently on-disk on the primary.
///
/// None is returned if the Timeline is in a state where uploading a heatmap
/// doesn't make sense, such as shutting down or initializing. The caller
/// should treat this as a cue to simply skip doing any heatmap uploading
/// for this timeline.
pub(crate) async fn generate_heatmap(&self) -> Option<HeatMapTimeline> {
let eviction_info = self.get_local_layers_for_disk_usage_eviction().await;
let remote_client = match &self.remote_client {
Some(c) => c,
None => return None,
};
let layer_file_names = eviction_info
.resident_layers
.iter()
.map(|l| l.layer.layer_desc().filename())
.collect::<Vec<_>>();
let decorated = match remote_client.get_layers_metadata(layer_file_names) {
Ok(d) => d,
Err(_) => {
// Getting metadata only fails when the Timeline is in a bad state.
return None;
}
};
let heatmap_layers = std::iter::zip(
eviction_info.resident_layers.into_iter(),
decorated.into_iter(),
)
.filter_map(|(layer, remote_info)| {
remote_info.map(|remote_info| {
HeatMapLayer::new(
layer.layer.layer_desc().filename(),
IndexLayerMetadata::from(remote_info),
layer.last_activity_ts,
)
})
});
Some(HeatMapTimeline::new(
self.timeline_id,
heatmap_layers.collect(),
))
}
}
type TraversalId = String;

View File

@@ -26,6 +26,7 @@ use tracing::{debug, error, info, info_span, instrument, warn, Instrument};
use crate::{
context::{DownloadBehavior, RequestContext},
pgdatadir_mapping::CollectKeySpaceError,
task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
tenant::{
config::{EvictionPolicy, EvictionPolicyLayerAccessThreshold},
@@ -397,9 +398,16 @@ impl Timeline {
if size.is_err() {
// ignore, see above comment
} else {
warn!(
"failed to collect keyspace but succeeded in calculating logical size: {e:#}"
);
match e {
CollectKeySpaceError::Cancelled => {
// Shutting down, ignore
}
err => {
warn!(
"failed to collect keyspace but succeeded in calculating logical size: {err:#}"
);
}
}
}
}
}

View File

@@ -18,3 +18,15 @@ pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id() {
panic!("missing extractors: {missing:?}")
}
}
#[cfg(not(debug_assertions))]
pub(crate) fn debug_assert_current_span_has_tenant_id() {}
#[cfg(debug_assertions)]
#[track_caller]
pub(crate) fn debug_assert_current_span_has_tenant_id() {
let fields: [&dyn Extractor; 1] = [&*crate::tenant::span::TENANT_ID_EXTRACTOR];
if let Err(missing) = check_fields_present!(fields) {
panic!("missing extractors: {missing:?}")
}
}
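For illustration, a hedged sketch of the calling convention these assertions check: per-tenant work is expected to run inside a tracing span that carries a tenant_id field (assuming tracing::Instrument and utils::id::TenantId are in scope at the call site).
// Sketch only: the assertion passes because the enclosing span records tenant_id.
async fn do_tenant_work(tenant_id: TenantId) {
    async {
        crate::tenant::debug_assert_current_span_has_tenant_id();
        // ... per-tenant work ...
    }
    .instrument(tracing::info_span!("do_tenant_work", tenant_id = %tenant_id))
    .await
}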

View File

@@ -28,6 +28,7 @@ import jwt
import psycopg2
import pytest
import requests
import toml
from _pytest.config import Config
from _pytest.config.argparsing import Parser
from _pytest.fixtures import FixtureRequest
@@ -1778,7 +1779,7 @@ class NeonPageserver(PgProtocol):
@property
def workdir(self) -> Path:
return Path(os.path.join(self.env.repo_dir, f"pageserver_{self.id}"))
return self.env.repo_dir / f"pageserver_{self.id}"
def assert_no_errors(self):
logfile = open(os.path.join(self.workdir, "pageserver.log"), "r")
@@ -1859,6 +1860,27 @@ class NeonPageserver(PgProtocol):
client = self.http_client()
return client.tenant_attach(tenant_id, config, config_null, generation=generation)
def tenant_location_configure(self, tenant_id: TenantId, config: dict[str, Any], **kwargs):
# This API is only for use when generations are enabled
assert self.env.attachment_service is not None
if config["mode"].startswith("Attached") and "generation" not in config:
config["generation"] = self.env.attachment_service.attach_hook(tenant_id, self.id)
client = self.http_client()
return client.tenant_location_conf(tenant_id, config, **kwargs)
def read_tenant_location_conf(self, tenant_id: TenantId) -> dict[str, Any]:
path = self.tenant_dir(tenant_id) / "config-v1"
log.info(f"Reading location conf from {path}")
with open(path, "r") as f:
raw = f.read()
try:
decoded: dict[str, Any] = toml.loads(raw)
return decoded
except Exception:
log.error(f"Failed to decode LocationConf, raw content ({len(raw)} bytes): {raw}")
raise
def append_pageserver_param_overrides(
params_to_update: List[str],
@@ -1871,6 +1893,8 @@ def append_pageserver_param_overrides(
params_to_update.append(
f"--pageserver-config-override=remote_storage={remote_storage_toml_table}"
)
else:
params_to_update.append('--pageserver-config-override=remote_storage=""')
env_overrides = os.getenv("NEON_PAGESERVER_OVERRIDES")
if env_overrides is not None:
@@ -2719,6 +2743,7 @@ class EndpointFactory:
lsn: Optional[Lsn] = None,
hot_standby: bool = False,
config_lines: Optional[List[str]] = None,
pageserver_id: Optional[int] = None,
) -> Endpoint:
ep = Endpoint(
self.env,
@@ -2738,6 +2763,7 @@ class EndpointFactory:
lsn=lsn,
hot_standby=hot_standby,
config_lines=config_lines,
pageserver_id=pageserver_id,
)
def stop_all(self) -> "EndpointFactory":
@@ -3079,7 +3105,7 @@ def pytest_addoption(parser: Parser):
SMALL_DB_FILE_NAME_REGEX: re.Pattern = re.compile( # type: ignore[type-arg]
r"config|metadata|.+\.(?:toml|pid|json|sql)"
r"config|config-v1|heatmap-v1|metadata|.+\.(?:toml|pid|json|sql)"
)

View File

@@ -141,7 +141,7 @@ class PageserverHttpClient(requests.Session):
# (this may change in future if we do fault injection of a kind that causes
# requests TCP flows to stick)
read=False,
backoff_factor=0,
backoff_factor=0.2,
status_forcelist=[503],
allowed_methods=None,
remove_headers_on_redirect=[],
@@ -251,6 +251,23 @@ class PageserverHttpClient(requests.Session):
res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/detach", params=params)
self.verbose_error(res)
def tenant_location_conf(
self, tenant_id: TenantId, location_conf: dict[str, Any], flush_ms=None
):
body = location_conf.copy()
body["tenant_id"] = str(tenant_id)
params = {}
if flush_ms is not None:
params["flush_ms"] = str(flush_ms)
res = self.put(
f"http://localhost:{self.port}/v1/tenant/{tenant_id}/location_config",
json=body,
params=params,
)
self.verbose_error(res)
def tenant_delete(self, tenant_id: TenantId):
res = self.delete(f"http://localhost:{self.port}/v1/tenant/{tenant_id}")
self.verbose_error(res)
@@ -654,6 +671,14 @@ class PageserverHttpClient(requests.Session):
res = self.put(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/break")
self.verbose_error(res)
def secondary_tenant_upload(self, tenant_id: TenantId):
res = self.post(f"http://localhost:{self.port}/v1/secondary/{tenant_id}/upload")
self.verbose_error(res)
def secondary_tenant_download(self, tenant_id: TenantId):
res = self.post(f"http://localhost:{self.port}/v1/secondary/{tenant_id}/download")
self.verbose_error(res)
def post_tracing_event(self, level: str, message: str):
res = self.post(
f"http://localhost:{self.port}/v1/tracing/event",

View File

@@ -47,7 +47,7 @@ def wait_for_upload(
for i in range(20):
current_lsn = remote_consistent_lsn(pageserver_http, tenant, timeline)
if current_lsn >= lsn:
log.info("wait finished")
log.info(f"wait finished: current remote consistent lsn {current_lsn}")
return
lr_lsn = last_record_lsn(pageserver_http, tenant, timeline)
log.info(

View File

@@ -0,0 +1,148 @@
from typing import Optional
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
Endpoint,
NeonEnv,
last_flush_lsn_upload,
wait_for_last_flush_lsn,
)
from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload
from fixtures.types import TenantId, TimelineId
class Workload:
"""
This is not a general purpose load generator: it exists for storage tests that need to inject some
high level types of storage work via the postgres interface:
- layer writes (`write_rows`)
- work for compaction (`churn_rows`)
- reads, checking we get the right data (`validate`)
"""
def __init__(self, env: NeonEnv, tenant_id: TenantId, timeline_id: TimelineId):
self.env = env
self.tenant_id = tenant_id
self.timeline_id = timeline_id
self.table = "foo"
self.expect_rows = 0
self.churn_cursor = 0
self._endpoint: Optional[Endpoint] = None
def endpoint(self, pageserver_id: int) -> Endpoint:
if self._endpoint is None:
self._endpoint = self.env.endpoints.create(
"main",
tenant_id=self.tenant_id,
pageserver_id=pageserver_id,
endpoint_id="ep-workload",
)
self._endpoint.start(pageserver_id=pageserver_id)
else:
self._endpoint.reconfigure(pageserver_id=pageserver_id)
connstring = self._endpoint.safe_psql(
"SELECT setting FROM pg_settings WHERE name='neon.pageserver_connstring'"
)
log.info(f"Workload.endpoint: connstr={connstring}")
return self._endpoint
def __del__(self):
if self._endpoint is not None:
self._endpoint.stop()
def init(self, pageserver_id: int):
endpoint = self.endpoint(pageserver_id)
endpoint.safe_psql(f"CREATE TABLE {self.table} (id INTEGER PRIMARY KEY, val text);")
endpoint.safe_psql("CREATE EXTENSION IF NOT EXISTS neon_test_utils;")
last_flush_lsn_upload(
self.env, endpoint, self.tenant_id, self.timeline_id, pageserver_id=pageserver_id
)
def write_rows(self, n, pageserver_id):
endpoint = self.endpoint(pageserver_id)
start = self.expect_rows
end = start + n - 1
self.expect_rows += n
dummy_value = "blah"
endpoint.safe_psql(
f"""
INSERT INTO {self.table} (id, val)
SELECT g, '{dummy_value}'
FROM generate_series({start}, {end}) g
"""
)
return last_flush_lsn_upload(
self.env, endpoint, self.tenant_id, self.timeline_id, pageserver_id=pageserver_id
)
def churn_rows(self, n, pageserver_id, upload=True):
assert self.expect_rows >= n
max_iters = 10
endpoint = self.endpoint(pageserver_id)
todo = n
i = 0
while todo > 0:
i += 1
if i > max_iters:
raise RuntimeError("oops")
start = self.churn_cursor % self.expect_rows
n_iter = min((self.expect_rows - start), todo)
todo -= n_iter
end = start + n_iter - 1
log.info(
f"start,end = {start},{end}, cursor={self.churn_cursor}, expect_rows={self.expect_rows}"
)
assert end < self.expect_rows
self.churn_cursor += n_iter
dummy_value = "blah"
endpoint.safe_psql_many(
[
f"""
INSERT INTO {self.table} (id, val)
SELECT g, '{dummy_value}'
FROM generate_series({start}, {end}) g
ON CONFLICT (id) DO UPDATE
SET val = EXCLUDED.val
""",
f"VACUUM {self.table}",
]
)
last_flush_lsn = wait_for_last_flush_lsn(
self.env, endpoint, self.tenant_id, self.timeline_id, pageserver_id=pageserver_id
)
ps_http = self.env.get_pageserver(pageserver_id).http_client()
wait_for_last_record_lsn(ps_http, self.tenant_id, self.timeline_id, last_flush_lsn)
if upload:
# force a checkpoint to trigger upload
ps_http.timeline_checkpoint(self.tenant_id, self.timeline_id)
wait_for_upload(ps_http, self.tenant_id, self.timeline_id, last_flush_lsn)
log.info(f"Churn: waiting for remote LSN {last_flush_lsn}")
else:
log.info(f"Churn: not waiting for upload, disk LSN {last_flush_lsn}")
def validate(self, pageserver_id):
endpoint = self.endpoint(pageserver_id)
result = endpoint.safe_psql_many(
[
"select clear_buffer_cache()",
f"""
SELECT COUNT(*) FROM {self.table}
""",
]
)
log.info(f"validate({self.expect_rows}): {result}")
assert result == [[("",)], [(self.expect_rows,)]]
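
For orientation, this is roughly how the tests later in this diff drive the Workload fixture; a sketch assuming an already-started NeonEnv with at least one pageserver and an existing tenant_id/timeline_id:

workload = Workload(env, tenant_id, timeline_id)
workload.init(env.pageservers[0].id)               # create table + neon_test_utils
workload.write_rows(256, env.pageservers[0].id)    # generate layer writes
workload.churn_rows(128, env.pageservers[0].id)    # overwrite rows, feed compaction
workload.validate(env.pageservers[0].id)           # read back and check row count
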

View File

@@ -8,6 +8,7 @@ from fixtures.log_helper import log
from fixtures.neon_fixtures import (
NeonEnv,
NeonEnvBuilder,
NeonPageserver,
PgBin,
wait_for_last_flush_lsn,
)
@@ -73,14 +74,21 @@ class EvictionEnv:
layer_size: int
pgbench_init_lsns: Dict[TenantId, Lsn]
def timelines_du(self) -> Tuple[int, int, int]:
@property
def pageserver(self):
"""
Shortcut for tests that only use one pageserver.
"""
return self.neon_env.pageserver
def timelines_du(self, pageserver: NeonPageserver) -> Tuple[int, int, int]:
return poor_mans_du(
self.neon_env, [(tid, tlid) for tid, tlid in self.timelines], verbose=False
self.neon_env, [(tid, tlid) for tid, tlid in self.timelines], pageserver, verbose=False
)
def du_by_timeline(self) -> Dict[Tuple[TenantId, TimelineId], int]:
def du_by_timeline(self, pageserver: NeonPageserver) -> Dict[Tuple[TenantId, TimelineId], int]:
return {
(tid, tlid): poor_mans_du(self.neon_env, [(tid, tlid)], verbose=True)[0]
(tid, tlid): poor_mans_du(self.neon_env, [(tid, tlid)], pageserver, verbose=True)[0]
for tid, tlid in self.timelines
}
@@ -108,7 +116,7 @@ class EvictionEnv:
_avg = cur.fetchone()
def pageserver_start_with_disk_usage_eviction(
self, period, max_usage_pct, min_avail_bytes, mock_behavior
self, pageserver: NeonPageserver, period, max_usage_pct, min_avail_bytes, mock_behavior
):
disk_usage_config = {
"period": period,
@@ -119,7 +127,12 @@ class EvictionEnv:
enc = toml.TomlEncoder()
self.neon_env.pageserver.start(
# these can sometimes happen during startup before any tenants have been
# loaded, so nothing can be evicted, we just wait for next iteration which
# is able to evict.
pageserver.allowed_errors.append(".*WARN.* disk usage still high.*")
pageserver.start(
overrides=(
"--pageserver-config-override=disk_usage_based_eviction="
+ enc.dump_inline_table(disk_usage_config).replace("\n", " "),
@@ -133,15 +146,10 @@ class EvictionEnv:
)
def statvfs_called():
assert self.neon_env.pageserver.log_contains(".*running mocked statvfs.*")
assert pageserver.log_contains(".*running mocked statvfs.*")
wait_until(10, 1, statvfs_called)
# these can sometimes happen during startup before any tenants have been
# loaded, so nothing can be evicted, we just wait for next iteration which
# is able to evict.
self.neon_env.pageserver.allowed_errors.append(".*WARN.* disk usage still high.*")
def human_bytes(amt: float) -> str:
suffixes = ["", "Ki", "Mi", "Gi"]
@@ -158,21 +166,42 @@ def human_bytes(amt: float) -> str:
@pytest.fixture
def eviction_env(request, neon_env_builder: NeonEnvBuilder, pg_bin: PgBin) -> EvictionEnv:
return _eviction_env(request, neon_env_builder, pg_bin, num_pageservers=1)
@pytest.fixture
def eviction_env_ha(request, neon_env_builder: NeonEnvBuilder, pg_bin: PgBin) -> EvictionEnv:
"""
Variant of the eviction environment with two pageservers for testing eviction on
HA configurations with a secondary location.
"""
return _eviction_env(request, neon_env_builder, pg_bin, num_pageservers=2)
def _eviction_env(
request, neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, num_pageservers: int
) -> EvictionEnv:
"""
Creates two tenants, one somewhat larger than the other.
"""
log.info(f"setting up eviction_env for test {request.node.name}")
neon_env_builder.num_pageservers = num_pageservers
neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
if num_pageservers >= 2:
neon_env_builder.enable_generations = True
# initial tenant will not be present on this pageserver
env = neon_env_builder.init_configs()
env.start()
pageserver_http = env.pageserver.http_client()
# We will create all tenants on the 0th pageserver
pageserver_http = env.pageservers[0].http_client()
# allow because we are invoking this manually; we always warn on executing disk based eviction
env.pageserver.allowed_errors.append(r".* running disk usage based eviction due to pressure.*")
for pageserver in env.pageservers:
pageserver.allowed_errors.append(r".* running disk usage based eviction due to pressure.*")
# Choose small layer_size so that we can use low pgbench_scales and still get a large count of layers.
# Large count of layers and small layer size is good for testing because it makes evictions predictable.
@@ -197,7 +226,7 @@ def eviction_env(request, neon_env_builder: NeonEnvBuilder, pg_bin: PgBin) -> Ev
with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint:
pg_bin.run(["pgbench", "-i", f"-s{scale}", endpoint.connstr()])
wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id, pageserver_id=1)
timelines.append((tenant_id, timeline_id))
@@ -245,10 +274,10 @@ def test_broken_tenants_are_skipped(eviction_env: EvictionEnv):
healthy_tenant_id, healthy_timeline_id = env.timelines[1]
broken_size_pre, _, _ = poor_mans_du(
env.neon_env, [(broken_tenant_id, broken_timeline_id)], verbose=True
env.neon_env, [(broken_tenant_id, broken_timeline_id)], env.pageserver, verbose=True
)
healthy_size_pre, _, _ = poor_mans_du(
env.neon_env, [(healthy_tenant_id, healthy_timeline_id)], verbose=True
env.neon_env, [(healthy_tenant_id, healthy_timeline_id)], env.pageserver, verbose=True
)
# try to evict everything, then validate that broken tenant wasn't touched
@@ -258,10 +287,10 @@ def test_broken_tenants_are_skipped(eviction_env: EvictionEnv):
log.info(f"{response}")
broken_size_post, _, _ = poor_mans_du(
env.neon_env, [(broken_tenant_id, broken_timeline_id)], verbose=True
env.neon_env, [(broken_tenant_id, broken_timeline_id)], env.pageserver, verbose=True
)
healthy_size_post, _, _ = poor_mans_du(
env.neon_env, [(healthy_tenant_id, healthy_timeline_id)], verbose=True
env.neon_env, [(healthy_tenant_id, healthy_timeline_id)], env.pageserver, verbose=True
)
assert broken_size_pre == broken_size_post, "broken tenant should not be touched"
@@ -277,14 +306,14 @@ def test_pageserver_evicts_until_pressure_is_relieved(eviction_env: EvictionEnv)
env = eviction_env
pageserver_http = env.pageserver_http
(total_on_disk, _, _) = env.timelines_du()
(total_on_disk, _, _) = env.timelines_du(env.pageserver)
target = total_on_disk // 2
response = pageserver_http.disk_usage_eviction_run({"evict_bytes": target})
log.info(f"{response}")
(later_total_on_disk, _, _) = env.timelines_du()
(later_total_on_disk, _, _) = env.timelines_du(env.pageserver)
actual_change = total_on_disk - later_total_on_disk
@@ -303,8 +332,8 @@ def test_pageserver_respects_overridden_resident_size(eviction_env: EvictionEnv)
env = eviction_env
ps_http = env.pageserver_http
(total_on_disk, _, _) = env.timelines_du()
du_by_timeline = env.du_by_timeline()
(total_on_disk, _, _) = env.timelines_du(env.pageserver)
du_by_timeline = env.du_by_timeline(env.pageserver)
log.info("du_by_timeline: %s", du_by_timeline)
assert len(du_by_timeline) == 2, "this test assumes two tenants"
@@ -344,8 +373,8 @@ def test_pageserver_respects_overridden_resident_size(eviction_env: EvictionEnv)
GLOBAL_LRU_LOG_LINE,
), "this test is pointless if it fell back to global LRU"
(later_total_on_disk, _, _) = env.timelines_du()
later_du_by_timeline = env.du_by_timeline()
(later_total_on_disk, _, _) = env.timelines_du(env.pageserver)
later_du_by_timeline = env.du_by_timeline(env.pageserver)
log.info("later_du_by_timeline: %s", later_du_by_timeline)
actual_change = total_on_disk - later_total_on_disk
@@ -373,13 +402,13 @@ def test_pageserver_falls_back_to_global_lru(eviction_env: EvictionEnv):
env = eviction_env
ps_http = env.pageserver_http
(total_on_disk, _, _) = env.timelines_du()
(total_on_disk, _, _) = env.timelines_du(env.pageserver)
target = total_on_disk
response = ps_http.disk_usage_eviction_run({"evict_bytes": target})
log.info(f"{response}")
(later_total_on_disk, _, _) = env.timelines_du()
(later_total_on_disk, _, _) = env.timelines_du(env.pageserver)
actual_change = total_on_disk - later_total_on_disk
assert 0 <= actual_change, "nothing can load layers during this test"
assert actual_change >= target, "eviction must always evict more than target"
@@ -399,8 +428,8 @@ def test_partial_evict_tenant(eviction_env: EvictionEnv):
env = eviction_env
ps_http = env.pageserver_http
(total_on_disk, _, _) = env.timelines_du()
du_by_timeline = env.du_by_timeline()
(total_on_disk, _, _) = env.timelines_du(env.pageserver)
du_by_timeline = env.du_by_timeline(env.pageserver)
# pick any tenant
[warm, cold] = list(du_by_timeline.keys())
@@ -416,12 +445,12 @@ def test_partial_evict_tenant(eviction_env: EvictionEnv):
response = ps_http.disk_usage_eviction_run({"evict_bytes": target})
log.info(f"{response}")
(later_total_on_disk, _, _) = env.timelines_du()
(later_total_on_disk, _, _) = env.timelines_du(env.pageserver)
actual_change = total_on_disk - later_total_on_disk
assert 0 <= actual_change, "nothing can load layers during this test"
assert actual_change >= target, "eviction must always evict more than target"
later_du_by_timeline = env.du_by_timeline()
later_du_by_timeline = env.du_by_timeline(env.pageserver)
for tenant, later_tenant_usage in later_du_by_timeline.items():
assert (
later_tenant_usage < du_by_timeline[tenant]
@@ -453,7 +482,10 @@ def test_partial_evict_tenant(eviction_env: EvictionEnv):
def poor_mans_du(
env: NeonEnv, timelines: list[Tuple[TenantId, TimelineId]], verbose: bool = False
env: NeonEnv,
timelines: list[Tuple[TenantId, TimelineId]],
pageserver: NeonPageserver,
verbose: bool = False,
) -> Tuple[int, int, int]:
"""
Disk usage, largest, smallest layer for layer files over the given (tenant, timeline) tuples;
@@ -463,7 +495,7 @@ def poor_mans_du(
largest_layer = 0
smallest_layer = None
for tenant_id, timeline_id in timelines:
timeline_dir = env.pageserver.timeline_dir(tenant_id, timeline_id)
timeline_dir = pageserver.timeline_dir(tenant_id, timeline_id)
assert timeline_dir.exists(), f"timeline dir does not exist: {timeline_dir}"
total = 0
for file in timeline_dir.iterdir():
@@ -494,6 +526,7 @@ def test_statvfs_error_handling(eviction_env: EvictionEnv):
env = eviction_env
env.neon_env.pageserver.stop()
env.pageserver_start_with_disk_usage_eviction(
env.pageserver,
period="1s",
max_usage_pct=90,
min_avail_bytes=0,
@@ -517,11 +550,12 @@ def test_statvfs_pressure_usage(eviction_env: EvictionEnv):
env.neon_env.pageserver.stop()
# make it seem like we're at 100% utilization by setting total bytes to the used bytes
total_size, _, _ = env.timelines_du()
total_size, _, _ = env.timelines_du(env.pageserver)
blocksize = 512
total_blocks = (total_size + (blocksize - 1)) // blocksize
env.pageserver_start_with_disk_usage_eviction(
env.pageserver,
period="1s",
max_usage_pct=33,
min_avail_bytes=0,
@@ -540,7 +574,7 @@ def test_statvfs_pressure_usage(eviction_env: EvictionEnv):
wait_until(10, 1, relieved_log_message)
post_eviction_total_size, _, _ = env.timelines_du()
post_eviction_total_size, _, _ = env.timelines_du(env.pageserver)
assert post_eviction_total_size <= 0.33 * total_size, "we requested max 33% usage"
@@ -555,13 +589,14 @@ def test_statvfs_pressure_min_avail_bytes(eviction_env: EvictionEnv):
env.neon_env.pageserver.stop()
# make it seem like we're at 100% utilization by setting total bytes to the used bytes
total_size, _, _ = env.timelines_du()
total_size, _, _ = env.timelines_du(env.pageserver)
blocksize = 512
total_blocks = (total_size + (blocksize - 1)) // blocksize
min_avail_bytes = total_size // 3
env.pageserver_start_with_disk_usage_eviction(
env.pageserver,
period="1s",
max_usage_pct=100,
min_avail_bytes=min_avail_bytes,
@@ -580,7 +615,66 @@ def test_statvfs_pressure_min_avail_bytes(eviction_env: EvictionEnv):
wait_until(10, 1, relieved_log_message)
post_eviction_total_size, _, _ = env.timelines_du()
post_eviction_total_size, _, _ = env.timelines_du(env.pageserver)
assert (
total_size - post_eviction_total_size >= min_avail_bytes
), "we requested at least min_avail_bytes worth of free space"
def test_secondary_mode_eviction(eviction_env_ha: EvictionEnv):
env = eviction_env_ha
tenant_ids = [t[0] for t in env.timelines]
log.info("Setting up secondary location...")
ps_attached = env.neon_env.pageservers[0]
ps_secondary = env.neon_env.pageservers[1]
for tenant_id in tenant_ids:
ps_secondary.tenant_location_configure(
tenant_id,
{
"mode": "Secondary",
"secondary_conf": {"warm": True},
"tenant_conf": {},
},
)
readback_conf = ps_secondary.read_tenant_location_conf(tenant_id)
log.info(f"Read back conf: {readback_conf}")
# Request secondary location to download all layers that the attached location has
ps_attached.http_client().secondary_tenant_upload(tenant_id)
ps_secondary.http_client().secondary_tenant_download(tenant_id)
# Configure the secondary pageserver to have a phony small disk size
ps_secondary.stop()
total_size, _, _ = env.timelines_du(ps_secondary)
blocksize = 512
total_blocks = (total_size + (blocksize - 1)) // blocksize
min_avail_bytes = total_size // 3
env.pageserver_start_with_disk_usage_eviction(
ps_secondary,
period="1s",
max_usage_pct=100,
min_avail_bytes=min_avail_bytes,
mock_behavior={
"type": "Success",
"blocksize": blocksize,
"total_blocks": total_blocks,
# Only count layer files towards used bytes in the mock_statvfs.
# This avoids accounting for metadata files & tenant conf in the tests.
"name_filter": ".*__.*",
},
)
def relieved_log_message():
assert ps_secondary.log_contains(".*disk usage pressure relieved")
wait_until(10, 1, relieved_log_message)
post_eviction_total_size, _, _ = env.timelines_du(ps_secondary)
assert (
total_size - post_eviction_total_size >= min_avail_bytes
), "we requested at least min_avail_bytes worth of free space"

View File

@@ -25,12 +25,19 @@ from fixtures.neon_fixtures import (
last_flush_lsn_upload,
wait_for_last_flush_lsn,
)
from fixtures.pageserver.utils import list_prefix
from fixtures.pageserver.http import PageserverApiException
from fixtures.pageserver.utils import (
assert_tenant_state,
list_prefix,
wait_for_last_record_lsn,
wait_for_upload,
)
from fixtures.remote_storage import (
RemoteStorageKind,
)
from fixtures.types import TenantId, TimelineId
from fixtures.utils import print_gc_result, wait_until
from fixtures.workload import Workload
# A tenant configuration that is convenient for generating uploads and deletions
# without a large amount of postgres traffic.
@@ -560,3 +567,91 @@ def test_eviction_across_generations(neon_env_builder: NeonEnvBuilder):
read_all(env, tenant_id, timeline_id)
evict_all_layers(env, tenant_id, timeline_id)
read_all(env, tenant_id, timeline_id)
def test_multi_attach(
neon_env_builder: NeonEnvBuilder,
pg_bin: PgBin,
):
neon_env_builder.enable_generations = True
neon_env_builder.num_pageservers = 3
neon_env_builder.enable_pageserver_remote_storage(
remote_storage_kind=RemoteStorageKind.MOCK_S3,
)
env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)
pageservers = env.pageservers
http_clients = list([p.http_client() for p in pageservers])
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
# We will intentionally create situations where stale deletions happen from non-latest-generation
# nodes when the tenant is multiply-attached
for ps in env.pageservers:
ps.allowed_errors.extend(
[".*Dropped remote consistent LSN updates.*", ".*Dropping stale deletions.*"]
)
# Initially, the tenant will be attached to the first pageserver (the default in our test harness)
wait_until(10, 0.2, lambda: assert_tenant_state(http_clients[0], tenant_id, "Active"))
_detail = http_clients[0].timeline_detail(tenant_id, timeline_id)
with pytest.raises(PageserverApiException):
http_clients[1].timeline_detail(tenant_id, timeline_id)
with pytest.raises(PageserverApiException):
http_clients[2].timeline_detail(tenant_id, timeline_id)
workload = Workload(env, tenant_id, timeline_id)
workload.init(pageservers[0].id)
workload.write_rows(1000, pageservers[0].id)
# Attach the tenant to the other two pageservers
pageservers[1].tenant_attach(env.initial_tenant)
pageservers[2].tenant_attach(env.initial_tenant)
wait_until(10, 0.2, lambda: assert_tenant_state(http_clients[1], tenant_id, "Active"))
wait_until(10, 0.2, lambda: assert_tenant_state(http_clients[2], tenant_id, "Active"))
# Now they all have it attached
_detail = http_clients[0].timeline_detail(tenant_id, timeline_id)
_detail = http_clients[1].timeline_detail(tenant_id, timeline_id)
_detail = http_clients[2].timeline_detail(tenant_id, timeline_id)
# The endpoint can use any pageserver to service its reads
for pageserver in pageservers:
workload.validate(pageserver.id)
# If we write some more data, all the nodes can see it, including stale ones
wrote_lsn = workload.write_rows(1000, pageservers[0].id)
for ps_http in http_clients:
wait_for_last_record_lsn(ps_http, tenant_id, timeline_id, wrote_lsn)
# ...and indeed endpoints can see it via any of the pageservers
for pageserver in pageservers:
workload.validate(pageserver.id)
# Prompt all the pageservers, including stale ones, to upload ingested layers to remote storage
for ps_http in http_clients:
ps_http.timeline_checkpoint(tenant_id, timeline_id)
wait_for_upload(ps_http, tenant_id, timeline_id, wrote_lsn)
# Now, the contents of remote storage will be a set of layers from each pageserver, but with unique
# generation numbers
# TODO: validate remote storage contents
# Stop all pageservers
for ps in pageservers:
ps.stop()
# Returning to a normal healthy state: all pageservers will start, but only the one most
# recently attached via the control plane will re-attach on startup
for ps in pageservers:
ps.start()
with pytest.raises(PageserverApiException):
_detail = http_clients[0].timeline_detail(tenant_id, timeline_id)
with pytest.raises(PageserverApiException):
_detail = http_clients[1].timeline_detail(tenant_id, timeline_id)
_detail = http_clients[2].timeline_detail(tenant_id, timeline_id)
# All data we wrote while multi-attached remains readable
workload.validate(pageservers[2].id)
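
The tests above lean heavily on wait_until(iterations, interval, predicate). A minimal sketch of those semantics, assuming the fixture retries the callable until it stops raising; this illustrates how the tests use it, not the fixture's exact implementation:

import time

def wait_until_sketch(iterations: int, interval: float, predicate):
    """Call `predicate` until it stops raising, up to `iterations` attempts."""
    last_exc = None
    for _ in range(iterations):
        try:
            return predicate()
        except Exception as e:  # includes assertion errors raised by the predicate
            last_exc = e
            time.sleep(interval)
    raise RuntimeError(f"timed out waiting for condition: {last_exc}")
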

View File

@@ -0,0 +1,477 @@
import random
from pathlib import Path
from typing import Any, Dict, Optional
import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserver
from fixtures.pageserver.utils import assert_prefix_empty, tenant_delete_wait_completed
from fixtures.remote_storage import RemoteStorageKind
from fixtures.types import TenantId, TimelineId
from fixtures.utils import wait_until
from fixtures.workload import Workload
# A tenant configuration that is convenient for generating uploads and deletions
# without a large amount of postgres traffic.
TENANT_CONF = {
# small checkpointing and compaction targets to ensure we generate many upload operations
"checkpoint_distance": f"{128 * 1024}",
"compaction_target_size": f"{128 * 1024}",
"compaction_threshold": "1",
# no PITR horizon, we specify the horizon when we request on-demand GC
"pitr_interval": "0s",
# disable background compaction and GC. We invoke it manually when we want it to happen.
"gc_period": "0s",
"compaction_period": "0s",
# create image layers eagerly, so that GC can remove some layers
"image_creation_threshold": "1",
}
def evict_random_layers(
rng: random.Random, pageserver: NeonPageserver, tenant_id: TenantId, timeline_id: TimelineId
):
"""
Evict 50% of the layers on a pageserver
"""
timeline_path = pageserver.timeline_dir(tenant_id, timeline_id)
initial_local_layers = sorted(
list(filter(lambda path: path.name != "metadata", timeline_path.glob("*")))
)
client = pageserver.http_client()
for layer in initial_local_layers:
if "ephemeral" in layer.name:
continue
if rng.choice([True, False]):
log.info(f"Evicting layer {tenant_id}/{timeline_id} {layer.name}")
client.evict_layer(tenant_id=tenant_id, timeline_id=timeline_id, layer_name=layer.name)
@pytest.mark.parametrize("seed", [1, 2, 3])
def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, seed: int):
"""
Issue many location configuration changes, ensure that tenants
remain readable & we don't get any unexpected errors. We should
have no ERROR in the log, and no 500s in the API.
The location_config API is intentionally designed so that all destination
states are valid, so that we may test it in this way: the API should always
work as long as the tenant exists.
"""
neon_env_builder.enable_generations = True
neon_env_builder.num_pageservers = 3
neon_env_builder.enable_pageserver_remote_storage(
remote_storage_kind=RemoteStorageKind.MOCK_S3,
)
env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)
assert env.attachment_service is not None
pageservers = env.pageservers
list([p.http_client() for p in pageservers])
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
# We will make no effort to avoid stale attachments
for ps in env.pageservers:
ps.allowed_errors.extend(
[
".*Dropped remote consistent LSN updates.*",
".*Dropping stale deletions.*",
# page_service_conn_main{peer_addr=[::1]:41176}: query handler for 'pagestream 3b19aec5038c796f64b430b30a555121 d07776761d44050b8aab511df1657d83' failed: Tenant 3b19aec5038c796f64b430b30a555121 not found
".*query handler.*Tenant.*not found.*",
# page_service_conn_main{peer_addr=[::1]:45552}: query handler for 'pagestream 414ede7ad50f775a8e7d9ba0e43b9efc a43884be16f44b3626482b6981b2c745' failed: Tenant 414ede7ad50f775a8e7d9ba0e43b9efc is not active
".*query handler.*Tenant.*not active.*",
]
)
# these can happen if we shut down at just the right moment; to be fixed as part of #5172.
message = ".*duplicated L1 layer layer=.*"
ps.allowed_errors.append(message)
workload = Workload(env, tenant_id, timeline_id)
workload.init(env.pageservers[0].id)
workload.write_rows(256, env.pageservers[0].id)
# We use a fixed seed to make the test reproducible: we want a randomly
# chosen order, but not to change the order every time we run the test.
rng = random.Random(seed)
initial_generation = 1
last_state = {
env.pageservers[0].id: ("AttachedSingle", initial_generation),
env.pageservers[1].id: ("Detached", None),
env.pageservers[2].id: ("Detached", None),
}
latest_attached = env.pageservers[0].id
for _i in range(0, 64):
# Pick a pageserver
pageserver = rng.choice(env.pageservers)
# Pick a pseudorandom state
modes = [
"AttachedSingle",
"AttachedMulti",
"AttachedStale",
"Secondary",
"Detached",
"_Evictions",
"_Restart",
]
mode = rng.choice(modes)
last_state_ps = last_state[pageserver.id]
if mode == "_Evictions":
if last_state_ps[0].startswith("Attached"):
log.info(f"Action: evictions on pageserver {pageserver.id}")
evict_random_layers(rng, pageserver, tenant_id, timeline_id)
else:
log.info(
f"Action: skipping evictions on pageserver {pageserver.id}, is not attached"
)
elif mode == "_Restart":
log.info(f"Action: restarting pageserver {pageserver.id}")
pageserver.stop()
pageserver.start()
if last_state_ps[0].startswith("Attached") and latest_attached == pageserver.id:
log.info("Entering postgres...")
workload.churn_rows(rng.randint(128, 256), pageserver.id)
workload.validate(pageserver.id)
elif last_state_ps[0].startswith("Attached"):
# The `attachment_service` will only re-attach on startup when a pageserver was the
# holder of the latest generation: otherwise the pageserver will revert to detached
# state if it was running attached with a stale generation
last_state[pageserver.id] = ("Detached", None)
else:
secondary_conf: Optional[Dict[str, Any]] = None
if mode == "Secondary":
secondary_conf = {"warm": rng.choice([True, False])}
location_conf: Dict[str, Any] = {
"mode": mode,
"secondary_conf": secondary_conf,
"tenant_conf": {},
}
log.info(f"Action: Configuring pageserver {pageserver.id} to {location_conf}")
# Select a generation number
if mode.startswith("Attached"):
if last_state_ps[1] is not None:
if rng.choice([True, False]):
# Move between attached states, staying in the same generation
generation = last_state_ps[1]
else:
# Switch generations, while also jumping between attached states
generation = env.attachment_service.attach_hook(tenant_id, pageserver.id)
latest_attached = pageserver.id
else:
generation = env.attachment_service.attach_hook(tenant_id, pageserver.id)
latest_attached = pageserver.id
else:
generation = None
location_conf["generation"] = generation
pageserver.tenant_location_configure(tenant_id, location_conf)
last_state[pageserver.id] = (mode, generation)
if mode.startswith("Attached"):
# TODO: a variant of this test that runs background endpoint workloads, as well as
# the inter-step workloads.
workload.churn_rows(
rng.randint(128, 256), pageserver.id, upload=mode != "AttachedStale"
)
workload.validate(pageserver.id)
# Attach all pageservers
for ps in env.pageservers:
location_conf = {"mode": "AttachedMulti", "secondary_conf": None, "tenant_conf": {}}
ps.tenant_location_configure(tenant_id, location_conf)
# Confirm that all are readable
for ps in env.pageservers:
workload.validate(ps.id)
# Detach all pageservers
for ps in env.pageservers:
location_conf = {"mode": "Detached", "secondary_conf": None, "tenant_conf": {}}
ps.tenant_location_configure(tenant_id, location_conf)
# Confirm that all local disk state was removed on detach
# TODO
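
To summarize the state space the churn loop walks through, here is a small sketch of how the location_config bodies are assembled; make_location_conf is an illustrative helper mirroring the bodies built above, not a fixture API:

def make_location_conf(mode: str, generation, warm: bool = True) -> dict:
    # Only Secondary locations carry a secondary_conf; Attached* modes carry the
    # generation issued by the attachment service, others pass generation=None.
    return {
        "mode": mode,
        "secondary_conf": {"warm": warm} if mode == "Secondary" else None,
        "tenant_conf": {},
        "generation": generation if mode.startswith("Attached") else None,
    }
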
def test_live_migration(neon_env_builder: NeonEnvBuilder):
"""
Test the sequence of location states that are used in a live migration.
"""
neon_env_builder.enable_generations = True
neon_env_builder.num_pageservers = 2
neon_env_builder.enable_pageserver_remote_storage(
remote_storage_kind=RemoteStorageKind.MOCK_S3,
)
env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)
assert env.attachment_service is not None
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
pageserver_a = env.pageservers[0]
pageserver_b = env.pageservers[1]
initial_generation = 1
workload = Workload(env, tenant_id, timeline_id)
workload.init(env.pageservers[0].id)
workload.write_rows(256, env.pageservers[0].id)
# Make the destination a secondary location
pageserver_b.tenant_location_configure(
tenant_id,
{
"mode": "Secondary",
"secondary_conf": {"warm": True},
"tenant_conf": {},
},
)
workload.churn_rows(64, pageserver_a.id, upload=False)
# Set origin attachment to stale
log.info("Setting origin to AttachedStale")
pageserver_a.tenant_location_configure(
tenant_id,
{
"mode": "AttachedStale",
"secondary_conf": None,
"tenant_conf": {},
"generation": initial_generation,
},
flush_ms=5000,
)
migrated_generation = env.attachment_service.attach_hook(tenant_id, pageserver_b.id)
log.info(f"Acquired generation {migrated_generation} for destination pageserver")
assert migrated_generation == initial_generation + 1
# Writes and reads still work in AttachedStale.
workload.validate(pageserver_a.id)
# Ensure that secondary location's timeline directory is populated: we will then
# do some more writes on top of that to ensure that the newly attached pageserver
# properly makes use of the downloaded layers as well as ingesting WAL to catch up.
pageserver_a.http_client().secondary_tenant_upload(tenant_id)
pageserver_b.http_client().secondary_tenant_download(tenant_id)
# Generate some more dirty writes: we expect the origin to keep ingesting WAL
# while in AttachedStale
workload.churn_rows(64, pageserver_a.id, upload=False)
workload.validate(pageserver_a.id)
# Attach the destination
log.info("Setting destination to AttachedMulti")
pageserver_b.tenant_location_configure(
tenant_id,
{
"mode": "AttachedMulti",
"secondary_conf": None,
"tenant_conf": {},
"generation": migrated_generation,
},
)
# Wait for destination LSN to catch up with origin
origin_lsn = pageserver_a.http_client().timeline_detail(tenant_id, timeline_id)[
"last_record_lsn"
]
def caught_up():
destination_lsn = pageserver_b.http_client().timeline_detail(tenant_id, timeline_id)[
"last_record_lsn"
]
log.info(
f"Waiting for LSN to catch up: origin {origin_lsn} vs destination {destination_lsn}"
)
assert destination_lsn >= origin_lsn
wait_until(100, 0.1, caught_up)
# The destination should accept writes
workload.churn_rows(64, pageserver_b.id)
# Dual attached: both are readable.
workload.validate(pageserver_a.id)
workload.validate(pageserver_b.id)
# Revert the origin to secondary
log.info("Setting origin to Secondary")
pageserver_a.tenant_location_configure(
tenant_id,
{
"mode": "Secondary",
"secondary_conf": {"warm": True},
"tenant_conf": {},
},
)
workload.churn_rows(64, pageserver_b.id)
# Put the destination into final state
pageserver_b.tenant_location_configure(
tenant_id,
{
"mode": "AttachedSingle",
"secondary_conf": None,
"tenant_conf": {},
"generation": migrated_generation,
},
)
workload.churn_rows(64, pageserver_b.id)
workload.validate(pageserver_b.id)
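
In outline, the migration above walks through the following sequence of (pageserver, state) transitions; this is a compact restatement of the steps already exercised by the test, useful as a checklist (the tuples are illustrative, not an API):

# Origin starts AttachedSingle in generation N; destination ends AttachedSingle in N+1.
LIVE_MIGRATION_STEPS = [
    ("destination", "Secondary"),       # pre-warm via heatmap upload/download
    ("origin", "AttachedStale"),        # keep serving reads, flush uploads
    ("destination", "AttachedMulti"),   # new generation, catch up on WAL
    ("origin", "Secondary"),            # stop writing, keep a warm copy
    ("destination", "AttachedSingle"),  # steady state on the new node
]
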
def list_layers(pageserver, tenant_id: TenantId, timeline_id: TimelineId) -> list[Path]:
"""
Inspect local storage on a pageserver to discover which layer files are present.
:return: list of relative paths to layers, from the timeline root.
"""
timeline_path = pageserver.timeline_dir(tenant_id, timeline_id)
def relative(p: Path) -> Path:
return p.relative_to(timeline_path)
return sorted(
list(
map(
relative,
filter(
lambda path: path.name != "metadata"
and "ephemeral" not in path.name
and "temp" not in path.name,
timeline_path.glob("*"),
),
)
)
)
def test_secondary_downloads(neon_env_builder: NeonEnvBuilder):
"""
Test the overall data flow in secondary mode:
- Heatmap uploads from the attached location
- Heatmap & layer downloads from the secondary location
- Eviction of layers on the attached location results in deletion
on the secondary location as well.
"""
neon_env_builder.enable_generations = True
neon_env_builder.num_pageservers = 2
neon_env_builder.enable_pageserver_remote_storage(
remote_storage_kind=RemoteStorageKind.MOCK_S3,
)
env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)
assert env.attachment_service is not None
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
ps_attached = env.pageservers[0]
ps_secondary = env.pageservers[1]
workload = Workload(env, tenant_id, timeline_id)
workload.init(env.pageservers[0].id)
workload.write_rows(256, ps_attached.id)
# Configure a secondary location
log.info("Setting up secondary location...")
ps_secondary.tenant_location_configure(
tenant_id,
{
"mode": "Secondary",
"secondary_conf": {"warm": True},
"tenant_conf": {},
},
)
readback_conf = ps_secondary.read_tenant_location_conf(tenant_id)
log.info(f"Read back conf: {readback_conf}")
# Explicit upload/download cycle
# ==============================
log.info("Synchronizing after initial write...")
ps_attached.http_client().secondary_tenant_upload(tenant_id)
ps_secondary.http_client().secondary_tenant_download(tenant_id)
assert list_layers(ps_attached, tenant_id, timeline_id) == list_layers(
ps_secondary, tenant_id, timeline_id
)
# Make changes on attached pageserver, check secondary downloads them
# ===================================================================
log.info("Synchronizing after subsequent write...")
workload.churn_rows(128, ps_attached.id)
ps_attached.http_client().secondary_tenant_upload(tenant_id)
ps_secondary.http_client().secondary_tenant_download(tenant_id)
assert list_layers(ps_attached, tenant_id, timeline_id) == list_layers(
ps_secondary, tenant_id, timeline_id
)
# FIXME: this sleep is needed to avoid on-demand promotion of the layers we evict, while
# walreceiver is still doing something.
import time
time.sleep(5)
# Do evictions on attached pageserver, check secondary follows along
# ==================================================================
log.info("Evicting a layer...")
layer_to_evict = list_layers(ps_attached, tenant_id, timeline_id)[0]
ps_attached.http_client().evict_layer(tenant_id, timeline_id, layer_name=layer_to_evict.name)
log.info("Synchronizing after eviction...")
ps_attached.http_client().secondary_tenant_upload(tenant_id)
ps_secondary.http_client().secondary_tenant_download(tenant_id)
assert layer_to_evict not in list_layers(ps_attached, tenant_id, timeline_id)
assert list_layers(ps_attached, tenant_id, timeline_id) == list_layers(
ps_secondary, tenant_id, timeline_id
)
# Scrub the remote storage
# ========================
# This confirms that the scrubber isn't upset by the presence of the heatmap
# TODO: depends on `jcsp/scrubber-index-part` branch.
# Detach secondary and delete tenant
# ===================================
# This confirms that the heatmap gets cleaned up as well as other normal content.
log.info("Detaching secondary location...")
ps_secondary.tenant_location_configure(
tenant_id,
{
"mode": "Detached",
"secondary_conf": None,
"tenant_conf": {},
},
)
log.info("Deleting tenant...")
tenant_delete_wait_completed(ps_attached.http_client(), tenant_id, 10)
assert_prefix_empty(
neon_env_builder,
prefix="/".join(
(
"tenants",
str(tenant_id),
)
),
)