diff --git a/.dockerignore b/.dockerignore index 8b378b5dab..f7a6232ba1 100644 --- a/.dockerignore +++ b/.dockerignore @@ -22,6 +22,7 @@ !s3_scrubber/ !safekeeper/ !storage_broker/ +!storage_controller/ !trace/ !vendor/postgres-*/ !workspace_hack/ diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 2e56bf909f..1eaf05cd54 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -147,15 +147,16 @@ jobs: "neonvm-captest-new" ], "db_size": [ "10gb" ], - "include": [{ "platform": "neon-captest-freetier", "db_size": "3gb" }, - { "platform": "neon-captest-new", "db_size": "50gb" }, - { "platform": "neonvm-captest-freetier", "db_size": "3gb" }, - { "platform": "neonvm-captest-new", "db_size": "50gb" }] + "include": [{ "platform": "neon-captest-freetier", "db_size": "3gb" }, + { "platform": "neon-captest-new", "db_size": "50gb" }, + { "platform": "neonvm-captest-freetier", "db_size": "3gb" }, + { "platform": "neonvm-captest-new", "db_size": "50gb" }, + { "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb" }] }' if [ "$(date +%A)" = "Saturday" ]; then matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "db_size": "10gb"}, - { "platform": "rds-aurora", "db_size": "50gb"}]') + { "platform": "rds-aurora", "db_size": "50gb"}]') fi echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT @@ -171,7 +172,7 @@ jobs: if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres" }, - { "platform": "rds-aurora" }]') + { "platform": "rds-aurora" }]') fi echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT @@ -190,7 +191,7 @@ jobs: if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "scale": "10" }, - { "platform": "rds-aurora", "scale": "10" }]') + { "platform": "rds-aurora", "scale": "10" }]') fi echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT @@ -253,6 +254,9 @@ jobs: neon-captest-reuse) CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }} ;; + neonvm-captest-sharding-reuse) + CONNSTR=${{ secrets.BENCHMARK_CAPTEST_SHARDING_CONNSTR }} + ;; neon-captest-new | neon-captest-freetier | neonvm-captest-new | neonvm-captest-freetier) CONNSTR=${{ steps.create-neon-project.outputs.dsn }} ;; @@ -270,11 +274,15 @@ jobs: echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT - QUERY="SELECT version();" + QUERIES=("SELECT version()") if [[ "${PLATFORM}" = "neon"* ]]; then - QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;" + QUERIES+=("SHOW neon.tenant_id") + QUERIES+=("SHOW neon.timeline_id") fi - psql ${CONNSTR} -c "${QUERY}" + + for q in "${QUERIES[@]}"; do + psql ${CONNSTR} -c "${q}" + done - name: Benchmark init uses: ./.github/actions/run-python-test-set @@ -401,11 +409,15 @@ jobs: echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT - QUERY="SELECT version();" + QUERIES=("SELECT version()") if [[ "${PLATFORM}" = "neon"* ]]; then - QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;" + QUERIES+=("SHOW neon.tenant_id") + QUERIES+=("SHOW neon.timeline_id") fi - psql ${CONNSTR} -c "${QUERY}" + + for q in "${QUERIES[@]}"; do + psql ${CONNSTR} -c "${q}" + done - name: ClickBench benchmark uses: ./.github/actions/run-python-test-set @@ -507,11 +519,15 @@ jobs: echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT - QUERY="SELECT version();" + QUERIES=("SELECT version()") if [[ "${PLATFORM}" = "neon"* ]]; then - QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;" + QUERIES+=("SHOW neon.tenant_id") + QUERIES+=("SHOW neon.timeline_id") fi - psql ${CONNSTR} -c "${QUERY}" + + for q in "${QUERIES[@]}"; do + psql ${CONNSTR} -c "${q}" + done - name: Run TPC-H benchmark uses: ./.github/actions/run-python-test-set @@ -597,11 +613,15 @@ jobs: echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT - QUERY="SELECT version();" + QUERIES=("SELECT version()") if [[ "${PLATFORM}" = "neon"* ]]; then - QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;" + QUERIES+=("SHOW neon.tenant_id") + QUERIES+=("SHOW neon.timeline_id") fi - psql ${CONNSTR} -c "${QUERY}" + + for q in "${QUERIES[@]}"; do + psql ${CONNSTR} -c "${q}" + done - name: Run user examples uses: ./.github/actions/run-python-test-set diff --git a/CODEOWNERS b/CODEOWNERS index 9a23e8c958..af2fa6088e 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -1,5 +1,5 @@ /compute_tools/ @neondatabase/control-plane @neondatabase/compute -/control_plane/attachment_service @neondatabase/storage +/storage_controller @neondatabase/storage /libs/pageserver_api/ @neondatabase/storage /libs/postgres_ffi/ @neondatabase/compute @neondatabase/safekeepers /libs/remote_storage/ @neondatabase/storage diff --git a/Cargo.lock b/Cargo.lock index c1c245fa9c..dae406e4ae 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -270,44 +270,6 @@ dependencies = [ "critical-section", ] -[[package]] -name = "attachment_service" -version = "0.1.0" -dependencies = [ - "anyhow", - "aws-config", - "bytes", - "camino", - "clap", - "control_plane", - "diesel", - "diesel_migrations", - "fail", - "futures", - "git-version", - "hex", - "humantime", - "hyper", - "lasso", - "measured", - "metrics", - "once_cell", - "pageserver_api", - "pageserver_client", - "postgres_connection", - "r2d2", - "reqwest", - "routerify", - "serde", - "serde_json", - "thiserror", - "tokio", - "tokio-util", - "tracing", - "utils", - "workspace_hack", -] - [[package]] name = "autocfg" version = "1.1.0" @@ -2234,9 +2196,9 @@ dependencies = [ [[package]] name = "h2" -version = "0.3.24" +version = "0.3.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bb2c4422095b67ee78da96fbb51a4cc413b3b25883c7717ff7ca1ab31022c9c9" +checksum = "81fe527a889e1532da5c525686d96d4c2e74cdd345badf8dfef9f6b39dd5f5e8" dependencies = [ "bytes", "fnv", @@ -3435,9 +3397,9 @@ dependencies = [ [[package]] name = "ordered-multimap" -version = "0.7.1" +version = "0.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4d6a8c22fc714f0c2373e6091bf6f5e9b37b1bc0b1184874b7e0a4e303d318f" +checksum = "49203cdcae0030493bad186b28da2fa25645fa276a51b6fec8010d281e02ef79" dependencies = [ "dlv-list", "hashbrown 0.14.0", @@ -4199,6 +4161,7 @@ name = "proxy" version = "0.1.0" dependencies = [ "anyhow", + "async-compression", "async-trait", "aws-config", "aws-sdk-iam", @@ -5621,6 +5584,65 @@ dependencies = [ "workspace_hack", ] +[[package]] +name = "storage_controller" +version = "0.1.0" +dependencies = [ + "anyhow", + "aws-config", + "bytes", + "camino", + "clap", + "control_plane", + "diesel", + "diesel_migrations", + "fail", + "futures", + "git-version", + "hex", + "humantime", + "hyper", + "itertools", + "lasso", + "measured", + "metrics", + "once_cell", + "pageserver_api", + "pageserver_client", + "postgres_connection", + "r2d2", + "reqwest", + "routerify", + "serde", + "serde_json", + "thiserror", + "tokio", + "tokio-util", + "tracing", + "utils", + "workspace_hack", +] + +[[package]] +name = "storcon_cli" +version = "0.1.0" +dependencies = [ + "anyhow", + "clap", + "comfy-table", + "hyper", + "pageserver_api", + "pageserver_client", + "reqwest", + "serde", + "serde_json", + "thiserror", + "tokio", + "tracing", + "utils", + "workspace_hack", +] + [[package]] name = "stringprep" version = "0.1.2" @@ -5777,23 +5799,23 @@ dependencies = [ [[package]] name = "test-context" -version = "0.1.4" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "055831a02a4f5aa28fede67f2902014273eb8c21b958ac5ebbd59b71ef30dbc3" +checksum = "6676ab8513edfd2601a108621103fdb45cac9098305ca25ec93f7023b06b05d9" dependencies = [ - "async-trait", "futures", "test-context-macros", ] [[package]] name = "test-context-macros" -version = "0.1.4" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8901a55b0a7a06ebc4a674dcca925170da8e613fa3b163a1df804ed10afb154d" +checksum = "78ea17a2dc368aeca6f554343ced1b1e31f76d63683fa8016e5844bd7a5144a1" dependencies = [ + "proc-macro2", "quote", - "syn 1.0.109", + "syn 2.0.52", ] [[package]] @@ -5934,9 +5956,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.36.0" +version = "1.37.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61285f6515fa018fb2d1e46eb21223fff441ee8db5d0f1435e8ab4f5cdb80931" +checksum = "1adbebffeca75fcfd058afa480fb6c0b81e165a0323f9c9d39c9697e37c46787" dependencies = [ "backtrace", "bytes", diff --git a/Cargo.toml b/Cargo.toml index 309ebbe119..3c6077648e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,7 +3,7 @@ resolver = "2" members = [ "compute_tools", "control_plane", - "control_plane/attachment_service", + "control_plane/storcon_cli", "pageserver", "pageserver/compaction", "pageserver/ctl", @@ -12,6 +12,7 @@ members = [ "proxy", "safekeeper", "storage_broker", + "storage_controller", "s3_scrubber", "workspace_hack", "trace", @@ -158,7 +159,7 @@ svg_fmt = "0.4.1" sync_wrapper = "0.1.2" tar = "0.4" task-local-extensions = "0.1.4" -test-context = "0.1" +test-context = "0.3" thiserror = "1.0" tikv-jemallocator = "0.5" tikv-jemalloc-ctl = "0.5" diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 401feae706..56495dd2da 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -14,9 +14,7 @@ use control_plane::pageserver::{PageServerNode, PAGESERVER_REMOTE_STORAGE_DIR}; use control_plane::safekeeper::SafekeeperNode; use control_plane::storage_controller::StorageController; use control_plane::{broker, local_env}; -use pageserver_api::controller_api::{ - NodeAvailability, NodeConfigureRequest, NodeSchedulingPolicy, PlacementPolicy, -}; +use pageserver_api::controller_api::PlacementPolicy; use pageserver_api::models::{ ShardParameters, TenantCreateRequest, TimelineCreateRequest, TimelineInfo, }; @@ -1060,21 +1058,6 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> } } - Some(("set-state", subcommand_args)) => { - let pageserver = get_pageserver(env, subcommand_args)?; - let scheduling = subcommand_args.get_one("scheduling"); - let availability = subcommand_args.get_one("availability"); - - let storage_controller = StorageController::from_env(env); - storage_controller - .node_configure(NodeConfigureRequest { - node_id: pageserver.conf.id, - scheduling: scheduling.cloned(), - availability: availability.cloned(), - }) - .await?; - } - Some(("status", subcommand_args)) => { match get_pageserver(env, subcommand_args)?.check_status().await { Ok(_) => println!("Page server is up and running"), @@ -1515,12 +1498,6 @@ fn cli() -> Command { .about("Restart local pageserver") .arg(pageserver_config_args.clone()) ) - .subcommand(Command::new("set-state") - .arg(Arg::new("availability").value_parser(value_parser!(NodeAvailability)).long("availability").action(ArgAction::Set).help("Availability state: offline,active")) - .arg(Arg::new("scheduling").value_parser(value_parser!(NodeSchedulingPolicy)).long("scheduling").action(ArgAction::Set).help("Scheduling state: draining,pause,filling,active")) - .about("Set scheduling or availability state of pageserver node") - .arg(pageserver_config_args.clone()) - ) ) .subcommand( Command::new("storage_controller") diff --git a/control_plane/storcon_cli/Cargo.toml b/control_plane/storcon_cli/Cargo.toml new file mode 100644 index 0000000000..61eb7fa4e4 --- /dev/null +++ b/control_plane/storcon_cli/Cargo.toml @@ -0,0 +1,23 @@ +[package] +name = "storcon_cli" +version = "0.1.0" +edition.workspace = true +license.workspace = true + + +[dependencies] +anyhow.workspace = true +clap.workspace = true +comfy-table.workspace = true +hyper.workspace = true +pageserver_api.workspace = true +pageserver_client.workspace = true +reqwest.workspace = true +serde.workspace = true +serde_json = { workspace = true, features = ["raw_value"] } +thiserror.workspace = true +tokio.workspace = true +tracing.workspace = true +utils.workspace = true +workspace_hack.workspace = true + diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs new file mode 100644 index 0000000000..2edd09eac1 --- /dev/null +++ b/control_plane/storcon_cli/src/main.rs @@ -0,0 +1,587 @@ +use std::{collections::HashMap, str::FromStr}; + +use clap::{Parser, Subcommand}; +use hyper::Method; +use pageserver_api::{ + controller_api::{ + NodeAvailabilityWrapper, NodeDescribeResponse, ShardSchedulingPolicy, + TenantDescribeResponse, TenantPolicyRequest, + }, + models::{ + ShardParameters, TenantConfig, TenantConfigRequest, TenantCreateRequest, + TenantShardSplitRequest, TenantShardSplitResponse, + }, + shard::{ShardStripeSize, TenantShardId}, +}; +use pageserver_client::mgmt_api::{self, ResponseErrorMessageExt}; +use reqwest::Url; +use serde::{de::DeserializeOwned, Serialize}; +use utils::id::{NodeId, TenantId}; + +use pageserver_api::controller_api::{ + NodeConfigureRequest, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy, + TenantLocateResponse, TenantShardMigrateRequest, TenantShardMigrateResponse, +}; + +#[derive(Subcommand, Debug)] +enum Command { + /// Register a pageserver with the storage controller. This shouldn't usually be necessary, + /// since pageservers auto-register when they start up + NodeRegister { + #[arg(long)] + node_id: NodeId, + + #[arg(long)] + listen_pg_addr: String, + #[arg(long)] + listen_pg_port: u16, + + #[arg(long)] + listen_http_addr: String, + #[arg(long)] + listen_http_port: u16, + }, + + /// Modify a node's configuration in the storage controller + NodeConfigure { + #[arg(long)] + node_id: NodeId, + + /// Availability is usually auto-detected based on heartbeats. Set 'offline' here to + /// manually mark a node offline + #[arg(long)] + availability: Option, + /// Scheduling policy controls whether tenant shards may be scheduled onto this node. + #[arg(long)] + scheduling: Option, + }, + /// Modify a tenant's policies in the storage controller + TenantPolicy { + #[arg(long)] + tenant_id: TenantId, + /// Placement policy controls whether a tenant is `detached`, has only a secondary location (`secondary`), + /// or is in the normal attached state with N secondary locations (`attached:N`) + #[arg(long)] + placement: Option, + /// Scheduling policy enables pausing the controller's scheduling activity involving this tenant. `active` is normal, + /// `essential` disables optimization scheduling changes, `pause` disables all scheduling changes, and `stop` prevents + /// all reconciliation activity including for scheduling changes already made. `pause` and `stop` can make a tenant + /// unavailable, and are only for use in emergencies. + #[arg(long)] + scheduling: Option, + }, + /// List nodes known to the storage controller + Nodes {}, + /// List tenants known to the storage controller + Tenants {}, + /// Create a new tenant in the storage controller, and by extension on pageservers. + TenantCreate { + #[arg(long)] + tenant_id: TenantId, + }, + /// Delete a tenant in the storage controller, and by extension on pageservers. + TenantDelete { + #[arg(long)] + tenant_id: TenantId, + }, + /// Split an existing tenant into a higher number of shards than its current shard count. + TenantShardSplit { + #[arg(long)] + tenant_id: TenantId, + #[arg(long)] + shard_count: u8, + /// Optional, in 8kiB pages. e.g. set 2048 for 16MB stripes. + #[arg(long)] + stripe_size: Option, + }, + /// Migrate the attached location for a tenant shard to a specific pageserver. + TenantShardMigrate { + #[arg(long)] + tenant_shard_id: TenantShardId, + #[arg(long)] + node: NodeId, + }, + /// Modify the pageserver tenant configuration of a tenant: this is the configuration structure + /// that is passed through to pageservers, and does not affect storage controller behavior. + TenantConfig { + #[arg(long)] + tenant_id: TenantId, + #[arg(long)] + config: String, + }, + /// Attempt to balance the locations for a tenant across pageservers. This is a client-side + /// alternative to the storage controller's scheduling optimization behavior. + TenantScatter { + #[arg(long)] + tenant_id: TenantId, + }, + /// Print details about a particular tenant, including all its shards' states. + TenantDescribe { + #[arg(long)] + tenant_id: TenantId, + }, +} + +#[derive(Parser)] +#[command( + author, + version, + about, + long_about = "CLI for Storage Controller Support/Debug" +)] +#[command(arg_required_else_help(true))] +struct Cli { + #[arg(long)] + /// URL to storage controller. e.g. http://127.0.0.1:1234 when using `neon_local` + api: Url, + + #[arg(long)] + /// JWT token for authenticating with storage controller. Depending on the API used, this + /// should have either `pageserverapi` or `admin` scopes: for convenience, you should mint + /// a token with both scopes to use with this tool. + jwt: Option, + + #[command(subcommand)] + command: Command, +} + +#[derive(Debug, Clone)] +struct PlacementPolicyArg(PlacementPolicy); + +impl FromStr for PlacementPolicyArg { + type Err = anyhow::Error; + + fn from_str(s: &str) -> Result { + match s { + "detached" => Ok(Self(PlacementPolicy::Detached)), + "secondary" => Ok(Self(PlacementPolicy::Secondary)), + _ if s.starts_with("attached:") => { + let mut splitter = s.split(':'); + let _prefix = splitter.next().unwrap(); + match splitter.next().and_then(|s| s.parse::().ok()) { + Some(n) => Ok(Self(PlacementPolicy::Attached(n))), + None => Err(anyhow::anyhow!( + "Invalid format '{s}', a valid example is 'attached:1'" + )), + } + } + _ => Err(anyhow::anyhow!( + "Unknown placement policy '{s}', try detached,secondary,attached:" + )), + } + } +} + +#[derive(Debug, Clone)] +struct ShardSchedulingPolicyArg(ShardSchedulingPolicy); + +impl FromStr for ShardSchedulingPolicyArg { + type Err = anyhow::Error; + + fn from_str(s: &str) -> Result { + match s { + "active" => Ok(Self(ShardSchedulingPolicy::Active)), + "essential" => Ok(Self(ShardSchedulingPolicy::Essential)), + "pause" => Ok(Self(ShardSchedulingPolicy::Pause)), + "stop" => Ok(Self(ShardSchedulingPolicy::Stop)), + _ => Err(anyhow::anyhow!( + "Unknown scheduling policy '{s}', try active,essential,pause,stop" + )), + } + } +} + +#[derive(Debug, Clone)] +struct NodeAvailabilityArg(NodeAvailabilityWrapper); + +impl FromStr for NodeAvailabilityArg { + type Err = anyhow::Error; + + fn from_str(s: &str) -> Result { + match s { + "active" => Ok(Self(NodeAvailabilityWrapper::Active)), + "offline" => Ok(Self(NodeAvailabilityWrapper::Offline)), + _ => Err(anyhow::anyhow!("Unknown availability state '{s}'")), + } + } +} + +struct Client { + base_url: Url, + jwt_token: Option, + client: reqwest::Client, +} + +impl Client { + fn new(base_url: Url, jwt_token: Option) -> Self { + Self { + base_url, + jwt_token, + client: reqwest::ClientBuilder::new() + .build() + .expect("Failed to construct http client"), + } + } + + /// Simple HTTP request wrapper for calling into storage controller + async fn dispatch( + &self, + method: hyper::Method, + path: String, + body: Option, + ) -> mgmt_api::Result + where + RQ: Serialize + Sized, + RS: DeserializeOwned + Sized, + { + // The configured URL has the /upcall path prefix for pageservers to use: we will strip that out + // for general purpose API access. + let url = Url::from_str(&format!( + "http://{}:{}/{path}", + self.base_url.host_str().unwrap(), + self.base_url.port().unwrap() + )) + .unwrap(); + + let mut builder = self.client.request(method, url); + if let Some(body) = body { + builder = builder.json(&body) + } + if let Some(jwt_token) = &self.jwt_token { + builder = builder.header( + reqwest::header::AUTHORIZATION, + format!("Bearer {jwt_token}"), + ); + } + + let response = builder.send().await.map_err(mgmt_api::Error::ReceiveBody)?; + let response = response.error_from_body().await?; + + response + .json() + .await + .map_err(pageserver_client::mgmt_api::Error::ReceiveBody) + } +} + +#[tokio::main] +async fn main() -> anyhow::Result<()> { + let cli = Cli::parse(); + + let storcon_client = Client::new(cli.api.clone(), cli.jwt.clone()); + + let mut trimmed = cli.api.to_string(); + trimmed.pop(); + let vps_client = mgmt_api::Client::new(trimmed, cli.jwt.as_deref()); + + match cli.command { + Command::NodeRegister { + node_id, + listen_pg_addr, + listen_pg_port, + listen_http_addr, + listen_http_port, + } => { + storcon_client + .dispatch::<_, ()>( + Method::POST, + "control/v1/node".to_string(), + Some(NodeRegisterRequest { + node_id, + listen_pg_addr, + listen_pg_port, + listen_http_addr, + listen_http_port, + }), + ) + .await?; + } + Command::TenantCreate { tenant_id } => { + vps_client + .tenant_create(&TenantCreateRequest { + new_tenant_id: TenantShardId::unsharded(tenant_id), + generation: None, + shard_parameters: ShardParameters::default(), + placement_policy: Some(PlacementPolicy::Attached(1)), + config: TenantConfig::default(), + }) + .await?; + } + Command::TenantDelete { tenant_id } => { + let status = vps_client + .tenant_delete(TenantShardId::unsharded(tenant_id)) + .await?; + tracing::info!("Delete status: {}", status); + } + Command::Nodes {} => { + let resp = storcon_client + .dispatch::<(), Vec>( + Method::GET, + "control/v1/node".to_string(), + None, + ) + .await?; + let mut table = comfy_table::Table::new(); + table.set_header(["Id", "Hostname", "Scheduling", "Availability"]); + for node in resp { + table.add_row([ + format!("{}", node.id), + node.listen_http_addr, + format!("{:?}", node.scheduling), + format!("{:?}", node.availability), + ]); + } + println!("{table}"); + } + Command::NodeConfigure { + node_id, + availability, + scheduling, + } => { + let req = NodeConfigureRequest { + node_id, + availability: availability.map(|a| a.0), + scheduling, + }; + storcon_client + .dispatch::<_, ()>( + Method::PUT, + format!("control/v1/node/{node_id}/config"), + Some(req), + ) + .await?; + } + Command::Tenants {} => { + let resp = storcon_client + .dispatch::<(), Vec>( + Method::GET, + "control/v1/tenant".to_string(), + None, + ) + .await?; + let mut table = comfy_table::Table::new(); + table.set_header([ + "TenantId", + "ShardCount", + "StripeSize", + "Placement", + "Scheduling", + ]); + for tenant in resp { + let shard_zero = tenant.shards.into_iter().next().unwrap(); + table.add_row([ + format!("{}", tenant.tenant_id), + format!("{}", shard_zero.tenant_shard_id.shard_count.literal()), + format!("{:?}", tenant.stripe_size), + format!("{:?}", tenant.policy), + format!("{:?}", shard_zero.scheduling_policy), + ]); + } + + println!("{table}"); + } + Command::TenantPolicy { + tenant_id, + placement, + scheduling, + } => { + let req = TenantPolicyRequest { + scheduling: scheduling.map(|s| s.0), + placement: placement.map(|p| p.0), + }; + storcon_client + .dispatch::<_, ()>( + Method::PUT, + format!("control/v1/tenant/{tenant_id}/policy"), + Some(req), + ) + .await?; + } + Command::TenantShardSplit { + tenant_id, + shard_count, + stripe_size, + } => { + let req = TenantShardSplitRequest { + new_shard_count: shard_count, + new_stripe_size: stripe_size.map(ShardStripeSize), + }; + + let response = storcon_client + .dispatch::( + Method::PUT, + format!("control/v1/tenant/{tenant_id}/shard_split"), + Some(req), + ) + .await?; + println!( + "Split tenant {} into {} shards: {}", + tenant_id, + shard_count, + response + .new_shards + .iter() + .map(|s| format!("{:?}", s)) + .collect::>() + .join(",") + ); + } + Command::TenantShardMigrate { + tenant_shard_id, + node, + } => { + let req = TenantShardMigrateRequest { + tenant_shard_id, + node_id: node, + }; + + storcon_client + .dispatch::( + Method::PUT, + format!("control/v1/tenant/{tenant_shard_id}/migrate"), + Some(req), + ) + .await?; + } + Command::TenantConfig { tenant_id, config } => { + let tenant_conf = serde_json::from_str(&config)?; + + vps_client + .tenant_config(&TenantConfigRequest { + tenant_id, + config: tenant_conf, + }) + .await?; + } + Command::TenantScatter { tenant_id } => { + // Find the shards + let locate_response = storcon_client + .dispatch::<(), TenantLocateResponse>( + Method::GET, + format!("control/v1/tenant/{tenant_id}/locate"), + None, + ) + .await?; + let shards = locate_response.shards; + + let mut node_to_shards: HashMap> = HashMap::new(); + let shard_count = shards.len(); + for s in shards { + let entry = node_to_shards.entry(s.node_id).or_default(); + entry.push(s.shard_id); + } + + // Load list of available nodes + let nodes_resp = storcon_client + .dispatch::<(), Vec>( + Method::GET, + "control/v1/node".to_string(), + None, + ) + .await?; + + for node in nodes_resp { + if matches!(node.availability, NodeAvailabilityWrapper::Active) { + node_to_shards.entry(node.id).or_default(); + } + } + + let max_shard_per_node = shard_count / node_to_shards.len(); + + loop { + let mut migrate_shard = None; + for shards in node_to_shards.values_mut() { + if shards.len() > max_shard_per_node { + // Pick the emptiest + migrate_shard = Some(shards.pop().unwrap()); + } + } + let Some(migrate_shard) = migrate_shard else { + break; + }; + + // Pick the emptiest node to migrate to + let mut destinations = node_to_shards + .iter() + .map(|(k, v)| (k, v.len())) + .collect::>(); + destinations.sort_by_key(|i| i.1); + let (destination_node, destination_count) = *destinations.first().unwrap(); + if destination_count + 1 > max_shard_per_node { + // Even the emptiest destination doesn't have space: we're done + break; + } + let destination_node = *destination_node; + + node_to_shards + .get_mut(&destination_node) + .unwrap() + .push(migrate_shard); + + println!("Migrate {} -> {} ...", migrate_shard, destination_node); + + storcon_client + .dispatch::( + Method::PUT, + format!("control/v1/tenant/{migrate_shard}/migrate"), + Some(TenantShardMigrateRequest { + tenant_shard_id: migrate_shard, + node_id: destination_node, + }), + ) + .await?; + println!("Migrate {} -> {} OK", migrate_shard, destination_node); + } + + // Spread the shards across the nodes + } + Command::TenantDescribe { tenant_id } => { + let describe_response = storcon_client + .dispatch::<(), TenantDescribeResponse>( + Method::GET, + format!("control/v1/tenant/{tenant_id}"), + None, + ) + .await?; + let shards = describe_response.shards; + let mut table = comfy_table::Table::new(); + table.set_header(["Shard", "Attached", "Secondary", "Last error", "status"]); + for shard in shards { + let secondary = shard + .node_secondary + .iter() + .map(|n| format!("{}", n)) + .collect::>() + .join(","); + + let mut status_parts = Vec::new(); + if shard.is_reconciling { + status_parts.push("reconciling"); + } + + if shard.is_pending_compute_notification { + status_parts.push("pending_compute"); + } + + if shard.is_splitting { + status_parts.push("splitting"); + } + let status = status_parts.join(","); + + table.add_row([ + format!("{}", shard.tenant_shard_id), + shard + .node_attached + .map(|n| format!("{}", n)) + .unwrap_or(String::new()), + secondary, + shard.last_error, + status, + ]); + } + println!("{table}"); + } + } + + Ok(()) +} diff --git a/diesel.toml b/diesel.toml index 30ed4444d7..558c54a1e1 100644 --- a/diesel.toml +++ b/diesel.toml @@ -2,8 +2,8 @@ # see https://diesel.rs/guides/configuring-diesel-cli [print_schema] -file = "control_plane/attachment_service/src/schema.rs" +file = "storage_controller/src/schema.rs" custom_type_derives = ["diesel::query_builder::QueryId"] [migrations_directory] -dir = "control_plane/attachment_service/migrations" +dir = "storage_controller/migrations" diff --git a/docs/sourcetree.md b/docs/sourcetree.md index 12fa80349e..3732bfdab2 100644 --- a/docs/sourcetree.md +++ b/docs/sourcetree.md @@ -7,6 +7,11 @@ Below you will find a brief overview of each subdir in the source tree in alphab Neon storage broker, providing messaging between safekeepers and pageservers. [storage_broker.md](./storage_broker.md) +`storage_controller`: + +Neon storage controller, manages a cluster of pageservers and exposes an API that enables +managing a many-sharded tenant as a single entity. + `/control_plane`: Local control plane. diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs index dcf9e38106..1278f17ad2 100644 --- a/libs/pageserver_api/src/controller_api.rs +++ b/libs/pageserver_api/src/controller_api.rs @@ -2,9 +2,9 @@ use std::str::FromStr; /// Request/response types for the storage controller /// API (`/control/v1` prefix). Implemented by the server -/// in [`attachment_service::http`] +/// in [`storage_controller::http`] use serde::{Deserialize, Serialize}; -use utils::id::NodeId; +use utils::id::{NodeId, TenantId}; use crate::{ models::{ShardParameters, TenantConfig}, @@ -68,12 +68,27 @@ pub struct TenantLocateResponse { #[derive(Serialize, Deserialize)] pub struct TenantDescribeResponse { + pub tenant_id: TenantId, pub shards: Vec, pub stripe_size: ShardStripeSize, pub policy: PlacementPolicy, pub config: TenantConfig, } +#[derive(Serialize, Deserialize)] +pub struct NodeDescribeResponse { + pub id: NodeId, + + pub availability: NodeAvailabilityWrapper, + pub scheduling: NodeSchedulingPolicy, + + pub listen_http_addr: String, + pub listen_http_port: u16, + + pub listen_pg_addr: String, + pub listen_pg_port: u16, +} + #[derive(Serialize, Deserialize)] pub struct TenantDescribeResponseShard { pub tenant_shard_id: TenantShardId, @@ -89,6 +104,8 @@ pub struct TenantDescribeResponseShard { pub is_pending_compute_notification: bool, /// A shard split is currently underway pub is_splitting: bool, + + pub scheduling_policy: ShardSchedulingPolicy, } /// Explicitly migrating a particular shard is a low level operation @@ -103,7 +120,7 @@ pub struct TenantShardMigrateRequest { /// Utilisation score indicating how good a candidate a pageserver /// is for scheduling the next tenant. See [`crate::models::PageserverUtilization`]. /// Lower values are better. -#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, PartialOrd, Ord)] +#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, PartialOrd, Ord, Debug)] pub struct UtilizationScore(pub u64); impl UtilizationScore { @@ -112,7 +129,7 @@ impl UtilizationScore { } } -#[derive(Serialize, Clone, Copy)] +#[derive(Serialize, Deserialize, Clone, Copy, Debug)] #[serde(into = "NodeAvailabilityWrapper")] pub enum NodeAvailability { // Normal, happy state @@ -135,7 +152,7 @@ impl Eq for NodeAvailability {} // This wrapper provides serde functionality and it should only be used to // communicate with external callers which don't know or care about the // utilisation score of the pageserver it is targeting. -#[derive(Serialize, Deserialize, Clone)] +#[derive(Serialize, Deserialize, Clone, Copy, Debug)] pub enum NodeAvailabilityWrapper { Active, Offline, @@ -161,21 +178,6 @@ impl From for NodeAvailabilityWrapper { } } -impl FromStr for NodeAvailability { - type Err = anyhow::Error; - - fn from_str(s: &str) -> Result { - match s { - // This is used when parsing node configuration requests from neon-local. - // Assume the worst possible utilisation score - // and let it get updated via the heartbeats. - "active" => Ok(Self::Active(UtilizationScore::worst())), - "offline" => Ok(Self::Offline), - _ => Err(anyhow::anyhow!("Unknown availability state '{s}'")), - } - } -} - #[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)] pub enum ShardSchedulingPolicy { // Normal mode: the tenant's scheduled locations may be updated at will, including @@ -202,7 +204,7 @@ impl Default for ShardSchedulingPolicy { } } -#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)] +#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)] pub enum NodeSchedulingPolicy { Active, Filling, diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index ab2035f19a..e708854be2 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -565,6 +565,16 @@ impl GenericRemoteStorage { #[derive(Debug, Clone, PartialEq, Eq)] pub struct StorageMetadata(HashMap); +impl From<[(&str, &str); N]> for StorageMetadata { + fn from(arr: [(&str, &str); N]) -> Self { + let map: HashMap = arr + .iter() + .map(|(k, v)| (k.to_string(), v.to_string())) + .collect(); + Self(map) + } +} + /// External backup storage configuration, enough for creating a client for that storage. #[derive(Debug, Clone, PartialEq, Eq)] pub struct RemoteStorageConfig { diff --git a/libs/remote_storage/tests/test_real_azure.rs b/libs/remote_storage/tests/test_real_azure.rs index 6adddf52a9..6aa02868e6 100644 --- a/libs/remote_storage/tests/test_real_azure.rs +++ b/libs/remote_storage/tests/test_real_azure.rs @@ -57,7 +57,6 @@ enum MaybeEnabledStorage { Disabled, } -#[async_trait::async_trait] impl AsyncTestContext for MaybeEnabledStorage { async fn setup() -> Self { ensure_logging_ready(); @@ -86,7 +85,6 @@ struct AzureWithTestBlobs { remote_blobs: HashSet, } -#[async_trait::async_trait] impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs { async fn setup() -> Self { ensure_logging_ready(); @@ -148,7 +146,6 @@ struct AzureWithSimpleTestBlobs { remote_blobs: HashSet, } -#[async_trait::async_trait] impl AsyncTestContext for MaybeEnabledStorageWithSimpleTestBlobs { async fn setup() -> Self { ensure_logging_ready(); diff --git a/libs/remote_storage/tests/test_real_s3.rs b/libs/remote_storage/tests/test_real_s3.rs index bc5e40e70f..c5d5216f00 100644 --- a/libs/remote_storage/tests/test_real_s3.rs +++ b/libs/remote_storage/tests/test_real_s3.rs @@ -219,7 +219,6 @@ enum MaybeEnabledStorage { Disabled, } -#[async_trait::async_trait] impl AsyncTestContext for MaybeEnabledStorage { async fn setup() -> Self { ensure_logging_ready(); @@ -248,7 +247,6 @@ struct S3WithTestBlobs { remote_blobs: HashSet, } -#[async_trait::async_trait] impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs { async fn setup() -> Self { ensure_logging_ready(); @@ -310,7 +308,6 @@ struct S3WithSimpleTestBlobs { remote_blobs: HashSet, } -#[async_trait::async_trait] impl AsyncTestContext for MaybeEnabledStorageWithSimpleTestBlobs { async fn setup() -> Self { ensure_logging_ready(); diff --git a/libs/utils/src/seqwait.rs b/libs/utils/src/seqwait.rs index b7301776eb..0544c5be03 100644 --- a/libs/utils/src/seqwait.rs +++ b/libs/utils/src/seqwait.rs @@ -182,6 +182,18 @@ where } } + /// Check if [`Self::wait_for`] or [`Self::wait_for_timeout`] would wait if called with `num`. + pub fn would_wait_for(&self, num: V) -> Result<(), V> { + let internal = self.internal.lock().unwrap(); + let cnt = internal.current.cnt_value(); + drop(internal); + if cnt >= num { + Ok(()) + } else { + Err(cnt) + } + } + /// Register and return a channel that will be notified when a number arrives, /// or None, if it has already arrived. fn queue_for_wait(&self, num: V) -> Result>, SeqWaitError> { diff --git a/pageserver/benches/bench_walredo.rs b/pageserver/benches/bench_walredo.rs index 3efad546a6..ffe607be4b 100644 --- a/pageserver/benches/bench_walredo.rs +++ b/pageserver/benches/bench_walredo.rs @@ -27,25 +27,25 @@ //! //! # Reference Numbers //! -//! 2024-03-20 on i3en.3xlarge +//! 2024-04-04 on i3en.3xlarge //! //! ```text -//! short/1 time: [26.483 µs 26.614 µs 26.767 µs] -//! short/2 time: [32.223 µs 32.465 µs 32.767 µs] -//! short/4 time: [47.203 µs 47.583 µs 47.984 µs] -//! short/8 time: [89.135 µs 89.612 µs 90.139 µs] -//! short/16 time: [190.12 µs 191.52 µs 192.88 µs] -//! short/32 time: [380.96 µs 382.63 µs 384.20 µs] -//! short/64 time: [736.86 µs 741.07 µs 745.03 µs] -//! short/128 time: [1.4106 ms 1.4206 ms 1.4294 ms] -//! medium/1 time: [111.81 µs 112.25 µs 112.79 µs] -//! medium/2 time: [158.26 µs 159.13 µs 160.21 µs] -//! medium/4 time: [334.65 µs 337.14 µs 340.07 µs] -//! medium/8 time: [675.32 µs 679.91 µs 685.25 µs] -//! medium/16 time: [1.2929 ms 1.2996 ms 1.3067 ms] -//! medium/32 time: [2.4295 ms 2.4461 ms 2.4623 ms] -//! medium/64 time: [4.3973 ms 4.4458 ms 4.4875 ms] -//! medium/128 time: [7.5955 ms 7.7847 ms 7.9481 ms] +//! short/1 time: [25.925 µs 26.060 µs 26.209 µs] +//! short/2 time: [31.277 µs 31.483 µs 31.722 µs] +//! short/4 time: [45.496 µs 45.831 µs 46.182 µs] +//! short/8 time: [84.298 µs 84.920 µs 85.566 µs] +//! short/16 time: [185.04 µs 186.41 µs 187.88 µs] +//! short/32 time: [385.01 µs 386.77 µs 388.70 µs] +//! short/64 time: [770.24 µs 773.04 µs 776.04 µs] +//! short/128 time: [1.5017 ms 1.5064 ms 1.5113 ms] +//! medium/1 time: [106.65 µs 107.20 µs 107.85 µs] +//! medium/2 time: [153.28 µs 154.24 µs 155.56 µs] +//! medium/4 time: [325.67 µs 327.01 µs 328.71 µs] +//! medium/8 time: [646.82 µs 650.17 µs 653.91 µs] +//! medium/16 time: [1.2645 ms 1.2701 ms 1.2762 ms] +//! medium/32 time: [2.4409 ms 2.4550 ms 2.4692 ms] +//! medium/64 time: [4.6814 ms 4.7114 ms 4.7408 ms] +//! medium/128 time: [8.7790 ms 8.9037 ms 9.0282 ms] //! ``` use bytes::{Buf, Bytes}; diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index ab55d2b0a3..3c9982ffb8 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -128,12 +128,12 @@ impl Client { pub async fn timeline_info( &self, - tenant_id: TenantId, + tenant_shard_id: TenantShardId, timeline_id: TimelineId, force_await_logical_size: ForceAwaitLogicalSize, ) -> Result { let uri = format!( - "{}/v1/tenant/{tenant_id}/timeline/{timeline_id}", + "{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}", self.mgmt_api_endpoint ); @@ -151,11 +151,11 @@ impl Client { pub async fn keyspace( &self, - tenant_id: TenantId, + tenant_shard_id: TenantShardId, timeline_id: TimelineId, ) -> Result { let uri = format!( - "{}/v1/tenant/{tenant_id}/timeline/{timeline_id}/keyspace", + "{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/keyspace", self.mgmt_api_endpoint ); self.get(&uri) diff --git a/pageserver/pagebench/src/cmd/basebackup.rs b/pageserver/pagebench/src/cmd/basebackup.rs index 55844be041..3ae6d99aa7 100644 --- a/pageserver/pagebench/src/cmd/basebackup.rs +++ b/pageserver/pagebench/src/cmd/basebackup.rs @@ -1,4 +1,5 @@ use anyhow::Context; +use pageserver_api::shard::TenantShardId; use pageserver_client::mgmt_api::ForceAwaitLogicalSize; use pageserver_client::page_service::BasebackupRequest; @@ -95,7 +96,7 @@ async fn main_impl( let timeline = *timeline; let info = mgmt_api_client .timeline_info( - timeline.tenant_id, + TenantShardId::unsharded(timeline.tenant_id), timeline.timeline_id, ForceAwaitLogicalSize::No, ) diff --git a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs index 2838511a77..c3d8e61a2c 100644 --- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs +++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs @@ -4,6 +4,7 @@ use pageserver_api::key::{is_rel_block_key, key_to_rel_block, Key}; use pageserver_api::keyspace::KeySpaceAccum; use pageserver_api::models::PagestreamGetPageRequest; +use pageserver_api::shard::TenantShardId; use tokio_util::sync::CancellationToken; use utils::id::TenantTimelineId; use utils::lsn::Lsn; @@ -173,7 +174,10 @@ async fn main_impl( let timeline = *timeline; async move { let partitioning = mgmt_api_client - .keyspace(timeline.tenant_id, timeline.timeline_id) + .keyspace( + TenantShardId::unsharded(timeline.tenant_id), + timeline.timeline_id, + ) .await?; let lsn = partitioning.at_lsn; let start = Instant::now(); diff --git a/pageserver/pagebench/src/cmd/trigger_initial_size_calculation.rs b/pageserver/pagebench/src/cmd/trigger_initial_size_calculation.rs index 98938d780a..f07beeecfd 100644 --- a/pageserver/pagebench/src/cmd/trigger_initial_size_calculation.rs +++ b/pageserver/pagebench/src/cmd/trigger_initial_size_calculation.rs @@ -1,6 +1,7 @@ use std::sync::Arc; use humantime::Duration; +use pageserver_api::shard::TenantShardId; use tokio::task::JoinSet; use utils::id::TenantTimelineId; @@ -59,7 +60,11 @@ async fn main_impl(args: Args) -> anyhow::Result<()> { let mgmt_api_client = Arc::clone(&mgmt_api_client); js.spawn(async move { let info = mgmt_api_client - .timeline_info(tl.tenant_id, tl.timeline_id, ForceAwaitLogicalSize::Yes) + .timeline_info( + TenantShardId::unsharded(tl.tenant_id), + tl.timeline_id, + ForceAwaitLogicalSize::Yes, + ) .await .unwrap(); @@ -74,7 +79,11 @@ async fn main_impl(args: Args) -> anyhow::Result<()> { while !info.current_logical_size_is_accurate { ticker.tick().await; info = mgmt_api_client - .timeline_info(tl.tenant_id, tl.timeline_id, ForceAwaitLogicalSize::Yes) + .timeline_info( + TenantShardId::unsharded(tl.tenant_id), + tl.timeline_id, + ForceAwaitLogicalSize::Yes, + ) .await .unwrap(); } diff --git a/pageserver/src/control_plane_client.rs b/pageserver/src/control_plane_client.rs index 42c800822b..f0ed46ce23 100644 --- a/pageserver/src/control_plane_client.rs +++ b/pageserver/src/control_plane_client.rs @@ -12,7 +12,7 @@ use pageserver_api::{ use serde::{de::DeserializeOwned, Serialize}; use tokio_util::sync::CancellationToken; use url::Url; -use utils::{backoff, generation::Generation, id::NodeId}; +use utils::{backoff, failpoint_support, generation::Generation, id::NodeId}; use crate::{ config::{NodeMetadata, PageServerConf}, @@ -210,7 +210,10 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient { .collect(), }; - fail::fail_point!("control-plane-client-validate"); + failpoint_support::sleep_millis_async!("control-plane-client-validate-sleep", &self.cancel); + if self.cancel.is_cancelled() { + return Err(RetryForeverError::ShuttingDown); + } let response: ValidateResponse = self.retry_http_forever(&re_attach_path, request).await?; diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index bb477f89c5..2713309824 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -1629,7 +1629,7 @@ components: type: integer format: int64 minimum: 0 - description: The amount of disk space currently utilized by layer files. + description: The amount of disk space currently used. free_space_bytes: type: integer format: int64 diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 759a1b25ee..47d8ae1148 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -993,11 +993,26 @@ async fn tenant_status( check_permission(&request, Some(tenant_shard_id.tenant_id))?; let state = get_state(&request); + // In tests, sometimes we want to query the state of a tenant without auto-activating it if it's currently waiting. + let activate = true; + #[cfg(feature = "testing")] + let activate = parse_query_param(&request, "activate")?.unwrap_or(activate); + let tenant_info = async { let tenant = state .tenant_manager .get_attached_tenant_shard(tenant_shard_id)?; + if activate { + // This is advisory: we prefer to let the tenant activate on-demand when this function is + // called, but it is still valid to return 200 and describe the current state of the tenant + // if it doesn't make it into an active state. + tenant + .wait_to_become_active(ACTIVE_TENANT_TIMEOUT) + .await + .ok(); + } + // Calculate total physical size of all timelines let mut current_physical_size = 0; for timeline in tenant.list_timelines().iter() { diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index 343dec2ca1..ed409d3130 100644 --- a/pageserver/src/import_datadir.rs +++ b/pageserver/src/import_datadir.rs @@ -8,6 +8,7 @@ use anyhow::{bail, ensure, Context, Result}; use bytes::Bytes; use camino::Utf8Path; use futures::StreamExt; +use pageserver_api::key::rel_block_to_key; use tokio::io::{AsyncRead, AsyncReadExt}; use tokio_tar::Archive; use tracing::*; @@ -170,7 +171,10 @@ async fn import_rel( let r = reader.read_exact(&mut buf).await; match r { Ok(_) => { - modification.put_rel_page_image(rel, blknum, Bytes::copy_from_slice(&buf))?; + let key = rel_block_to_key(rel, blknum); + if modification.tline.get_shard_identity().is_key_local(&key) { + modification.put_rel_page_image(rel, blknum, Bytes::copy_from_slice(&buf))?; + } } // TODO: UnexpectedEof is expected diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index cc661194e9..ab9a2e8509 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1483,12 +1483,18 @@ pub(crate) static DELETION_QUEUE: Lazy = Lazy::new(|| { }); pub(crate) struct WalIngestMetrics { + pub(crate) bytes_received: IntCounter, pub(crate) records_received: IntCounter, pub(crate) records_committed: IntCounter, pub(crate) records_filtered: IntCounter, } pub(crate) static WAL_INGEST: Lazy = Lazy::new(|| WalIngestMetrics { + bytes_received: register_int_counter!( + "pageserver_wal_ingest_bytes_received", + "Bytes of WAL ingested from safekeepers", + ) + .unwrap(), records_received: register_int_counter!( "pageserver_wal_ingest_records_received", "Number of WAL records received from safekeepers" diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 3d622f1871..3b9a30ba4c 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -876,7 +876,13 @@ impl PageServerHandler { if lsn <= last_record_lsn { lsn = last_record_lsn; } else { - timeline.wait_lsn(lsn, ctx).await?; + timeline + .wait_lsn( + lsn, + crate::tenant::timeline::WaitLsnWaiter::PageService, + ctx, + ) + .await?; // Since we waited for 'lsn' to arrive, that is now the last // record LSN. (Or close enough for our purposes; the // last-record LSN can advance immediately after we return @@ -888,7 +894,13 @@ impl PageServerHandler { "invalid LSN(0) in request".into(), )); } - timeline.wait_lsn(lsn, ctx).await?; + timeline + .wait_lsn( + lsn, + crate::tenant::timeline::WaitLsnWaiter::PageService, + ctx, + ) + .await?; } if lsn < **latest_gc_cutoff_lsn { @@ -1215,7 +1227,13 @@ impl PageServerHandler { if let Some(lsn) = lsn { // Backup was requested at a particular LSN. Wait for it to arrive. info!("waiting for {}", lsn); - timeline.wait_lsn(lsn, ctx).await?; + timeline + .wait_lsn( + lsn, + crate::tenant::timeline::WaitLsnWaiter::PageService, + ctx, + ) + .await?; timeline .check_lsn_is_in_scope(lsn, &latest_gc_cutoff_lsn) .context("invalid basebackup lsn")?; diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index 69e163effa..0cc5611a12 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -214,13 +214,12 @@ pub enum TaskKind { /// Internally, `Client` hands over requests to the `Connection` object. /// The `Connection` object is responsible for speaking the wire protocol. /// - /// Walreceiver uses its own abstraction called `TaskHandle` to represent the activity of establishing and handling a connection. - /// That abstraction doesn't use `task_mgr`. + /// Walreceiver uses a legacy abstraction called `TaskHandle` to represent the activity of establishing and handling a connection. /// The `WalReceiverManager` task ensures that this `TaskHandle` task does not outlive the `WalReceiverManager` task. /// For the `RequestContext` that we hand to the TaskHandle, we use the [`WalReceiverConnectionHandler`] task kind. /// - /// Once the connection is established, the `TaskHandle` task creates a - /// [`WalReceiverConnectionPoller`] task_mgr task that is responsible for polling + /// Once the connection is established, the `TaskHandle` task spawns a + /// [`WalReceiverConnectionPoller`] task that is responsible for polling /// the `Connection` object. /// A `CancellationToken` created by the `TaskHandle` task ensures /// that the [`WalReceiverConnectionPoller`] task will cancel soon after as the `TaskHandle` is dropped. @@ -230,7 +229,6 @@ pub enum TaskKind { WalReceiverManager, /// The `TaskHandle` task that executes `handle_walreceiver_connection`. - /// Not a `task_mgr` task, but we use this `TaskKind` for its `RequestContext`. /// See the comment on [`WalReceiverManager`]. /// /// [`WalReceiverManager`]: Self::WalReceiverManager diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 0806ef0cf4..17ff033e00 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -12,6 +12,7 @@ //! use anyhow::{bail, Context}; +use arc_swap::ArcSwap; use camino::Utf8Path; use camino::Utf8PathBuf; use enumset::EnumSet; @@ -98,7 +99,7 @@ use std::ops::Bound::Included; use std::sync::atomic::AtomicU64; use std::sync::atomic::Ordering; use std::sync::Arc; -use std::sync::{Mutex, RwLock}; +use std::sync::Mutex; use std::time::{Duration, Instant}; use crate::span; @@ -260,7 +261,7 @@ pub struct Tenant { // We keep TenantConfOpt sturct here to preserve the information // about parameters that are not set. // This is necessary to allow global config updates. - tenant_conf: Arc>, + tenant_conf: Arc>, tenant_shard_id: TenantShardId, @@ -1515,7 +1516,7 @@ impl Tenant { // sizes etc. and that would get confused if the previous page versions // are not in the repository yet. ancestor_timeline - .wait_lsn(*lsn, ctx) + .wait_lsn(*lsn, timeline::WaitLsnWaiter::Tenant, ctx) .await .map_err(|e| match e { e @ (WaitLsnError::Timeout(_) | WaitLsnError::BadState) => { @@ -1606,7 +1607,7 @@ impl Tenant { ); { - let conf = self.tenant_conf.read().unwrap(); + let conf = self.tenant_conf.load(); if !conf.location.may_delete_layers_hint() { info!("Skipping GC in location state {:?}", conf.location); @@ -1633,7 +1634,7 @@ impl Tenant { } { - let conf = self.tenant_conf.read().unwrap(); + let conf = self.tenant_conf.load(); if !conf.location.may_delete_layers_hint() || !conf.location.may_upload_layers_hint() { info!("Skipping compaction in location state {:?}", conf.location); return Ok(()); @@ -1782,7 +1783,7 @@ impl Tenant { async fn shutdown( &self, shutdown_progress: completion::Barrier, - freeze_and_flush: bool, + shutdown_mode: timeline::ShutdownMode, ) -> Result<(), completion::Barrier> { span::debug_assert_current_span_has_tenant_id(); @@ -1829,16 +1830,8 @@ impl Tenant { timelines.values().for_each(|timeline| { let timeline = Arc::clone(timeline); let timeline_id = timeline.timeline_id; - - let span = - tracing::info_span!("timeline_shutdown", %timeline_id, ?freeze_and_flush); - js.spawn(async move { - if freeze_and_flush { - timeline.flush_and_shutdown().instrument(span).await - } else { - timeline.shutdown().instrument(span).await - } - }); + let span = tracing::info_span!("timeline_shutdown", %timeline_id, ?shutdown_mode); + js.spawn(async move { timeline.shutdown(shutdown_mode).instrument(span).await }); }) }; // test_long_timeline_create_then_tenant_delete is leaning on this message @@ -2082,14 +2075,14 @@ impl Tenant { } pub(crate) fn get_attach_mode(&self) -> AttachmentMode { - self.tenant_conf.read().unwrap().location.attach_mode + self.tenant_conf.load().location.attach_mode } /// For API access: generate a LocationConfig equivalent to the one that would be used to /// create a Tenant in the same state. Do not use this in hot paths: it's for relatively /// rare external API calls, like a reconciliation at startup. pub(crate) fn get_location_conf(&self) -> models::LocationConfig { - let conf = self.tenant_conf.read().unwrap(); + let conf = self.tenant_conf.load(); let location_config_mode = match conf.location.attach_mode { AttachmentMode::Single => models::LocationConfigMode::AttachedSingle, @@ -2236,7 +2229,7 @@ where impl Tenant { pub fn tenant_specific_overrides(&self) -> TenantConfOpt { - self.tenant_conf.read().unwrap().tenant_conf.clone() + self.tenant_conf.load().tenant_conf.clone() } pub fn effective_config(&self) -> TenantConf { @@ -2245,84 +2238,84 @@ impl Tenant { } pub fn get_checkpoint_distance(&self) -> u64 { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); + let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); tenant_conf .checkpoint_distance .unwrap_or(self.conf.default_tenant_conf.checkpoint_distance) } pub fn get_checkpoint_timeout(&self) -> Duration { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); + let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); tenant_conf .checkpoint_timeout .unwrap_or(self.conf.default_tenant_conf.checkpoint_timeout) } pub fn get_compaction_target_size(&self) -> u64 { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); + let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); tenant_conf .compaction_target_size .unwrap_or(self.conf.default_tenant_conf.compaction_target_size) } pub fn get_compaction_period(&self) -> Duration { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); + let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); tenant_conf .compaction_period .unwrap_or(self.conf.default_tenant_conf.compaction_period) } pub fn get_compaction_threshold(&self) -> usize { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); + let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); tenant_conf .compaction_threshold .unwrap_or(self.conf.default_tenant_conf.compaction_threshold) } pub fn get_gc_horizon(&self) -> u64 { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); + let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); tenant_conf .gc_horizon .unwrap_or(self.conf.default_tenant_conf.gc_horizon) } pub fn get_gc_period(&self) -> Duration { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); + let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); tenant_conf .gc_period .unwrap_or(self.conf.default_tenant_conf.gc_period) } pub fn get_image_creation_threshold(&self) -> usize { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); + let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); tenant_conf .image_creation_threshold .unwrap_or(self.conf.default_tenant_conf.image_creation_threshold) } pub fn get_pitr_interval(&self) -> Duration { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); + let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); tenant_conf .pitr_interval .unwrap_or(self.conf.default_tenant_conf.pitr_interval) } pub fn get_trace_read_requests(&self) -> bool { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); + let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); tenant_conf .trace_read_requests .unwrap_or(self.conf.default_tenant_conf.trace_read_requests) } pub fn get_min_resident_size_override(&self) -> Option { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); + let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); tenant_conf .min_resident_size_override .or(self.conf.default_tenant_conf.min_resident_size_override) } pub fn get_heatmap_period(&self) -> Option { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); + let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); let heatmap_period = tenant_conf .heatmap_period .unwrap_or(self.conf.default_tenant_conf.heatmap_period); @@ -2334,26 +2327,40 @@ impl Tenant { } pub fn set_new_tenant_config(&self, new_tenant_conf: TenantConfOpt) { - self.tenant_conf.write().unwrap().tenant_conf = new_tenant_conf; - self.tenant_conf_updated(); + // Use read-copy-update in order to avoid overwriting the location config + // state if this races with [`Tenant::set_new_location_config`]. Note that + // this race is not possible if both request types come from the storage + // controller (as they should!) because an exclusive op lock is required + // on the storage controller side. + self.tenant_conf.rcu(|inner| { + Arc::new(AttachedTenantConf { + tenant_conf: new_tenant_conf.clone(), + location: inner.location, + }) + }); + + self.tenant_conf_updated(&new_tenant_conf); // Don't hold self.timelines.lock() during the notifies. // There's no risk of deadlock right now, but there could be if we consolidate // mutexes in struct Timeline in the future. let timelines = self.list_timelines(); for timeline in timelines { - timeline.tenant_conf_updated(); + timeline.tenant_conf_updated(&new_tenant_conf); } } pub(crate) fn set_new_location_config(&self, new_conf: AttachedTenantConf) { - *self.tenant_conf.write().unwrap() = new_conf; - self.tenant_conf_updated(); + let new_tenant_conf = new_conf.tenant_conf.clone(); + + self.tenant_conf.store(Arc::new(new_conf)); + + self.tenant_conf_updated(&new_tenant_conf); // Don't hold self.timelines.lock() during the notifies. // There's no risk of deadlock right now, but there could be if we consolidate // mutexes in struct Timeline in the future. let timelines = self.list_timelines(); for timeline in timelines { - timeline.tenant_conf_updated(); + timeline.tenant_conf_updated(&new_tenant_conf); } } @@ -2367,11 +2374,8 @@ impl Tenant { .unwrap_or(psconf.default_tenant_conf.timeline_get_throttle.clone()) } - pub(crate) fn tenant_conf_updated(&self) { - let conf = { - let guard = self.tenant_conf.read().unwrap(); - Self::get_timeline_get_throttle_config(self.conf, &guard.tenant_conf) - }; + pub(crate) fn tenant_conf_updated(&self, new_conf: &TenantConfOpt) { + let conf = Self::get_timeline_get_throttle_config(self.conf, new_conf); self.timeline_get_throttle.reconfigure(conf) } @@ -2519,7 +2523,7 @@ impl Tenant { Tenant::get_timeline_get_throttle_config(conf, &attached_conf.tenant_conf), &crate::metrics::tenant_throttling::TIMELINE_GET, )), - tenant_conf: Arc::new(RwLock::new(attached_conf)), + tenant_conf: Arc::new(ArcSwap::from_pointee(attached_conf)), } } @@ -3505,7 +3509,7 @@ impl Tenant { } pub(crate) fn get_tenant_conf(&self) -> TenantConfOpt { - self.tenant_conf.read().unwrap().tenant_conf.clone() + self.tenant_conf.load().tenant_conf.clone() } } @@ -3854,6 +3858,7 @@ mod tests { use hex_literal::hex; use pageserver_api::keyspace::KeySpace; use rand::{thread_rng, Rng}; + use tests::timeline::ShutdownMode; static TEST_KEY: Lazy = Lazy::new(|| Key::from_slice(&hex!("010000000033333333444444445500000001"))); @@ -4299,7 +4304,7 @@ mod tests { make_some_layers(tline.as_ref(), Lsn(0x8000), &ctx).await?; // so that all uploads finish & we can call harness.load() below again tenant - .shutdown(Default::default(), true) + .shutdown(Default::default(), ShutdownMode::FreezeAndFlush) .instrument(harness.span()) .await .ok() @@ -4340,7 +4345,7 @@ mod tests { // so that all uploads finish & we can call harness.load() below again tenant - .shutdown(Default::default(), true) + .shutdown(Default::default(), ShutdownMode::FreezeAndFlush) .instrument(harness.span()) .await .ok() @@ -5121,7 +5126,7 @@ mod tests { // Leave the timeline ID in [`Tenant::timelines_creating`] to exclude attempting to create it again let raw_tline = tline.raw_timeline().unwrap(); raw_tline - .shutdown() + .shutdown(super::timeline::ShutdownMode::Hard) .instrument(info_span!("test_shutdown", tenant_id=%raw_tline.tenant_shard_id, shard_id=%raw_tline.tenant_shard_id.shard_slug(), timeline_id=%TIMELINE_ID)) .await; std::mem::forget(tline); diff --git a/pageserver/src/tenant/delete.rs b/pageserver/src/tenant/delete.rs index 7d37873a67..d1881f3897 100644 --- a/pageserver/src/tenant/delete.rs +++ b/pageserver/src/tenant/delete.rs @@ -14,7 +14,10 @@ use crate::{ config::PageServerConf, context::RequestContext, task_mgr::{self, TaskKind}, - tenant::mgr::{TenantSlot, TenantsMapRemoveResult}, + tenant::{ + mgr::{TenantSlot, TenantsMapRemoveResult}, + timeline::ShutdownMode, + }, }; use super::{ @@ -463,7 +466,7 @@ impl DeleteTenantFlow { // tenant.shutdown // Its also bad that we're holding tenants.read here. // TODO relax set_stopping to be idempotent? - if tenant.shutdown(progress, false).await.is_err() { + if tenant.shutdown(progress, ShutdownMode::Hard).await.is_err() { return Err(DeleteTenantError::Other(anyhow::anyhow!( "tenant shutdown is already in progress" ))); diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index e48b9e83bd..b27230db03 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -72,6 +72,10 @@ impl EphemeralFile { self.len } + pub(crate) fn id(&self) -> page_cache::FileId { + self.page_cache_file_id + } + pub(crate) async fn read_blk( &self, blknum: u32, diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index b8ed69052f..4c4cd90c99 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -346,35 +346,6 @@ where } } -#[derive(PartialEq, Eq, Hash, Debug, Clone)] -pub enum InMemoryLayerHandle { - Open { - lsn_floor: Lsn, - end_lsn: Lsn, - }, - Frozen { - idx: usize, - lsn_floor: Lsn, - end_lsn: Lsn, - }, -} - -impl InMemoryLayerHandle { - pub fn get_lsn_floor(&self) -> Lsn { - match self { - InMemoryLayerHandle::Open { lsn_floor, .. } => *lsn_floor, - InMemoryLayerHandle::Frozen { lsn_floor, .. } => *lsn_floor, - } - } - - pub fn get_end_lsn(&self) -> Lsn { - match self { - InMemoryLayerHandle::Open { end_lsn, .. } => *end_lsn, - InMemoryLayerHandle::Frozen { end_lsn, .. } => *end_lsn, - } - } -} - impl LayerMap { /// /// Find the latest layer (by lsn.end) that covers the given @@ -576,41 +547,18 @@ impl LayerMap { self.historic.iter() } - /// Get a handle for the first in memory layer that matches the provided predicate. - /// The handle should be used with [`Self::get_in_memory_layer`] to retrieve the actual layer. - /// - /// Note: [`Self::find_in_memory_layer`] and [`Self::get_in_memory_layer`] should be called during - /// the same exclusive region established by holding the layer manager lock. - pub fn find_in_memory_layer(&self, mut pred: Pred) -> Option + /// Get a ref counted pointer for the first in memory layer that matches the provided predicate. + pub fn find_in_memory_layer(&self, mut pred: Pred) -> Option> where Pred: FnMut(&Arc) -> bool, { if let Some(open) = &self.open_layer { if pred(open) { - return Some(InMemoryLayerHandle::Open { - lsn_floor: open.get_lsn_range().start, - end_lsn: open.get_lsn_range().end, - }); + return Some(open.clone()); } } - let pos = self.frozen_layers.iter().rev().position(pred); - pos.map(|rev_idx| { - let idx = self.frozen_layers.len() - 1 - rev_idx; - InMemoryLayerHandle::Frozen { - idx, - lsn_floor: self.frozen_layers[idx].get_lsn_range().start, - end_lsn: self.frozen_layers[idx].get_lsn_range().end, - } - }) - } - - /// Get the layer pointed to by the provided handle. - pub fn get_in_memory_layer(&self, handle: &InMemoryLayerHandle) -> Option> { - match handle { - InMemoryLayerHandle::Open { .. } => self.open_layer.clone(), - InMemoryLayerHandle::Frozen { idx, .. } => self.frozen_layers.get(*idx).cloned(), - } + self.frozen_layers.iter().rfind(|l| pred(l)).cloned() } /// diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index f01fb9791c..b1b46d487b 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -44,6 +44,7 @@ use crate::tenant::config::{ use crate::tenant::delete::DeleteTenantFlow; use crate::tenant::span::debug_assert_current_span_has_tenant_id; use crate::tenant::storage_layer::inmemory_layer; +use crate::tenant::timeline::ShutdownMode; use crate::tenant::{AttachedTenantConf, SpawnMode, Tenant, TenantState}; use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TEMP_FILE_SUFFIX}; @@ -783,11 +784,9 @@ async fn shutdown_all_tenants0(tenants: &std::sync::RwLock) { shutdown_state.insert(tenant_shard_id, TenantSlot::Attached(t.clone())); join_set.spawn( async move { - let freeze_and_flush = true; - let res = { let (_guard, shutdown_progress) = completion::channel(); - t.shutdown(shutdown_progress, freeze_and_flush).await + t.shutdown(shutdown_progress, ShutdownMode::FreezeAndFlush).await }; if let Err(other_progress) = res { @@ -1107,7 +1106,7 @@ impl TenantManager { }; info!("Shutting down attached tenant"); - match tenant.shutdown(progress, false).await { + match tenant.shutdown(progress, ShutdownMode::Hard).await { Ok(()) => {} Err(barrier) => { info!("Shutdown already in progress, waiting for it to complete"); @@ -1223,7 +1222,7 @@ impl TenantManager { TenantSlot::Attached(tenant) => { let (_guard, progress) = utils::completion::channel(); info!("Shutting down just-spawned tenant, because tenant manager is shut down"); - match tenant.shutdown(progress, false).await { + match tenant.shutdown(progress, ShutdownMode::Hard).await { Ok(()) => { info!("Finished shutting down just-spawned tenant"); } @@ -1273,7 +1272,7 @@ impl TenantManager { }; let (_guard, progress) = utils::completion::channel(); - match tenant.shutdown(progress, false).await { + match tenant.shutdown(progress, ShutdownMode::Hard).await { Ok(()) => { slot_guard.drop_old_value()?; } @@ -1649,7 +1648,14 @@ impl TenantManager { fail::fail_point!("shard-split-lsn-wait", |_| Err(anyhow::anyhow!( "failpoint" ))); - if let Err(e) = timeline.wait_lsn(*target_lsn, ctx).await { + if let Err(e) = timeline + .wait_lsn( + *target_lsn, + crate::tenant::timeline::WaitLsnWaiter::Tenant, + ctx, + ) + .await + { // Failure here might mean shutdown, in any case this part is an optimization // and we shouldn't hold up the split operation. tracing::warn!( @@ -1670,7 +1676,7 @@ impl TenantManager { // Phase 5: Shut down the parent shard, and erase it from disk let (_guard, progress) = completion::channel(); - match parent.shutdown(progress, false).await { + match parent.shutdown(progress, ShutdownMode::Hard).await { Ok(()) => {} Err(other) => { other.wait().await; @@ -2657,11 +2663,11 @@ where let attached_tenant = match slot_guard.get_old_value() { Some(TenantSlot::Attached(tenant)) => { // whenever we remove a tenant from memory, we don't want to flush and wait for upload - let freeze_and_flush = false; + let shutdown_mode = ShutdownMode::Hard; // shutdown is sure to transition tenant to stopping, and wait for all tasks to complete, so // that we can continue safely to cleanup. - match tenant.shutdown(progress, freeze_and_flush).await { + match tenant.shutdown(progress, shutdown_mode).await { Ok(()) => {} Err(_other) => { // if pageserver shutdown or other detach/ignore is already ongoing, we don't want to diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index cbd942d706..3879135f26 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -200,6 +200,7 @@ use utils::backoff::{ use std::collections::{HashMap, VecDeque}; use std::sync::atomic::{AtomicU32, Ordering}; use std::sync::{Arc, Mutex}; +use std::time::Duration; use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath, TimeoutOrCancel}; use std::ops::DerefMut; @@ -207,7 +208,7 @@ use tracing::{debug, error, info, instrument, warn}; use tracing::{info_span, Instrument}; use utils::lsn::Lsn; -use crate::deletion_queue::DeletionQueueClient; +use crate::deletion_queue::{DeletionQueueClient, DeletionQueueError}; use crate::metrics::{ MeasureRemoteOp, RemoteOpFileKind, RemoteOpKind, RemoteTimelineClientMetrics, RemoteTimelineClientMetricsCallTrackSize, REMOTE_ONDEMAND_DOWNLOADED_BYTES, @@ -261,6 +262,10 @@ pub(crate) const INITDB_PRESERVED_PATH: &str = "initdb-preserved.tar.zst"; /// Default buffer size when interfacing with [`tokio::fs::File`]. pub(crate) const BUFFER_SIZE: usize = 32 * 1024; +/// Doing non-essential flushes of deletion queue is subject to this timeout, after +/// which we warn and skip. +const DELETION_QUEUE_FLUSH_TIMEOUT: Duration = Duration::from_secs(10); + pub enum MaybeDeletedIndexPart { IndexPart(IndexPart), Deleted(IndexPart), @@ -588,14 +593,14 @@ impl RemoteTimelineClient { upload_queue: &mut UploadQueueInitialized, metadata: TimelineMetadata, ) { + let disk_consistent_lsn = upload_queue.latest_metadata.disk_consistent_lsn(); + info!( - "scheduling metadata upload with {} files ({} changed)", + "scheduling metadata upload up to consistent LSN {disk_consistent_lsn} with {} files ({} changed)", upload_queue.latest_files.len(), upload_queue.latest_files_changes_since_metadata_upload_scheduled, ); - let disk_consistent_lsn = upload_queue.latest_metadata.disk_consistent_lsn(); - let index_part = IndexPart::new( upload_queue.latest_files.clone(), disk_consistent_lsn, @@ -1050,6 +1055,26 @@ impl RemoteTimelineClient { Ok(()) } + async fn flush_deletion_queue(&self) -> Result<(), DeletionQueueError> { + match tokio::time::timeout( + DELETION_QUEUE_FLUSH_TIMEOUT, + self.deletion_queue_client.flush_immediate(), + ) + .await + { + Ok(result) => result, + Err(_timeout) => { + // Flushing remote deletions is not mandatory: we flush here to make the system easier to test, and + // to ensure that _usually_ objects are really gone after a DELETE is acked. However, in case of deletion + // queue issues (https://github.com/neondatabase/neon/issues/6440), we don't want to wait indefinitely here. + tracing::warn!( + "Timed out waiting for deletion queue flush, acking deletion anyway" + ); + Ok(()) + } + } + } + /// Prerequisites: UploadQueue should be in stopped state and deleted_at should be successfuly set. /// The function deletes layer files one by one, then lists the prefix to see if we leaked something /// deletes leaked files if any and proceeds with deletion of index file at the end. @@ -1099,7 +1124,7 @@ impl RemoteTimelineClient { // Execute all pending deletions, so that when we proceed to do a list_prefixes below, we aren't // taking the burden of listing all the layers that we already know we should delete. - self.deletion_queue_client.flush_immediate().await?; + self.flush_deletion_queue().await?; let cancel = shutdown_token(); @@ -1173,7 +1198,7 @@ impl RemoteTimelineClient { // Timeline deletion is rare and we have probably emitted a reasonably number of objects: wait // for a flush to a persistent deletion list so that we may be sure deletion will occur. - self.deletion_queue_client.flush_immediate().await?; + self.flush_deletion_queue().await?; fail::fail_point!("timeline-delete-after-index-delete", |_| { Err(anyhow::anyhow!( @@ -1569,7 +1594,7 @@ impl RemoteTimelineClient { /// Use [`RemoteTimelineClient::shutdown`] for graceful stop. /// /// In-progress operations will still be running after this function returns. - /// Use `task_mgr::shutdown_tasks(None, Some(self.tenant_id), Some(timeline_id))` + /// Use `task_mgr::shutdown_tasks(Some(TaskKind::RemoteUploadTask), Some(self.tenant_shard_id), Some(timeline_id))` /// to wait for them to complete, after calling this function. pub(crate) fn stop(&self) { // Whichever *task* for this RemoteTimelineClient grabs the mutex first will transition the queue diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index 8782a9f04e..530e1a3244 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -786,6 +786,35 @@ impl<'a> TenantDownloader<'a> { // Existing on-disk layers: just update their access time. if let Some(on_disk) = timeline_state.on_disk_layers.get(&layer.name) { tracing::debug!("Layer {} is already on disk", layer.name); + + if cfg!(debug_assertions) { + // Debug for https://github.com/neondatabase/neon/issues/6966: check that the files we think + // are already present on disk are really there. + let local_path = self + .conf + .timeline_path(tenant_shard_id, &timeline.timeline_id) + .join(layer.name.file_name()); + match tokio::fs::metadata(&local_path).await { + Ok(meta) => { + tracing::debug!( + "Layer {} present at {}, size {}", + layer.name, + local_path, + meta.len(), + ); + } + Err(e) => { + tracing::warn!( + "Layer {} not found at {} ({})", + layer.name, + local_path, + e + ); + debug_assert!(false); + } + } + } + if on_disk.metadata != LayerFileMetadata::from(&layer.metadata) || on_disk.access_time != layer.access_time { diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index f44a92a2d7..9a2b086828 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -25,7 +25,7 @@ use std::cmp::{Ordering, Reverse}; use std::collections::hash_map::Entry; use std::collections::{BinaryHeap, HashMap}; use std::ops::Range; -use std::sync::Mutex; +use std::sync::{Arc, Mutex}; use std::time::{Duration, SystemTime, UNIX_EPOCH}; use tracing::warn; use utils::history_buffer::HistoryBufferWithDropCounter; @@ -41,8 +41,8 @@ pub use layer_desc::{PersistentLayerDesc, PersistentLayerKey}; pub(crate) use layer::{EvictionError, Layer, ResidentLayer}; -use super::layer_map::InMemoryLayerHandle; -use super::timeline::layer_manager::LayerManager; +use self::inmemory_layer::InMemoryLayerFileId; + use super::timeline::GetVectoredError; use super::PageReconstructError; @@ -204,23 +204,30 @@ impl Default for ValuesReconstructState { } } -/// Description of layer to be read - the layer map can turn -/// this description into the actual layer. -#[derive(PartialEq, Eq, Hash, Debug, Clone)] -pub(crate) enum ReadableLayerDesc { - Persistent { - desc: PersistentLayerDesc, - lsn_range: Range, - }, - InMemory { - handle: InMemoryLayerHandle, - lsn_ceil: Lsn, - }, +/// A key that uniquely identifies a layer in a timeline +#[derive(Debug, PartialEq, Eq, Clone, Hash)] +pub(crate) enum LayerId { + PersitentLayerId(PersistentLayerKey), + InMemoryLayerId(InMemoryLayerFileId), } -/// Wraper for 'ReadableLayerDesc' sorted by Lsn +/// Layer wrapper for the read path. Note that it is valid +/// to use these layers even after external operations have +/// been performed on them (compaction, freeze, etc.). #[derive(Debug)] -struct ReadableLayerDescOrdered(ReadableLayerDesc); +pub(crate) enum ReadableLayer { + PersistentLayer(Layer), + InMemoryLayer(Arc), +} + +/// A partial description of a read to be done. +#[derive(Debug, Clone)] +struct ReadDesc { + /// An id used to resolve the readable layer within the fringe + layer_id: LayerId, + /// Lsn range for the read, used for selecting the next read + lsn_range: Range, +} /// Data structure which maintains a fringe of layers for the /// read path. The fringe is the set of layers which intersects @@ -231,41 +238,64 @@ struct ReadableLayerDescOrdered(ReadableLayerDesc); /// a two layer indexing scheme. #[derive(Debug)] pub(crate) struct LayerFringe { - layers_by_lsn: BinaryHeap, - layers: HashMap, + planned_reads_by_lsn: BinaryHeap, + layers: HashMap, +} + +#[derive(Debug)] +struct LayerKeyspace { + layer: ReadableLayer, + target_keyspace: KeySpace, } impl LayerFringe { pub(crate) fn new() -> Self { LayerFringe { - layers_by_lsn: BinaryHeap::new(), + planned_reads_by_lsn: BinaryHeap::new(), layers: HashMap::new(), } } - pub(crate) fn next_layer(&mut self) -> Option<(ReadableLayerDesc, KeySpace)> { - let handle = match self.layers_by_lsn.pop() { - Some(h) => h, + pub(crate) fn next_layer(&mut self) -> Option<(ReadableLayer, KeySpace, Range)> { + let read_desc = match self.planned_reads_by_lsn.pop() { + Some(desc) => desc, None => return None, }; - let removed = self.layers.remove_entry(&handle.0); + let removed = self.layers.remove_entry(&read_desc.layer_id); match removed { - Some((layer, keyspace)) => Some((layer, keyspace)), + Some(( + _, + LayerKeyspace { + layer, + target_keyspace, + }, + )) => Some((layer, target_keyspace, read_desc.lsn_range)), None => unreachable!("fringe internals are always consistent"), } } - pub(crate) fn update(&mut self, layer: ReadableLayerDesc, keyspace: KeySpace) { - let entry = self.layers.entry(layer.clone()); + pub(crate) fn update( + &mut self, + layer: ReadableLayer, + keyspace: KeySpace, + lsn_range: Range, + ) { + let layer_id = layer.id(); + let entry = self.layers.entry(layer_id.clone()); match entry { Entry::Occupied(mut entry) => { - entry.get_mut().merge(&keyspace); + entry.get_mut().target_keyspace.merge(&keyspace); } Entry::Vacant(entry) => { - self.layers_by_lsn - .push(ReadableLayerDescOrdered(entry.key().clone())); - entry.insert(keyspace); + self.planned_reads_by_lsn.push(ReadDesc { + lsn_range, + layer_id: layer_id.clone(), + }); + entry.insert(LayerKeyspace { + layer, + target_keyspace: keyspace, + }); } } } @@ -277,77 +307,55 @@ impl Default for LayerFringe { } } -impl Ord for ReadableLayerDescOrdered { +impl Ord for ReadDesc { fn cmp(&self, other: &Self) -> Ordering { - let ord = self.0.get_lsn_ceil().cmp(&other.0.get_lsn_ceil()); + let ord = self.lsn_range.end.cmp(&other.lsn_range.end); if ord == std::cmp::Ordering::Equal { - self.0 - .get_lsn_floor() - .cmp(&other.0.get_lsn_floor()) - .reverse() + self.lsn_range.start.cmp(&other.lsn_range.start).reverse() } else { ord } } } -impl PartialOrd for ReadableLayerDescOrdered { +impl PartialOrd for ReadDesc { fn partial_cmp(&self, other: &Self) -> Option { Some(self.cmp(other)) } } -impl PartialEq for ReadableLayerDescOrdered { +impl PartialEq for ReadDesc { fn eq(&self, other: &Self) -> bool { - self.0.get_lsn_floor() == other.0.get_lsn_floor() - && self.0.get_lsn_ceil() == other.0.get_lsn_ceil() + self.lsn_range == other.lsn_range } } -impl Eq for ReadableLayerDescOrdered {} +impl Eq for ReadDesc {} -impl ReadableLayerDesc { - pub(crate) fn get_lsn_floor(&self) -> Lsn { +impl ReadableLayer { + pub(crate) fn id(&self) -> LayerId { match self { - ReadableLayerDesc::Persistent { lsn_range, .. } => lsn_range.start, - ReadableLayerDesc::InMemory { handle, .. } => handle.get_lsn_floor(), - } - } - - pub(crate) fn get_lsn_ceil(&self) -> Lsn { - match self { - ReadableLayerDesc::Persistent { lsn_range, .. } => lsn_range.end, - ReadableLayerDesc::InMemory { lsn_ceil, .. } => *lsn_ceil, + Self::PersistentLayer(layer) => LayerId::PersitentLayerId(layer.layer_desc().key()), + Self::InMemoryLayer(layer) => LayerId::InMemoryLayerId(layer.file_id()), } } pub(crate) async fn get_values_reconstruct_data( &self, - layer_manager: &LayerManager, keyspace: KeySpace, + lsn_range: Range, reconstruct_state: &mut ValuesReconstructState, ctx: &RequestContext, ) -> Result<(), GetVectoredError> { match self { - ReadableLayerDesc::Persistent { desc, lsn_range } => { - let layer = layer_manager.get_from_desc(desc); + ReadableLayer::PersistentLayer(layer) => { layer - .get_values_reconstruct_data( - keyspace, - lsn_range.clone(), - reconstruct_state, - ctx, - ) + .get_values_reconstruct_data(keyspace, lsn_range, reconstruct_state, ctx) .await } - ReadableLayerDesc::InMemory { handle, lsn_ceil } => { - let layer = layer_manager - .layer_map() - .get_in_memory_layer(handle) - .unwrap(); - + ReadableLayer::InMemoryLayer(layer) => { layer - .get_values_reconstruct_data(keyspace, *lsn_ceil, reconstruct_state, ctx) + .get_values_reconstruct_data(keyspace, lsn_range.end, reconstruct_state, ctx) .await } } diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 628f12065f..43942ba2db 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -12,7 +12,7 @@ use crate::tenant::ephemeral_file::EphemeralFile; use crate::tenant::storage_layer::ValueReconstructResult; use crate::tenant::timeline::GetVectoredError; use crate::tenant::{PageReconstructError, Timeline}; -use crate::walrecord; +use crate::{page_cache, walrecord}; use anyhow::{anyhow, ensure, Result}; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::InMemoryLayerInfo; @@ -36,10 +36,14 @@ use super::{ ValuesReconstructState, }; +#[derive(Debug, PartialEq, Eq, Clone, Copy, Hash)] +pub(crate) struct InMemoryLayerFileId(page_cache::FileId); + pub struct InMemoryLayer { conf: &'static PageServerConf, tenant_shard_id: TenantShardId, timeline_id: TimelineId, + file_id: InMemoryLayerFileId, /// This layer contains all the changes from 'start_lsn'. The /// start is inclusive. @@ -200,6 +204,10 @@ pub(crate) static GLOBAL_RESOURCES: GlobalResources = GlobalResources { }; impl InMemoryLayer { + pub(crate) fn file_id(&self) -> InMemoryLayerFileId { + self.file_id + } + pub(crate) fn get_timeline_id(&self) -> TimelineId { self.timeline_id } @@ -443,8 +451,10 @@ impl InMemoryLayer { trace!("initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}"); let file = EphemeralFile::create(conf, tenant_shard_id, timeline_id).await?; + let key = InMemoryLayerFileId(file.id()); Ok(InMemoryLayer { + file_id: key, conf, timeline_id, tenant_shard_id, diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index f3565c1fb3..d3c8c5f66c 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -9,6 +9,7 @@ pub mod uninit; mod walreceiver; use anyhow::{anyhow, bail, ensure, Context, Result}; +use arc_swap::ArcSwap; use bytes::Bytes; use camino::Utf8Path; use enumset::EnumSet; @@ -118,11 +119,11 @@ use self::layer_manager::LayerManager; use self::logical_size::LogicalSize; use self::walreceiver::{WalReceiver, WalReceiverConf}; -use super::remote_timeline_client::RemoteTimelineClient; +use super::config::TenantConf; use super::secondary::heatmap::{HeatMapLayer, HeatMapTimeline}; -use super::{config::TenantConf, storage_layer::ReadableLayerDesc}; use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf}; use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe}; +use super::{remote_timeline_client::RemoteTimelineClient, storage_layer::ReadableLayer}; #[derive(Debug, PartialEq, Eq, Clone, Copy)] pub(super) enum FlushLoopState { @@ -183,7 +184,7 @@ pub(crate) struct AuxFilesState { pub struct Timeline { conf: &'static PageServerConf, - tenant_conf: Arc>, + tenant_conf: Arc>, myself: Weak, @@ -281,10 +282,12 @@ pub struct Timeline { pub(super) flush_loop_state: Mutex, /// layer_flush_start_tx can be used to wake up the layer-flushing task. - /// The value is a counter, incremented every time a new flush cycle is requested. - /// The flush cycle counter is sent back on the layer_flush_done channel when - /// the flush finishes. You can use that to wait for the flush to finish. - layer_flush_start_tx: tokio::sync::watch::Sender, + /// - The u64 value is a counter, incremented every time a new flush cycle is requested. + /// The flush cycle counter is sent back on the layer_flush_done channel when + /// the flush finishes. You can use that to wait for the flush to finish. + /// - The LSN is updated to max() of its current value and the latest disk_consistent_lsn + /// read by whoever sends an update + layer_flush_start_tx: tokio::sync::watch::Sender<(u64, Lsn)>, /// to be notified when layer flushing has finished, subscribe to the layer_flush_done channel layer_flush_done_tx: tokio::sync::watch::Sender<(u64, Result<(), FlushLayerError>)>, @@ -612,6 +615,25 @@ pub enum GetVectoredImpl { Vectored, } +pub(crate) enum WaitLsnWaiter<'a> { + Timeline(&'a Timeline), + Tenant, + PageService, +} + +/// Argument to [`Timeline::shutdown`]. +#[derive(Debug, Clone, Copy)] +pub(crate) enum ShutdownMode { + /// Graceful shutdown, may do a lot of I/O as we flush any open layers to disk and then + /// also to remote storage. This method can easily take multiple seconds for a busy timeline. + /// + /// While we are flushing, we continue to accept read I/O for LSNs ingested before + /// the call to [`Timeline::shutdown`]. + FreezeAndFlush, + /// Shut down immediately, without waiting for any open layers to flush. + Hard, +} + /// Public interface functions impl Timeline { /// Get the LSN where this branch was created @@ -1060,7 +1082,8 @@ impl Timeline { pub(crate) async fn wait_lsn( &self, lsn: Lsn, - _ctx: &RequestContext, /* Prepare for use by cancellation */ + who_is_waiting: WaitLsnWaiter<'_>, + ctx: &RequestContext, /* Prepare for use by cancellation */ ) -> Result<(), WaitLsnError> { if self.cancel.is_cancelled() { return Err(WaitLsnError::Shutdown); @@ -1068,20 +1091,28 @@ impl Timeline { return Err(WaitLsnError::BadState); } - // This should never be called from the WAL receiver, because that could lead - // to a deadlock. - debug_assert!( - task_mgr::current_task_kind() != Some(TaskKind::WalReceiverManager), - "wait_lsn cannot be called in WAL receiver" - ); - debug_assert!( - task_mgr::current_task_kind() != Some(TaskKind::WalReceiverConnectionHandler), - "wait_lsn cannot be called in WAL receiver" - ); - debug_assert!( - task_mgr::current_task_kind() != Some(TaskKind::WalReceiverConnectionPoller), - "wait_lsn cannot be called in WAL receiver" - ); + if cfg!(debug_assertions) { + match ctx.task_kind() { + TaskKind::WalReceiverManager + | TaskKind::WalReceiverConnectionHandler + | TaskKind::WalReceiverConnectionPoller => { + let is_myself = match who_is_waiting { + WaitLsnWaiter::Timeline(waiter) => Weak::ptr_eq(&waiter.myself, &self.myself), + WaitLsnWaiter::Tenant | WaitLsnWaiter::PageService => unreachable!("tenant or page_service context are not expected to have task kind {:?}", ctx.task_kind()), + }; + if is_myself { + if let Err(current) = self.last_record_lsn.would_wait_for(lsn) { + // walingest is the only one that can advance last_record_lsn; it should make sure to never reach here + panic!("this timeline's walingest task is calling wait_lsn({lsn}) but we only have last_record_lsn={current}; would deadlock"); + } + } else { + // if another timeline's is waiting for us, there's no deadlock risk because + // our walreceiver task can make progress independent of theirs + } + } + _ => {} + } + } let _timer = crate::metrics::WAIT_LSN_TIME.start_timer(); @@ -1140,8 +1171,8 @@ impl Timeline { /// Flush to disk all data that was written with the put_* functions #[instrument(skip(self), fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%self.timeline_id))] pub(crate) async fn freeze_and_flush(&self) -> anyhow::Result<()> { - self.freeze_inmem_layer(false).await; - self.flush_frozen_layers_and_wait().await + let to_lsn = self.freeze_inmem_layer(false).await; + self.flush_frozen_layers_and_wait(to_lsn).await } /// If there is no writer, and conditions for rolling the latest layer are met, then freeze it. @@ -1161,7 +1192,39 @@ impl Timeline { }; let Some(open_layer) = &layers_guard.layer_map().open_layer else { - // No open layer, no work to do. + // If there is no open layer, we have no layer freezing to do. However, we might need to generate + // some updates to disk_consistent_lsn and remote_consistent_lsn, in case we ingested some WAL regions + // that didn't result in writes to this shard. + + // Must not hold the layers lock while waiting for a flush. + drop(layers_guard); + + let last_record_lsn = self.get_last_record_lsn(); + let disk_consistent_lsn = self.get_disk_consistent_lsn(); + if last_record_lsn > disk_consistent_lsn { + // We have no open layer, but disk_consistent_lsn is behind the last record: this indicates + // we are a sharded tenant and have skipped some WAL + let last_freeze_ts = *self.last_freeze_ts.read().unwrap(); + if last_freeze_ts.elapsed() >= self.get_checkpoint_timeout() { + // This should be somewhat rare, so we log it at INFO level. + // + // We checked for checkpoint timeout so that a shard without any + // data ingested (yet) doesn't write a remote index as soon as it + // sees its LSN advance: we only do this if we've been layer-less + // for some time. + tracing::info!( + "Advancing disk_consistent_lsn past WAL ingest gap {} -> {}", + disk_consistent_lsn, + last_record_lsn + ); + + // The flush loop will update remote consistent LSN as well as disk consistent LSN. + self.flush_frozen_layers_and_wait(last_record_lsn) + .await + .ok(); + } + } + return; }; @@ -1290,83 +1353,119 @@ impl Timeline { self.launch_eviction_task(parent, background_jobs_can_start); } - /// Graceful shutdown, may do a lot of I/O as we flush any open layers to disk and then - /// also to remote storage. This method can easily take multiple seconds for a busy timeline. + /// After this function returns, there are no timeline-scoped tasks are left running. /// - /// While we are flushing, we continue to accept read I/O. - pub(crate) async fn flush_and_shutdown(&self) { + /// The preferred pattern for is: + /// - in any spawned tasks, keep Timeline::guard open + Timeline::cancel / child token + /// - if early shutdown (not just cancellation) of a sub-tree of tasks is required, + /// go the extra mile and keep track of JoinHandles + /// - Keep track of JoinHandles using a passed-down `Arc>>` or similar, + /// instead of spawning directly on a runtime. It is a more composable / testable pattern. + /// + /// For legacy reasons, we still have multiple tasks spawned using + /// `task_mgr::spawn(X, Some(tenant_id), Some(timeline_id))`. + /// We refer to these as "timeline-scoped task_mgr tasks". + /// Some of these tasks are already sensitive to Timeline::cancel while others are + /// not sensitive to Timeline::cancel and instead respect [`task_mgr::shutdown_token`] + /// or [`task_mgr::shutdown_watcher`]. + /// We want to gradually convert the code base away from these. + /// + /// Here is an inventory of timeline-scoped task_mgr tasks that are still sensitive to + /// `task_mgr::shutdown_{token,watcher}` (there are also tenant-scoped and global-scoped + /// ones that aren't mentioned here): + /// - [`TaskKind::TimelineDeletionWorker`] + /// - NB: also used for tenant deletion + /// - [`TaskKind::RemoteUploadTask`]` + /// - [`TaskKind::InitialLogicalSizeCalculation`] + /// - [`TaskKind::DownloadAllRemoteLayers`] (can we get rid of it?) + // Inventory of timeline-scoped task_mgr tasks that use spawn but aren't sensitive: + /// - [`TaskKind::Eviction`] + /// - [`TaskKind::LayerFlushTask`] + /// - [`TaskKind::OndemandLogicalSizeCalculation`] + /// - [`TaskKind::GarbageCollector`] (immediate_gc is timeline-scoped) + pub(crate) async fn shutdown(&self, mode: ShutdownMode) { debug_assert_current_span_has_tenant_and_timeline_id(); - // Stop ingesting data, so that we are not still writing to an InMemoryLayer while - // trying to flush - tracing::debug!("Waiting for WalReceiverManager..."); - task_mgr::shutdown_tasks( - Some(TaskKind::WalReceiverManager), - Some(self.tenant_shard_id), - Some(self.timeline_id), - ) - .await; + let try_freeze_and_flush = match mode { + ShutdownMode::FreezeAndFlush => true, + ShutdownMode::Hard => false, + }; - // Since we have shut down WAL ingest, we should not let anyone start waiting for the LSN to advance + // Regardless of whether we're going to try_freeze_and_flush + // or not, stop ingesting any more data. Walreceiver only provides + // cancellation but no "wait until gone", because it uses the Timeline::gate. + // So, only after the self.gate.close() below will we know for sure that + // no walreceiver tasks are left. + // For `try_freeze_and_flush=true`, this means that we might still be ingesting + // data during the call to `self.freeze_and_flush()` below. + // That's not ideal, but, we don't have the concept of a ChildGuard, + // which is what we'd need to properly model early shutdown of the walreceiver + // task sub-tree before the other Timeline task sub-trees. + let walreceiver = self.walreceiver.lock().unwrap().take(); + tracing::debug!( + is_some = walreceiver.is_some(), + "Waiting for WalReceiverManager..." + ); + if let Some(walreceiver) = walreceiver { + walreceiver.cancel(); + } + // ... and inform any waiters for newer LSNs that there won't be any. self.last_record_lsn.shutdown(); - // now all writers to InMemory layer are gone, do the final flush if requested - match self.freeze_and_flush().await { - Ok(_) => { - // drain the upload queue - if let Some(client) = self.remote_client.as_ref() { - // if we did not wait for completion here, it might be our shutdown process - // didn't wait for remote uploads to complete at all, as new tasks can forever - // be spawned. - // - // what is problematic is the shutting down of RemoteTimelineClient, because - // obviously it does not make sense to stop while we wait for it, but what - // about corner cases like s3 suddenly hanging up? - client.shutdown().await; + if try_freeze_and_flush { + // we shut down walreceiver above, so, we won't add anything more + // to the InMemoryLayer; freeze it and wait for all frozen layers + // to reach the disk & upload queue, then shut the upload queue and + // wait for it to drain. + match self.freeze_and_flush().await { + Ok(_) => { + // drain the upload queue + if let Some(client) = self.remote_client.as_ref() { + // if we did not wait for completion here, it might be our shutdown process + // didn't wait for remote uploads to complete at all, as new tasks can forever + // be spawned. + // + // what is problematic is the shutting down of RemoteTimelineClient, because + // obviously it does not make sense to stop while we wait for it, but what + // about corner cases like s3 suddenly hanging up? + client.shutdown().await; + } + } + Err(e) => { + // Non-fatal. Shutdown is infallible. Failures to flush just mean that + // we have some extra WAL replay to do next time the timeline starts. + warn!("failed to freeze and flush: {e:#}"); } } - Err(e) => { - // Non-fatal. Shutdown is infallible. Failures to flush just mean that - // we have some extra WAL replay to do next time the timeline starts. - warn!("failed to freeze and flush: {e:#}"); - } } - self.shutdown().await; - } - - /// Shut down immediately, without waiting for any open layers to flush to disk. This is a subset of - /// the graceful [`Timeline::flush_and_shutdown`] function. - pub(crate) async fn shutdown(&self) { - debug_assert_current_span_has_tenant_and_timeline_id(); - // Signal any subscribers to our cancellation token to drop out tracing::debug!("Cancelling CancellationToken"); self.cancel.cancel(); - // Page request handlers might be waiting for LSN to advance: they do not respect Timeline::cancel - // while doing so. - self.last_record_lsn.shutdown(); - - // Shut down the layer flush task before the remote client, as one depends on the other - task_mgr::shutdown_tasks( - Some(TaskKind::LayerFlushTask), - Some(self.tenant_shard_id), - Some(self.timeline_id), - ) - .await; - - // Shut down remote timeline client: this gracefully moves its metadata into its Stopping state in - // case our caller wants to use that for a deletion + // Transition the remote_client into a state where it's only useful for timeline deletion. + // (The deletion use case is why we can't just hook up remote_client to Self::cancel).) if let Some(remote_client) = self.remote_client.as_ref() { remote_client.stop(); + // As documented in remote_client.stop()'s doc comment, it's our responsibility + // to shut down the upload queue tasks. + // TODO: fix that, task management should be encapsulated inside remote_client. + task_mgr::shutdown_tasks( + Some(TaskKind::RemoteUploadTask), + Some(self.tenant_shard_id), + Some(self.timeline_id), + ) + .await; } + // TODO: work toward making this a no-op. See this funciton's doc comment for more context. tracing::debug!("Waiting for tasks..."); - task_mgr::shutdown_tasks(None, Some(self.tenant_shard_id), Some(self.timeline_id)).await; - // Finally wait until any gate-holders are complete + // Finally wait until any gate-holders are complete. + // + // TODO: once above shutdown_tasks is a no-op, we can close the gate before calling shutdown_tasks + // and use a TBD variant of shutdown_tasks that asserts that there were no tasks left. self.gate.close().await; self.metrics.shutdown(); @@ -1570,57 +1669,65 @@ const REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE: u64 = 10; // Private functions impl Timeline { pub(crate) fn get_lazy_slru_download(&self) -> bool { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); + let tenant_conf = self.tenant_conf.load(); tenant_conf + .tenant_conf .lazy_slru_download .unwrap_or(self.conf.default_tenant_conf.lazy_slru_download) } fn get_checkpoint_distance(&self) -> u64 { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); + let tenant_conf = self.tenant_conf.load(); tenant_conf + .tenant_conf .checkpoint_distance .unwrap_or(self.conf.default_tenant_conf.checkpoint_distance) } fn get_checkpoint_timeout(&self) -> Duration { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); + let tenant_conf = self.tenant_conf.load(); tenant_conf + .tenant_conf .checkpoint_timeout .unwrap_or(self.conf.default_tenant_conf.checkpoint_timeout) } fn get_compaction_target_size(&self) -> u64 { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); + let tenant_conf = self.tenant_conf.load(); tenant_conf + .tenant_conf .compaction_target_size .unwrap_or(self.conf.default_tenant_conf.compaction_target_size) } fn get_compaction_threshold(&self) -> usize { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); + let tenant_conf = self.tenant_conf.load(); tenant_conf + .tenant_conf .compaction_threshold .unwrap_or(self.conf.default_tenant_conf.compaction_threshold) } fn get_image_creation_threshold(&self) -> usize { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); + let tenant_conf = self.tenant_conf.load(); tenant_conf + .tenant_conf .image_creation_threshold .unwrap_or(self.conf.default_tenant_conf.image_creation_threshold) } fn get_compaction_algorithm(&self) -> CompactionAlgorithm { - let tenant_conf = &self.tenant_conf.read().unwrap().tenant_conf; + let tenant_conf = &self.tenant_conf.load(); tenant_conf + .tenant_conf .compaction_algorithm .unwrap_or(self.conf.default_tenant_conf.compaction_algorithm) } fn get_eviction_policy(&self) -> EvictionPolicy { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); + let tenant_conf = self.tenant_conf.load(); tenant_conf + .tenant_conf .eviction_policy .unwrap_or(self.conf.default_tenant_conf.eviction_policy) } @@ -1635,22 +1742,25 @@ impl Timeline { } fn get_image_layer_creation_check_threshold(&self) -> u8 { - let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone(); - tenant_conf.image_layer_creation_check_threshold.unwrap_or( - self.conf - .default_tenant_conf - .image_layer_creation_check_threshold, - ) + let tenant_conf = self.tenant_conf.load(); + tenant_conf + .tenant_conf + .image_layer_creation_check_threshold + .unwrap_or( + self.conf + .default_tenant_conf + .image_layer_creation_check_threshold, + ) } - pub(super) fn tenant_conf_updated(&self) { + pub(super) fn tenant_conf_updated(&self, new_conf: &TenantConfOpt) { // NB: Most tenant conf options are read by background loops, so, // changes will automatically be picked up. // The threshold is embedded in the metric. So, we need to update it. { let new_threshold = Self::get_evictions_low_residence_duration_metric_threshold( - &self.tenant_conf.read().unwrap().tenant_conf, + new_conf, &self.conf.default_tenant_conf, ); @@ -1677,7 +1787,7 @@ impl Timeline { #[allow(clippy::too_many_arguments)] pub(super) fn new( conf: &'static PageServerConf, - tenant_conf: Arc>, + tenant_conf: Arc>, metadata: &TimelineMetadata, ancestor: Option>, timeline_id: TimelineId, @@ -1693,17 +1803,16 @@ impl Timeline { let disk_consistent_lsn = metadata.disk_consistent_lsn(); let (state, _) = watch::channel(state); - let (layer_flush_start_tx, _) = tokio::sync::watch::channel(0); + let (layer_flush_start_tx, _) = tokio::sync::watch::channel((0, disk_consistent_lsn)); let (layer_flush_done_tx, _) = tokio::sync::watch::channel((0, Ok(()))); - let tenant_conf_guard = tenant_conf.read().unwrap(); - - let evictions_low_residence_duration_metric_threshold = + let evictions_low_residence_duration_metric_threshold = { + let loaded_tenant_conf = tenant_conf.load(); Self::get_evictions_low_residence_duration_metric_threshold( - &tenant_conf_guard.tenant_conf, + &loaded_tenant_conf.tenant_conf, &conf.default_tenant_conf, - ); - drop(tenant_conf_guard); + ) + }; Arc::new_cyclic(|myself| { let mut result = Timeline { @@ -1886,20 +1995,19 @@ impl Timeline { self.timeline_id, self.tenant_shard_id ); - let tenant_conf_guard = self.tenant_conf.read().unwrap(); - let wal_connect_timeout = tenant_conf_guard + let tenant_conf = self.tenant_conf.load(); + let wal_connect_timeout = tenant_conf .tenant_conf .walreceiver_connect_timeout .unwrap_or(self.conf.default_tenant_conf.walreceiver_connect_timeout); - let lagging_wal_timeout = tenant_conf_guard + let lagging_wal_timeout = tenant_conf .tenant_conf .lagging_wal_timeout .unwrap_or(self.conf.default_tenant_conf.lagging_wal_timeout); - let max_lsn_wal_lag = tenant_conf_guard + let max_lsn_wal_lag = tenant_conf .tenant_conf .max_lsn_wal_lag .unwrap_or(self.conf.default_tenant_conf.max_lsn_wal_lag); - drop(tenant_conf_guard); let mut guard = self.walreceiver.lock().unwrap(); assert!( @@ -2447,10 +2555,6 @@ impl Timeline { debug!("cancelling logical size calculation for timeline shutdown"); calculation.await } - _ = task_mgr::shutdown_watcher() => { - debug!("cancelling logical size calculation for task shutdown"); - calculation.await - } } } @@ -2905,16 +3009,6 @@ impl Timeline { let mut completed_keyspace = KeySpace::default(); - // Hold the layer map whilst visiting the timeline to prevent - // compaction, eviction and flushes from rendering the layers unreadable. - // - // TODO: Do we actually need to do this? In theory holding on - // to [`tenant::storage_layer::Layer`] should be enough. However, - // [`Timeline::get`] also holds the lock during IO, so more investigation - // is needed. - let guard = timeline.layers.read().await; - let layers = guard.layer_map(); - loop { if cancel.is_cancelled() { return Err(GetVectoredError::Cancelled); @@ -2924,6 +3018,9 @@ impl Timeline { unmapped_keyspace.remove_overlapping_with(&keys_done_last_step); completed_keyspace.merge(&keys_done_last_step); + let guard = timeline.layers.read().await; + let layers = guard.layer_map(); + let in_memory_layer = layers.find_in_memory_layer(|l| { let start_lsn = l.get_lsn_range().start; cont_lsn > start_lsn @@ -2931,12 +3028,11 @@ impl Timeline { match in_memory_layer { Some(l) => { + let lsn_range = l.get_lsn_range().start..cont_lsn; fringe.update( - ReadableLayerDesc::InMemory { - handle: l, - lsn_ceil: cont_lsn, - }, + ReadableLayer::InMemoryLayer(l), unmapped_keyspace.clone(), + lsn_range, ); } None => { @@ -2948,30 +3044,43 @@ impl Timeline { .into_iter() .map(|(SearchResult { layer, lsn_floor }, keyspace_accum)| { ( - ReadableLayerDesc::Persistent { - desc: (*layer).clone(), - lsn_range: lsn_floor..cont_lsn, - }, + ReadableLayer::PersistentLayer(guard.get_from_desc(&layer)), keyspace_accum.to_keyspace(), + lsn_floor..cont_lsn, ) }) - .for_each(|(layer, keyspace)| fringe.update(layer, keyspace)); + .for_each(|(layer, keyspace, lsn_range)| { + fringe.update(layer, keyspace, lsn_range) + }); } } } - if let Some((layer_to_read, keyspace_to_read)) = fringe.next_layer() { + // It's safe to drop the layer map lock after planning the next round of reads. + // The fringe keeps readable handles for the layers which are safe to read even + // if layers were compacted or flushed. + // + // The more interesting consideration is: "Why is the read algorithm still correct + // if the layer map changes while it is operating?". Doing a vectored read on a + // timeline boils down to pushing an imaginary lsn boundary downwards for each range + // covered by the read. The layer map tells us how to move the lsn downwards for a + // range at *a particular point in time*. It is fine for the answer to be different + // at two different time points. + drop(guard); + + if let Some((layer_to_read, keyspace_to_read, lsn_range)) = fringe.next_layer() { + let next_cont_lsn = lsn_range.start; layer_to_read .get_values_reconstruct_data( - &guard, keyspace_to_read.clone(), + lsn_range, reconstruct_state, ctx, ) .await?; unmapped_keyspace = keyspace_to_read; - cont_lsn = layer_to_read.get_lsn_floor(); + cont_lsn = next_cont_lsn; } else { break; } @@ -3049,7 +3158,7 @@ impl Timeline { } } ancestor - .wait_lsn(self.ancestor_lsn, ctx) + .wait_lsn(self.ancestor_lsn, WaitLsnWaiter::Timeline(self), ctx) .await .map_err(|e| match e { e @ WaitLsnError::Timeout(_) => GetReadyAncestorError::AncestorLsnTimeout(e), @@ -3099,7 +3208,9 @@ impl Timeline { self.last_record_lsn.advance(new_lsn); } - async fn freeze_inmem_layer(&self, write_lock_held: bool) { + /// Whether there was a layer to freeze or not, return the value of get_last_record_lsn + /// before we attempted the freeze: this guarantees that ingested data is frozen up to this lsn (inclusive). + async fn freeze_inmem_layer(&self, write_lock_held: bool) -> Lsn { // Freeze the current open in-memory layer. It will be written to disk on next // iteration. @@ -3109,7 +3220,9 @@ impl Timeline { Some(self.write_lock.lock().await) }; - self.freeze_inmem_layer_at(self.get_last_record_lsn()).await; + let to_lsn = self.get_last_record_lsn(); + self.freeze_inmem_layer_at(to_lsn).await; + to_lsn } async fn freeze_inmem_layer_at(&self, at: Lsn) { @@ -3122,25 +3235,24 @@ impl Timeline { /// Layer flusher task's main loop. async fn flush_loop( self: &Arc, - mut layer_flush_start_rx: tokio::sync::watch::Receiver, + mut layer_flush_start_rx: tokio::sync::watch::Receiver<(u64, Lsn)>, ctx: &RequestContext, ) { info!("started flush loop"); loop { tokio::select! { _ = self.cancel.cancelled() => { - info!("shutting down layer flush task"); - break; - }, - _ = task_mgr::shutdown_watcher() => { - info!("shutting down layer flush task"); + info!("shutting down layer flush task due to Timeline::cancel"); break; }, _ = layer_flush_start_rx.changed() => {} } - trace!("waking up"); - let flush_counter = *layer_flush_start_rx.borrow(); + let (flush_counter, frozen_to_lsn) = *layer_flush_start_rx.borrow(); + + // The highest LSN to which we flushed in the loop over frozen layers + let mut flushed_to_lsn = Lsn(0); + let result = loop { if self.cancel.is_cancelled() { info!("dropping out of flush loop for timeline shutdown"); @@ -3161,7 +3273,9 @@ impl Timeline { break Ok(()); }; match self.flush_frozen_layer(layer_to_flush, ctx).await { - Ok(()) => {} + Ok(this_layer_to_lsn) => { + flushed_to_lsn = std::cmp::max(flushed_to_lsn, this_layer_to_lsn); + } Err(FlushLayerError::Cancelled) => { info!("dropping out of flush loop for timeline shutdown"); return; @@ -3170,11 +3284,36 @@ impl Timeline { FlushLayerError::Other(_) | FlushLayerError::CreateImageLayersError(_), ) => { error!("could not flush frozen layer: {err:?}"); - break err; + break err.map(|_| ()); } } timer.stop_and_record(); }; + + // Unsharded tenants should never advance their LSN beyond the end of the + // highest layer they write: such gaps between layer data and the frozen LSN + // are only legal on sharded tenants. + debug_assert!( + self.shard_identity.count.count() > 1 + || flushed_to_lsn >= frozen_to_lsn + || !flushed_to_lsn.is_valid() + ); + + if flushed_to_lsn < frozen_to_lsn && self.shard_identity.count.count() > 1 { + // If our layer flushes didn't carry disk_consistent_lsn up to the `to_lsn` advertised + // to us via layer_flush_start_rx, then advance it here. + // + // This path is only taken for tenants with multiple shards: single sharded tenants should + // never encounter a gap in the wal. + let old_disk_consistent_lsn = self.disk_consistent_lsn.load(); + tracing::debug!("Advancing disk_consistent_lsn across layer gap {old_disk_consistent_lsn}->{frozen_to_lsn}"); + if self.set_disk_consistent_lsn(frozen_to_lsn) { + if let Err(e) = self.schedule_uploads(frozen_to_lsn, vec![]) { + tracing::warn!("Failed to schedule metadata upload after updating disk_consistent_lsn: {e}"); + } + } + } + // Notify any listeners that we're done let _ = self .layer_flush_done_tx @@ -3182,7 +3321,13 @@ impl Timeline { } } - async fn flush_frozen_layers_and_wait(&self) -> anyhow::Result<()> { + /// Request the flush loop to write out all frozen layers up to `to_lsn` as Delta L0 files to disk. + /// The caller is responsible for the freezing, e.g., [`Self::freeze_inmem_layer`]. + /// + /// `last_record_lsn` may be higher than the highest LSN of a frozen layer: if this is the case, + /// it means no data will be written between the top of the highest frozen layer and to_lsn, + /// e.g. because this tenant shard has ingested up to to_lsn and not written any data locally for that part of the WAL. + async fn flush_frozen_layers_and_wait(&self, last_record_lsn: Lsn) -> anyhow::Result<()> { let mut rx = self.layer_flush_done_tx.subscribe(); // Increment the flush cycle counter and wake up the flush task. @@ -3196,9 +3341,10 @@ impl Timeline { anyhow::bail!("cannot flush frozen layers when flush_loop is not running, state is {flush_loop_state:?}") } - self.layer_flush_start_tx.send_modify(|counter| { + self.layer_flush_start_tx.send_modify(|(counter, lsn)| { my_flush_request = *counter + 1; *counter = my_flush_request; + *lsn = std::cmp::max(last_record_lsn, *lsn); }); loop { @@ -3235,16 +3381,22 @@ impl Timeline { } fn flush_frozen_layers(&self) { - self.layer_flush_start_tx.send_modify(|val| *val += 1); + self.layer_flush_start_tx.send_modify(|(counter, lsn)| { + *counter += 1; + + *lsn = std::cmp::max(*lsn, Lsn(self.last_freeze_at.load().0 - 1)); + }); } /// Flush one frozen in-memory layer to disk, as a new delta layer. + /// + /// Return value is the last lsn (inclusive) of the layer that was frozen. #[instrument(skip_all, fields(layer=%frozen_layer))] async fn flush_frozen_layer( self: &Arc, frozen_layer: Arc, ctx: &RequestContext, - ) -> Result<(), FlushLayerError> { + ) -> Result { debug_assert_current_span_has_tenant_and_timeline_id(); // As a special case, when we have just imported an image into the repository, @@ -3319,7 +3471,6 @@ impl Timeline { } let disk_consistent_lsn = Lsn(lsn_range.end.0 - 1); - let old_disk_consistent_lsn = self.disk_consistent_lsn.load(); // The new on-disk layers are now in the layer map. We can remove the // in-memory layer from the map now. The flushed layer is stored in @@ -3333,10 +3484,7 @@ impl Timeline { guard.finish_flush_l0_layer(delta_layer_to_add.as_ref(), &frozen_layer, &self.metrics); - if disk_consistent_lsn != old_disk_consistent_lsn { - assert!(disk_consistent_lsn > old_disk_consistent_lsn); - self.disk_consistent_lsn.store(disk_consistent_lsn); - + if self.set_disk_consistent_lsn(disk_consistent_lsn) { // Schedule remote uploads that will reflect our new disk_consistent_lsn self.schedule_uploads(disk_consistent_lsn, layers_to_upload)?; } @@ -3353,7 +3501,22 @@ impl Timeline { // This failpoint is used by another test case `test_pageserver_recovery`. fail_point!("flush-frozen-exit"); - Ok(()) + Ok(Lsn(lsn_range.end.0 - 1)) + } + + /// Return true if the value changed + /// + /// This function must only be used from the layer flush task, and may not be called concurrently. + fn set_disk_consistent_lsn(&self, new_value: Lsn) -> bool { + // We do a simple load/store cycle: that's why this function isn't safe for concurrent use. + let old_value = self.disk_consistent_lsn.load(); + if new_value != old_value { + assert!(new_value >= old_value); + self.disk_consistent_lsn.store(new_value); + true + } else { + false + } } /// Update metadata file @@ -3873,6 +4036,24 @@ impl Timeline { Ok(()) } + /// Schedules the uploads of the given image layers + fn upload_new_image_layers( + self: &Arc, + new_images: impl IntoIterator, + ) -> anyhow::Result<()> { + let Some(remote_client) = &self.remote_client else { + return Ok(()); + }; + for layer in new_images { + remote_client.schedule_layer_file_upload(layer)?; + } + // should any new image layer been created, not uploading index_part will + // result in a mismatch between remote_physical_size and layermap calculated + // size, which will fail some tests, but should not be an issue otherwise. + remote_client.schedule_index_upload_for_file_changes()?; + Ok(()) + } + /// Update information about which layer files need to be retained on /// garbage collection. This is separate from actually performing the GC, /// and is updated more frequently, so that compaction can remove obsolete diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 74b75dabf0..ab001bf10d 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -125,18 +125,8 @@ impl Timeline { ) .await .map_err(anyhow::Error::from)?; - if let Some(remote_client) = &self.remote_client { - for layer in layers { - remote_client.schedule_layer_file_upload(layer)?; - } - } - if let Some(remote_client) = &self.remote_client { - // should any new image layer been created, not uploading index_part will - // result in a mismatch between remote_physical_size and layermap calculated - // size, which will fail some tests, but should not be an issue otherwise. - remote_client.schedule_index_upload_for_file_changes()?; - } + self.upload_new_image_layers(layers)?; } Err(err) => { // no partitioning? This is normal, if the timeline was just created @@ -818,7 +808,10 @@ impl TimelineAdaptor { self.timeline .finish_compact_batch(&self.new_deltas, &self.new_images, &layers_to_delete) .await?; - self.new_images.clear(); + + self.timeline + .upload_new_image_layers(std::mem::take(&mut self.new_images))?; + self.new_deltas.clear(); self.layers_to_delete.clear(); Ok(()) diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index ab0a88c764..af10c1c84b 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -6,7 +6,7 @@ use std::{ use anyhow::Context; use pageserver_api::{models::TimelineState, shard::TenantShardId}; use tokio::sync::OwnedMutexGuard; -use tracing::{debug, error, info, instrument, Instrument}; +use tracing::{error, info, instrument, Instrument}; use utils::{crashsafe, fs_ext, id::TimelineId}; use crate::{ @@ -14,7 +14,6 @@ use crate::{ deletion_queue::DeletionQueueClient, task_mgr::{self, TaskKind}, tenant::{ - debug_assert_current_span_has_tenant_and_timeline_id, metadata::TimelineMetadata, remote_timeline_client::{PersistIndexPartWithDeletedFlagError, RemoteTimelineClient}, CreateTimelineCause, DeleteTimelineError, Tenant, @@ -23,58 +22,6 @@ use crate::{ use super::{Timeline, TimelineResources}; -/// Now that the Timeline is in Stopping state, request all the related tasks to shut down. -async fn stop_tasks(timeline: &Timeline) -> Result<(), DeleteTimelineError> { - debug_assert_current_span_has_tenant_and_timeline_id(); - // Notify any timeline work to drop out of loops/requests - tracing::debug!("Cancelling CancellationToken"); - timeline.cancel.cancel(); - - // Stop the walreceiver first. - debug!("waiting for wal receiver to shutdown"); - let maybe_started_walreceiver = { timeline.walreceiver.lock().unwrap().take() }; - if let Some(walreceiver) = maybe_started_walreceiver { - walreceiver.stop().await; - } - debug!("wal receiver shutdown confirmed"); - - // Shut down the layer flush task before the remote client, as one depends on the other - task_mgr::shutdown_tasks( - Some(TaskKind::LayerFlushTask), - Some(timeline.tenant_shard_id), - Some(timeline.timeline_id), - ) - .await; - - // Prevent new uploads from starting. - if let Some(remote_client) = timeline.remote_client.as_ref() { - remote_client.stop(); - } - - // Stop & wait for the remaining timeline tasks, including upload tasks. - // NB: This and other delete_timeline calls do not run as a task_mgr task, - // so, they are not affected by this shutdown_tasks() call. - info!("waiting for timeline tasks to shutdown"); - task_mgr::shutdown_tasks( - None, - Some(timeline.tenant_shard_id), - Some(timeline.timeline_id), - ) - .await; - - fail::fail_point!("timeline-delete-before-index-deleted-at", |_| { - Err(anyhow::anyhow!( - "failpoint: timeline-delete-before-index-deleted-at" - ))? - }); - - tracing::debug!("Waiting for gate..."); - timeline.gate.close().await; - tracing::debug!("Shutdown complete"); - - Ok(()) -} - /// Mark timeline as deleted in S3 so we won't pick it up next time /// during attach or pageserver restart. /// See comment in persist_index_part_with_deleted_flag. @@ -268,7 +215,14 @@ impl DeleteTimelineFlow { guard.mark_in_progress()?; - stop_tasks(&timeline).await?; + // Now that the Timeline is in Stopping state, request all the related tasks to shut down. + timeline.shutdown(super::ShutdownMode::Hard).await; + + fail::fail_point!("timeline-delete-before-index-deleted-at", |_| { + Err(anyhow::anyhow!( + "failpoint: timeline-delete-before-index-deleted-at" + ))? + }); set_deleted_in_remote_index(&timeline).await?; diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index ebcd70bd39..522c5b57de 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -67,20 +67,19 @@ impl Timeline { ), false, async move { - let cancel = task_mgr::shutdown_token(); tokio::select! { - _ = cancel.cancelled() => { return Ok(()); } + _ = self_clone.cancel.cancelled() => { return Ok(()); } _ = completion::Barrier::maybe_wait(background_tasks_can_start) => {} }; - self_clone.eviction_task(parent, cancel).await; + self_clone.eviction_task(parent).await; Ok(()) }, ); } #[instrument(skip_all, fields(tenant_id = %self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id))] - async fn eviction_task(self: Arc, tenant: Arc, cancel: CancellationToken) { + async fn eviction_task(self: Arc, tenant: Arc) { use crate::tenant::tasks::random_init_delay; // acquire the gate guard only once within a useful span @@ -95,7 +94,7 @@ impl Timeline { EvictionPolicy::OnlyImitiate(lat) => lat.period, EvictionPolicy::NoEviction => Duration::from_secs(10), }; - if random_init_delay(period, &cancel).await.is_err() { + if random_init_delay(period, &self.cancel).await.is_err() { return; } } @@ -104,13 +103,13 @@ impl Timeline { loop { let policy = self.get_eviction_policy(); let cf = self - .eviction_iteration(&tenant, &policy, &cancel, &guard, &ctx) + .eviction_iteration(&tenant, &policy, &self.cancel, &guard, &ctx) .await; match cf { ControlFlow::Break(()) => break, ControlFlow::Continue(sleep_until) => { - if tokio::time::timeout_at(sleep_until, cancel.cancelled()) + if tokio::time::timeout_at(sleep_until, self.cancel.cancelled()) .await .is_ok() { diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs index d54dc1642c..64edcc5e40 100644 --- a/pageserver/src/tenant/timeline/layer_manager.rs +++ b/pageserver/src/tenant/timeline/layer_manager.rs @@ -120,9 +120,10 @@ impl LayerManager { /// Called from `freeze_inmem_layer`, returns true if successfully frozen. pub(crate) async fn try_freeze_in_memory_layer( &mut self, - Lsn(last_record_lsn): Lsn, + lsn: Lsn, last_freeze_at: &AtomicLsn, ) { + let Lsn(last_record_lsn) = lsn; let end_lsn = Lsn(last_record_lsn + 1); if let Some(open_layer) = &self.layer_map.open_layer { @@ -135,8 +136,11 @@ impl LayerManager { self.layer_map.frozen_layers.push_back(open_layer_rc); self.layer_map.open_layer = None; self.layer_map.next_open_layer_at = Some(end_lsn); - last_freeze_at.store(end_lsn); } + + // Even if there was no layer to freeze, advance last_freeze_at to last_record_lsn+1: this + // accounts for regions in the LSN range where we might have ingested no data due to sharding. + last_freeze_at.store(end_lsn); } /// Add image layers to the layer map, called from `create_image_layers`. diff --git a/pageserver/src/tenant/timeline/walreceiver.rs b/pageserver/src/tenant/timeline/walreceiver.rs index f1b62067f9..a085154a5a 100644 --- a/pageserver/src/tenant/timeline/walreceiver.rs +++ b/pageserver/src/tenant/timeline/walreceiver.rs @@ -24,13 +24,12 @@ mod connection_manager; mod walreceiver_connection; use crate::context::{DownloadBehavior, RequestContext}; -use crate::task_mgr::{self, TaskKind, WALRECEIVER_RUNTIME}; +use crate::task_mgr::{TaskKind, WALRECEIVER_RUNTIME}; use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id; use crate::tenant::timeline::walreceiver::connection_manager::{ connection_manager_loop_step, ConnectionManagerState, }; -use pageserver_api::shard::TenantShardId; use std::future::Future; use std::num::NonZeroU64; use std::sync::Arc; @@ -40,8 +39,6 @@ use tokio::sync::watch; use tokio_util::sync::CancellationToken; use tracing::*; -use utils::id::TimelineId; - use self::connection_manager::ConnectionManagerStatus; use super::Timeline; @@ -60,9 +57,10 @@ pub struct WalReceiverConf { } pub struct WalReceiver { - tenant_shard_id: TenantShardId, - timeline_id: TimelineId, manager_status: Arc>>, + /// All task spawned by [`WalReceiver::start`] and its children are sensitive to this token. + /// It's a child token of [`Timeline`] so that timeline shutdown can cancel WalReceiver tasks early for `freeze_and_flush=true`. + cancel: CancellationToken, } impl WalReceiver { @@ -76,23 +74,23 @@ impl WalReceiver { let timeline_id = timeline.timeline_id; let walreceiver_ctx = ctx.detached_child(TaskKind::WalReceiverManager, DownloadBehavior::Error); - let loop_status = Arc::new(std::sync::RwLock::new(None)); let manager_status = Arc::clone(&loop_status); - task_mgr::spawn( - WALRECEIVER_RUNTIME.handle(), - TaskKind::WalReceiverManager, - Some(timeline.tenant_shard_id), - Some(timeline_id), - &format!("walreceiver for timeline {tenant_shard_id}/{timeline_id}"), - false, + let cancel = timeline.cancel.child_token(); + WALRECEIVER_RUNTIME.spawn({ + let cancel = cancel.clone(); async move { debug_assert_current_span_has_tenant_and_timeline_id(); + // acquire timeline gate so we know the task doesn't outlive the Timeline + let Ok(_guard) = timeline.gate.enter() else { + debug!("WAL receiver manager could not enter the gate timeline gate, it's closed already"); + return; + }; debug!("WAL receiver manager started, connecting to broker"); - let cancel = task_mgr::shutdown_token(); let mut connection_manager_state = ConnectionManagerState::new( timeline, conf, + cancel.clone(), ); while !cancel.is_cancelled() { let loop_step_result = connection_manager_loop_step( @@ -112,25 +110,22 @@ impl WalReceiver { } connection_manager_state.shutdown().await; *loop_status.write().unwrap() = None; - Ok(()) + debug!("task exits"); } .instrument(info_span!(parent: None, "wal_connection_manager", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), timeline_id = %timeline_id)) - ); + }); Self { - tenant_shard_id, - timeline_id, manager_status, + cancel, } } - pub async fn stop(self) { - task_mgr::shutdown_tasks( - Some(TaskKind::WalReceiverManager), - Some(self.tenant_shard_id), - Some(self.timeline_id), - ) - .await; + #[instrument(skip_all, level = tracing::Level::DEBUG)] + pub fn cancel(&self) { + debug_assert_current_span_has_tenant_and_timeline_id(); + debug!("cancelling walreceiver tasks"); + self.cancel.cancel(); } pub(crate) fn status(&self) -> Option { @@ -164,14 +159,18 @@ enum TaskStateUpdate { impl TaskHandle { /// Initializes the task, starting it immediately after the creation. + /// + /// The second argument to `task` is a child token of `cancel_parent` ([`CancellationToken::child_token`]). + /// It being a child token enables us to provide a [`Self::shutdown`] method. fn spawn( + cancel_parent: &CancellationToken, task: impl FnOnce(watch::Sender>, CancellationToken) -> Fut + Send + 'static, ) -> Self where Fut: Future> + Send, E: Send + Sync + 'static, { - let cancellation = CancellationToken::new(); + let cancellation = cancel_parent.child_token(); let (events_sender, events_receiver) = watch::channel(TaskStateUpdate::Started); let cancellation_clone = cancellation.clone(); diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs index 030d24a017..dae31934ad 100644 --- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs +++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs @@ -280,6 +280,8 @@ pub(super) struct ConnectionManagerState { id: TenantTimelineId, /// Use pageserver data about the timeline to filter out some of the safekeepers. timeline: Arc, + /// Child token of [`super::WalReceiver::cancel`], inherited to all tasks we spawn. + cancel: CancellationToken, conf: WalReceiverConf, /// Current connection to safekeeper for WAL streaming. wal_connection: Option, @@ -402,7 +404,11 @@ struct BrokerSkTimeline { } impl ConnectionManagerState { - pub(super) fn new(timeline: Arc, conf: WalReceiverConf) -> Self { + pub(super) fn new( + timeline: Arc, + conf: WalReceiverConf, + cancel: CancellationToken, + ) -> Self { let id = TenantTimelineId { tenant_id: timeline.tenant_shard_id.tenant_id, timeline_id: timeline.timeline_id, @@ -410,6 +416,7 @@ impl ConnectionManagerState { Self { id, timeline, + cancel, conf, wal_connection: None, wal_stream_candidates: HashMap::new(), @@ -417,6 +424,22 @@ impl ConnectionManagerState { } } + fn spawn( + &self, + task: impl FnOnce( + tokio::sync::watch::Sender>, + CancellationToken, + ) -> Fut + + Send + + 'static, + ) -> TaskHandle + where + Fut: std::future::Future> + Send, + { + // TODO: get rid of TaskHandle + super::TaskHandle::spawn(&self.cancel, task) + } + /// Shuts down the current connection (if any) and immediately starts another one with the given connection string. async fn change_connection(&mut self, new_sk: NewWalConnectionCandidate, ctx: &RequestContext) { WALRECEIVER_SWITCHES @@ -435,7 +458,7 @@ impl ConnectionManagerState { ); let span = info_span!("connection", %node_id); - let connection_handle = TaskHandle::spawn(move |events_sender, cancellation| { + let connection_handle = self.spawn(move |events_sender, cancellation| { async move { debug_assert_current_span_has_tenant_and_timeline_id(); @@ -463,6 +486,12 @@ impl ConnectionManagerState { info!("walreceiver connection handling ended: {e}"); Ok(()) } + WalReceiverError::ClosedGate => { + info!( + "walreceiver connection handling ended because of closed gate" + ); + Ok(()) + } WalReceiverError::Other(e) => { // give out an error to have task_mgr give it a really verbose logging if cancellation.is_cancelled() { @@ -1016,7 +1045,7 @@ mod tests { sk_id: connected_sk_id, availability_zone: None, status: connection_status, - connection_task: TaskHandle::spawn(move |sender, _| async move { + connection_task: state.spawn(move |sender, _| async move { sender .send(TaskStateUpdate::Progress(connection_status)) .ok(); @@ -1184,7 +1213,7 @@ mod tests { sk_id: connected_sk_id, availability_zone: None, status: connection_status, - connection_task: TaskHandle::spawn(move |sender, _| async move { + connection_task: state.spawn(move |sender, _| async move { sender .send(TaskStateUpdate::Progress(connection_status)) .ok(); @@ -1251,7 +1280,7 @@ mod tests { sk_id: NodeId(1), availability_zone: None, status: connection_status, - connection_task: TaskHandle::spawn(move |sender, _| async move { + connection_task: state.spawn(move |sender, _| async move { sender .send(TaskStateUpdate::Progress(connection_status)) .ok(); @@ -1315,7 +1344,7 @@ mod tests { sk_id: NodeId(1), availability_zone: None, status: connection_status, - connection_task: TaskHandle::spawn(move |_, _| async move { Ok(()) }), + connection_task: state.spawn(move |_, _| async move { Ok(()) }), discovered_new_wal: Some(NewCommittedWAL { discovered_at: time_over_threshold, lsn: new_lsn, @@ -1371,6 +1400,7 @@ mod tests { timeline_id: TIMELINE_ID, }, timeline, + cancel: CancellationToken::new(), conf: WalReceiverConf { wal_connect_timeout: Duration::from_secs(1), lagging_wal_timeout: Duration::from_secs(1), @@ -1414,7 +1444,7 @@ mod tests { sk_id: connected_sk_id, availability_zone: None, status: connection_status, - connection_task: TaskHandle::spawn(move |sender, _| async move { + connection_task: state.spawn(move |sender, _| async move { sender .send(TaskStateUpdate::Progress(connection_status)) .ok(); diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index 00a9dbd760..3f3419e886 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -27,7 +27,6 @@ use super::TaskStateUpdate; use crate::{ context::RequestContext, metrics::{LIVE_CONNECTIONS_COUNT, WALRECEIVER_STARTED_CONNECTIONS, WAL_INGEST}, - task_mgr, task_mgr::TaskKind, task_mgr::WALRECEIVER_RUNTIME, tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline, WalReceiverInfo}, @@ -37,8 +36,8 @@ use crate::{ use postgres_backend::is_expected_io_error; use postgres_connection::PgConnectionConfig; use postgres_ffi::waldecoder::WalStreamDecoder; -use utils::pageserver_feedback::PageserverFeedback; use utils::{id::NodeId, lsn::Lsn}; +use utils::{pageserver_feedback::PageserverFeedback, sync::gate::GateError}; /// Status of the connection. #[derive(Debug, Clone, Copy)] @@ -68,6 +67,7 @@ pub(super) enum WalReceiverError { SuccessfulCompletion(String), /// Generic error Other(anyhow::Error), + ClosedGate, } impl From for WalReceiverError { @@ -119,6 +119,16 @@ pub(super) async fn handle_walreceiver_connection( ) -> Result<(), WalReceiverError> { debug_assert_current_span_has_tenant_and_timeline_id(); + // prevent timeline shutdown from finishing until we have exited + let _guard = timeline.gate.enter().map_err(|e| match e { + GateError::GateClosed => WalReceiverError::ClosedGate, + })?; + // This function spawns a side-car task (WalReceiverConnectionPoller). + // Get its gate guard now as well. + let poller_guard = timeline.gate.enter().map_err(|e| match e { + GateError::GateClosed => WalReceiverError::ClosedGate, + })?; + WALRECEIVER_STARTED_CONNECTIONS.inc(); // Connect to the database in replication mode. @@ -156,22 +166,19 @@ pub(super) async fn handle_walreceiver_connection( } // The connection object performs the actual communication with the database, - // so spawn it off to run on its own. + // so spawn it off to run on its own. It shouldn't outlive this function, but, + // due to lack of async drop, we can't enforce that. However, we ensure that + // 1. it is sensitive to `cancellation` and + // 2. holds the Timeline gate open so that after timeline shutdown, + // we know this task is gone. let _connection_ctx = ctx.detached_child( TaskKind::WalReceiverConnectionPoller, ctx.download_behavior(), ); let connection_cancellation = cancellation.clone(); - task_mgr::spawn( - WALRECEIVER_RUNTIME.handle(), - TaskKind::WalReceiverConnectionPoller, - Some(timeline.tenant_shard_id), - Some(timeline.timeline_id), - "walreceiver connection", - false, + WALRECEIVER_RUNTIME.spawn( async move { debug_assert_current_span_has_tenant_and_timeline_id(); - select! { connection_result = connection => match connection_result { Ok(()) => debug!("Walreceiver db connection closed"), @@ -182,6 +189,9 @@ pub(super) async fn handle_walreceiver_connection( // with a similar error. }, WalReceiverError::SuccessfulCompletion(_) => {} + WalReceiverError::ClosedGate => { + // doesn't happen at runtime + } WalReceiverError::Other(err) => { warn!("Connection aborted: {err:#}") } @@ -190,7 +200,7 @@ pub(super) async fn handle_walreceiver_connection( }, _ = connection_cancellation.cancelled() => debug!("Connection cancelled"), } - Ok(()) + drop(poller_guard); } // Enrich the log lines emitted by this closure with meaningful context. // TODO: technically, this task outlives the surrounding function, so, the @@ -303,6 +313,7 @@ pub(super) async fn handle_walreceiver_connection( trace!("received XLogData between {startlsn} and {endlsn}"); + WAL_INGEST.bytes_received.inc_by(data.len() as u64); waldecoder.feed_bytes(data); { diff --git a/pageserver/src/utilization.rs b/pageserver/src/utilization.rs index 830c9897ca..5eccf185ac 100644 --- a/pageserver/src/utilization.rs +++ b/pageserver/src/utilization.rs @@ -15,11 +15,23 @@ pub(crate) fn regenerate(tenants_path: &Path) -> anyhow::Result 0 { + statvfs.fragment_size() + } else { + statvfs.block_size() + }; #[cfg_attr(not(target_os = "macos"), allow(clippy::unnecessary_cast))] let free = statvfs.blocks_available() as u64 * blocksz; - let used = crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.get(); + + #[cfg_attr(not(target_os = "macos"), allow(clippy::unnecessary_cast))] + let used = statvfs + .blocks() + // use blocks_free instead of available here to match df in case someone compares + .saturating_sub(statvfs.blocks_free()) as u64 + * blocksz; + let captured_at = std::time::SystemTime::now(); let doc = PageserverUtilization { diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index 0004f4f3c9..ca41a576fd 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -36,11 +36,12 @@ use bytes::{Bytes, BytesMut}; use pageserver_api::key::key_to_rel_block; use pageserver_api::models::WalRedoManagerStatus; use pageserver_api::shard::TenantShardId; -use std::sync::{Arc, RwLock}; +use std::sync::Arc; use std::time::Duration; use std::time::Instant; use tracing::*; use utils::lsn::Lsn; +use utils::sync::heavier_once_cell; /// /// This is the real implementation that uses a Postgres process to @@ -53,7 +54,19 @@ pub struct PostgresRedoManager { tenant_shard_id: TenantShardId, conf: &'static PageServerConf, last_redo_at: std::sync::Mutex>, - redo_process: RwLock>>, + /// The current [`process::WalRedoProcess`] that is used by new redo requests. + /// We use [`heavier_once_cell`] for coalescing the spawning, but the redo + /// requests don't use the [`heavier_once_cell::Guard`] to keep ahold of the + /// their process object; we use [`Arc::clone`] for that. + /// This is primarily because earlier implementations that didn't use [`heavier_once_cell`] + /// had that behavior; it's probably unnecessary. + /// The only merit of it is that if one walredo process encounters an error, + /// it can take it out of rotation (= using [`heavier_once_cell::Guard::take_and_deinit`]. + /// and retry redo, thereby starting the new process, while other redo tasks might + /// still be using the old redo process. But, those other tasks will most likely + /// encounter an error as well, and errors are an unexpected condition anyway. + /// So, probably we could get rid of the `Arc` in the future. + redo_process: heavier_once_cell::OnceCell>, } /// @@ -101,6 +114,7 @@ impl PostgresRedoManager { self.conf.wal_redo_timeout, pg_version, ) + .await }; img = Some(result?); @@ -121,6 +135,7 @@ impl PostgresRedoManager { self.conf.wal_redo_timeout, pg_version, ) + .await } } @@ -134,7 +149,7 @@ impl PostgresRedoManager { chrono::Utc::now().checked_sub_signed(chrono::Duration::from_std(age).ok()?) }) }, - pid: self.redo_process.read().unwrap().as_ref().map(|p| p.id()), + pid: self.redo_process.get().map(|p| p.id()), }) } } @@ -152,7 +167,7 @@ impl PostgresRedoManager { tenant_shard_id, conf, last_redo_at: std::sync::Mutex::default(), - redo_process: RwLock::new(None), + redo_process: heavier_once_cell::OnceCell::default(), } } @@ -164,8 +179,7 @@ impl PostgresRedoManager { if let Some(last_redo_at) = *g { if last_redo_at.elapsed() >= idle_timeout { drop(g); - let mut guard = self.redo_process.write().unwrap(); - *guard = None; + drop(self.redo_process.get().map(|guard| guard.take_and_deinit())); } } } @@ -174,8 +188,11 @@ impl PostgresRedoManager { /// /// Process one request for WAL redo using wal-redo postgres /// + /// # Cancel-Safety + /// + /// Cancellation safe. #[allow(clippy::too_many_arguments)] - fn apply_batch_postgres( + async fn apply_batch_postgres( &self, key: Key, lsn: Lsn, @@ -191,42 +208,31 @@ impl PostgresRedoManager { const MAX_RETRY_ATTEMPTS: u32 = 1; let mut n_attempts = 0u32; loop { - // launch the WAL redo process on first use - let proc: Arc = { - let proc_guard = self.redo_process.read().unwrap(); - match &*proc_guard { - None => { - // "upgrade" to write lock to launch the process - drop(proc_guard); - let mut proc_guard = self.redo_process.write().unwrap(); - match &*proc_guard { - None => { - let start = Instant::now(); - let proc = Arc::new( - process::WalRedoProcess::launch( - self.conf, - self.tenant_shard_id, - pg_version, - ) - .context("launch walredo process")?, - ); - let duration = start.elapsed(); - WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM - .observe(duration.as_secs_f64()); - info!( - duration_ms = duration.as_millis(), - pid = proc.id(), - "launched walredo process" - ); - *proc_guard = Some(Arc::clone(&proc)); - proc - } - Some(proc) => Arc::clone(proc), - } + let proc: Arc = + match self.redo_process.get_or_init_detached().await { + Ok(guard) => Arc::clone(&guard), + Err(permit) => { + // don't hold poison_guard, the launch code can bail + let start = Instant::now(); + let proc = Arc::new( + process::WalRedoProcess::launch( + self.conf, + self.tenant_shard_id, + pg_version, + ) + .context("launch walredo process")?, + ); + let duration = start.elapsed(); + WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.observe(duration.as_secs_f64()); + info!( + duration_ms = duration.as_millis(), + pid = proc.id(), + "launched walredo process" + ); + self.redo_process.set(Arc::clone(&proc), permit); + proc } - Some(proc) => Arc::clone(proc), - } - }; + }; let started_at = std::time::Instant::now(); @@ -272,34 +278,34 @@ impl PostgresRedoManager { n_attempts, e, ); - // Avoid concurrent callers hitting the same issue. - // We can't prevent it from happening because we want to enable parallelism. - { - let mut guard = self.redo_process.write().unwrap(); - match &*guard { - Some(current_field_value) => { - if Arc::ptr_eq(current_field_value, &proc) { - // We're the first to observe an error from `proc`, it's our job to take it out of rotation. - *guard = None; - } - } - None => { - // Another thread was faster to observe the error, and already took the process out of rotation. - } - } - } + // Avoid concurrent callers hitting the same issue by taking `proc` out of the rotation. + // Note that there may be other tasks concurrent with us that also hold `proc`. + // We have to deal with that here. + // Also read the doc comment on field `self.redo_process`. + // // NB: there may still be other concurrent threads using `proc`. // The last one will send SIGKILL when the underlying Arc reaches refcount 0. - // NB: it's important to drop(proc) after drop(guard). Otherwise we'd keep - // holding the lock while waiting for the process to exit. - // NB: the drop impl blocks the current threads with a wait() system call for - // the child process. We dropped the `guard` above so that other threads aren't - // affected. But, it's good that the current thread _does_ block to wait. - // If we instead deferred the waiting into the background / to tokio, it could - // happen that if walredo always fails immediately, we spawn processes faster + // + // NB: the drop impl blocks the dropping thread with a wait() system call for + // the child process. In some ways the blocking is actually good: if we + // deferred the waiting into the background / to tokio if we used `tokio::process`, + // it could happen that if walredo always fails immediately, we spawn processes faster // than we can SIGKILL & `wait` for them to exit. By doing it the way we do here, // we limit this risk of run-away to at most $num_runtimes * $num_executor_threads. // This probably needs revisiting at some later point. + match self.redo_process.get() { + None => (), + Some(guard) => { + if Arc::ptr_eq(&proc, &*guard) { + // We're the first to observe an error from `proc`, it's our job to take it out of rotation. + guard.take_and_deinit(); + } else { + // Another task already spawned another redo process (further up in this method) + // and put it into `redo_process`. Do nothing, our view of the world is behind. + } + } + } + // The last task that does this `drop()` of `proc` will do a blocking `wait()` syscall. drop(proc); } else if n_attempts != 0 { info!(n_attempts, "retried walredo succeeded"); diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index 1bc8a2e87c..2276b4e807 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -495,16 +495,17 @@ retry: static void pageserver_disconnect(shardno_t shard_no) { - if (page_servers[shard_no].conn) - { - /* - * If the connection to any pageserver is lost, we throw away the - * whole prefetch queue, even for other pageservers. It should not - * cause big problems, because connection loss is supposed to be a - * rare event. - */ - prefetch_on_ps_disconnect(); - } + /* + * If the connection to any pageserver is lost, we throw away the + * whole prefetch queue, even for other pageservers. It should not + * cause big problems, because connection loss is supposed to be a + * rare event. + * + * Prefetch state should be reset even if page_servers[shard_no].conn == NULL, + * because prefetch request may be registered before connection is established. + */ + prefetch_on_ps_disconnect(); + pageserver_disconnect_shard(shard_no); } diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index b33cfab2bb..57a16e00ca 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -641,13 +641,12 @@ prefetch_on_ps_disconnect(void) static inline void prefetch_set_unused(uint64 ring_index) { - PrefetchRequest *slot = GetPrfSlot(ring_index); + PrefetchRequest *slot; if (ring_index < MyPState->ring_last) return; /* Should already be unused */ - Assert(MyPState->ring_unused > ring_index); - + slot = GetPrfSlot(ring_index); if (slot->status == PRFS_UNUSED) return; @@ -806,7 +805,8 @@ Retry: { if (*force_lsn > slot->effective_request_lsn) { - prefetch_wait_for(ring_index); + if (!prefetch_wait_for(ring_index)) + goto Retry; prefetch_set_unused(ring_index); entry = NULL; } @@ -821,7 +821,8 @@ Retry: { if (*force_lsn != slot->effective_request_lsn) { - prefetch_wait_for(ring_index); + if (!prefetch_wait_for(ring_index)) + goto Retry; prefetch_set_unused(ring_index); entry = NULL; } @@ -887,7 +888,8 @@ Retry: { case PRFS_REQUESTED: Assert(MyPState->ring_receive == cleanup_index); - prefetch_wait_for(cleanup_index); + if (!prefetch_wait_for(cleanup_index)) + goto Retry; prefetch_set_unused(cleanup_index); break; case PRFS_RECEIVED: @@ -2140,6 +2142,7 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, /* * Try to find prefetched page in the list of received pages. */ + Retry: entry = prfh_lookup(MyPState->prf_hash, (PrefetchRequest *) &buftag); if (entry != NULL) @@ -2161,7 +2164,8 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, */ if (slot->status == PRFS_REQUESTED) { - prefetch_wait_for(slot->my_ring_index); + if (!prefetch_wait_for(slot->my_ring_index)) + goto Retry; } /* drop caches */ prefetch_set_unused(slot->my_ring_index); diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 57a2736d5b..b327890be2 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -10,6 +10,7 @@ testing = [] [dependencies] anyhow.workspace = true +async-compression.workspace = true async-trait.workspace = true aws-config.workspace = true aws-sdk-iam.workspace = true diff --git a/proxy/src/auth/backend/link.rs b/proxy/src/auth/backend/link.rs index 7db76f3d9e..415a4b7d85 100644 --- a/proxy/src/auth/backend/link.rs +++ b/proxy/src/auth/backend/link.rs @@ -102,8 +102,7 @@ pub(super) async fn authenticate( ctx.set_user(db_info.user.into()); ctx.set_project(db_info.aux.clone()); - let cold_start_info = db_info.aux.cold_start_info.clone().unwrap_or_default(); - info!(?cold_start_info, "woken up a compute node"); + info!("woken up a compute node"); // Backwards compatibility. pg_sni_proxy uses "--" in domain names // while direct connections do not. Once we migrate to pg_sni_proxy diff --git a/proxy/src/bin/pg_sni_router.rs b/proxy/src/bin/pg_sni_router.rs index 385f7820cb..c28814b1c8 100644 --- a/proxy/src/bin/pg_sni_router.rs +++ b/proxy/src/bin/pg_sni_router.rs @@ -10,6 +10,7 @@ use itertools::Itertools; use proxy::config::TlsServerEndPoint; use proxy::context::RequestMonitoring; use proxy::proxy::run_until_cancelled; +use proxy::{BranchId, EndpointId, ProjectId}; use rustls::pki_types::PrivateKeyDer; use tokio::net::TcpListener; @@ -269,7 +270,12 @@ async fn handle_client( let client = tokio::net::TcpStream::connect(destination).await?; - let metrics_aux: MetricsAuxInfo = Default::default(); + let metrics_aux: MetricsAuxInfo = MetricsAuxInfo { + endpoint_id: (&EndpointId::from("")).into(), + project_id: (&ProjectId::from("")).into(), + branch_id: (&BranchId::from("")).into(), + cold_start_info: proxy::console::messages::ColdStartInfo::Unknown, + }; // doesn't yet matter as pg-sni-router doesn't report analytics logs ctx.set_success(); diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index 88b847f5f1..56a3ef79cd 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -10,6 +10,7 @@ use proxy::auth; use proxy::auth::backend::MaybeOwned; use proxy::cancellation::CancelMap; use proxy::cancellation::CancellationHandler; +use proxy::config::remote_storage_from_toml; use proxy::config::AuthenticationConfig; use proxy::config::CacheOptions; use proxy::config::HttpConfig; @@ -191,6 +192,19 @@ struct ProxyCliArgs { #[clap(flatten)] parquet_upload: ParquetUploadArgs, + + /// interval for backup metric collection + #[clap(long, default_value = "10m", value_parser = humantime::parse_duration)] + metric_backup_collection_interval: std::time::Duration, + /// remote storage configuration for backup metric collection + /// Encoded as toml (same format as pageservers), eg + /// `{bucket_name='the-bucket',bucket_region='us-east-1',prefix_in_bucket='proxy',endpoint='http://minio:9000'}` + #[clap(long, default_value = "{}")] + metric_backup_collection_remote_storage: String, + /// chunk size for backup metric collection + /// Size of each event is no more than 400 bytes, so 2**22 is about 200MB before the compression. + #[clap(long, default_value = "4194304")] + metric_backup_collection_chunk_size: usize, } #[derive(clap::Args, Clone, Copy, Debug)] @@ -372,12 +386,17 @@ async fn main() -> anyhow::Result<()> { // maintenance tasks. these never return unless there's an error let mut maintenance_tasks = JoinSet::new(); - maintenance_tasks.spawn(proxy::handle_signals(cancellation_token)); + maintenance_tasks.spawn(proxy::handle_signals(cancellation_token.clone())); maintenance_tasks.spawn(http::health_server::task_main(http_listener)); maintenance_tasks.spawn(console::mgmt::task_main(mgmt_listener)); if let Some(metrics_config) = &config.metric_collection { + // TODO: Add gc regardles of the metric collection being enabled. maintenance_tasks.spawn(usage_metrics::task_main(metrics_config)); + client_tasks.spawn(usage_metrics::task_backup( + &metrics_config.backup_metric_collection_config, + cancellation_token, + )); } if let auth::BackendType::Console(api, _) = &config.auth_backend { @@ -434,6 +453,13 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { if args.allow_self_signed_compute { warn!("allowing self-signed compute certificates"); } + let backup_metric_collection_config = config::MetricBackupCollectionConfig { + interval: args.metric_backup_collection_interval, + remote_storage_config: remote_storage_from_toml( + &args.metric_backup_collection_remote_storage, + )?, + chunk_size: args.metric_backup_collection_chunk_size, + }; let metric_collection = match ( &args.metric_collection_endpoint, @@ -442,6 +468,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { (Some(endpoint), Some(interval)) => Some(config::MetricCollectionConfig { endpoint: endpoint.parse()?, interval: humantime::parse_duration(interval)?, + backup_metric_collection_config, }), (None, None) => None, _ => bail!( diff --git a/proxy/src/cache/project_info.rs b/proxy/src/cache/project_info.rs index 5a3660520b..d8a1d261ce 100644 --- a/proxy/src/cache/project_info.rs +++ b/proxy/src/cache/project_info.rs @@ -16,7 +16,7 @@ use crate::{ config::ProjectInfoCacheOptions, console::AuthSecret, intern::{EndpointIdInt, ProjectIdInt, RoleNameInt}, - EndpointId, ProjectId, RoleName, + EndpointId, RoleName, }; use super::{Cache, Cached}; @@ -214,14 +214,11 @@ impl ProjectInfoCacheImpl { } pub fn insert_role_secret( &self, - project_id: &ProjectId, - endpoint_id: &EndpointId, - role_name: &RoleName, + project_id: ProjectIdInt, + endpoint_id: EndpointIdInt, + role_name: RoleNameInt, secret: Option, ) { - let project_id = ProjectIdInt::from(project_id); - let endpoint_id = EndpointIdInt::from(endpoint_id); - let role_name = RoleNameInt::from(role_name); if self.cache.len() >= self.config.size { // If there are too many entries, wait until the next gc cycle. return; @@ -234,12 +231,10 @@ impl ProjectInfoCacheImpl { } pub fn insert_allowed_ips( &self, - project_id: &ProjectId, - endpoint_id: &EndpointId, + project_id: ProjectIdInt, + endpoint_id: EndpointIdInt, allowed_ips: Arc>, ) { - let project_id = ProjectIdInt::from(project_id); - let endpoint_id = EndpointIdInt::from(endpoint_id); if self.cache.len() >= self.config.size { // If there are too many entries, wait until the next gc cycle. return; @@ -358,7 +353,7 @@ impl Cache for ProjectInfoCacheImpl { #[cfg(test)] mod tests { use super::*; - use crate::scram::ServerSecret; + use crate::{scram::ServerSecret, ProjectId}; #[tokio::test] async fn test_project_info_cache_settings() { @@ -369,8 +364,8 @@ mod tests { ttl: Duration::from_secs(1), gc_interval: Duration::from_secs(600), }); - let project_id = "project".into(); - let endpoint_id = "endpoint".into(); + let project_id: ProjectId = "project".into(); + let endpoint_id: EndpointId = "endpoint".into(); let user1: RoleName = "user1".into(); let user2: RoleName = "user2".into(); let secret1 = Some(AuthSecret::Scram(ServerSecret::mock([1; 32]))); @@ -379,9 +374,23 @@ mod tests { "127.0.0.1".parse().unwrap(), "127.0.0.2".parse().unwrap(), ]); - cache.insert_role_secret(&project_id, &endpoint_id, &user1, secret1.clone()); - cache.insert_role_secret(&project_id, &endpoint_id, &user2, secret2.clone()); - cache.insert_allowed_ips(&project_id, &endpoint_id, allowed_ips.clone()); + cache.insert_role_secret( + (&project_id).into(), + (&endpoint_id).into(), + (&user1).into(), + secret1.clone(), + ); + cache.insert_role_secret( + (&project_id).into(), + (&endpoint_id).into(), + (&user2).into(), + secret2.clone(), + ); + cache.insert_allowed_ips( + (&project_id).into(), + (&endpoint_id).into(), + allowed_ips.clone(), + ); let cached = cache.get_role_secret(&endpoint_id, &user1).unwrap(); assert!(cached.cached()); @@ -393,7 +402,12 @@ mod tests { // Shouldn't add more than 2 roles. let user3: RoleName = "user3".into(); let secret3 = Some(AuthSecret::Scram(ServerSecret::mock([3; 32]))); - cache.insert_role_secret(&project_id, &endpoint_id, &user3, secret3.clone()); + cache.insert_role_secret( + (&project_id).into(), + (&endpoint_id).into(), + (&user3).into(), + secret3.clone(), + ); assert!(cache.get_role_secret(&endpoint_id, &user3).is_none()); let cached = cache.get_allowed_ips(&endpoint_id).unwrap(); @@ -421,8 +435,8 @@ mod tests { cache.clone().disable_ttl(); tokio::time::advance(Duration::from_secs(2)).await; - let project_id = "project".into(); - let endpoint_id = "endpoint".into(); + let project_id: ProjectId = "project".into(); + let endpoint_id: EndpointId = "endpoint".into(); let user1: RoleName = "user1".into(); let user2: RoleName = "user2".into(); let secret1 = Some(AuthSecret::Scram(ServerSecret::mock([1; 32]))); @@ -431,9 +445,23 @@ mod tests { "127.0.0.1".parse().unwrap(), "127.0.0.2".parse().unwrap(), ]); - cache.insert_role_secret(&project_id, &endpoint_id, &user1, secret1.clone()); - cache.insert_role_secret(&project_id, &endpoint_id, &user2, secret2.clone()); - cache.insert_allowed_ips(&project_id, &endpoint_id, allowed_ips.clone()); + cache.insert_role_secret( + (&project_id).into(), + (&endpoint_id).into(), + (&user1).into(), + secret1.clone(), + ); + cache.insert_role_secret( + (&project_id).into(), + (&endpoint_id).into(), + (&user2).into(), + secret2.clone(), + ); + cache.insert_allowed_ips( + (&project_id).into(), + (&endpoint_id).into(), + allowed_ips.clone(), + ); tokio::time::advance(Duration::from_secs(2)).await; // Nothing should be invalidated. @@ -470,8 +498,8 @@ mod tests { gc_interval: Duration::from_secs(600), })); - let project_id = "project".into(); - let endpoint_id = "endpoint".into(); + let project_id: ProjectId = "project".into(); + let endpoint_id: EndpointId = "endpoint".into(); let user1: RoleName = "user1".into(); let user2: RoleName = "user2".into(); let secret1 = Some(AuthSecret::Scram(ServerSecret::mock([1; 32]))); @@ -480,10 +508,20 @@ mod tests { "127.0.0.1".parse().unwrap(), "127.0.0.2".parse().unwrap(), ]); - cache.insert_role_secret(&project_id, &endpoint_id, &user1, secret1.clone()); + cache.insert_role_secret( + (&project_id).into(), + (&endpoint_id).into(), + (&user1).into(), + secret1.clone(), + ); cache.clone().disable_ttl(); tokio::time::advance(Duration::from_millis(100)).await; - cache.insert_role_secret(&project_id, &endpoint_id, &user2, secret2.clone()); + cache.insert_role_secret( + (&project_id).into(), + (&endpoint_id).into(), + (&user2).into(), + secret2.clone(), + ); // Added before ttl was disabled + ttl should be still cached. let cached = cache.get_role_secret(&endpoint_id, &user1).unwrap(); @@ -497,7 +535,11 @@ mod tests { assert!(cache.get_role_secret(&endpoint_id, &user2).is_none()); // Added after ttl was disabled + ttl should not be cached. - cache.insert_allowed_ips(&project_id, &endpoint_id, allowed_ips.clone()); + cache.insert_allowed_ips( + (&project_id).into(), + (&endpoint_id).into(), + allowed_ips.clone(), + ); let cached = cache.get_allowed_ips(&endpoint_id).unwrap(); assert!(!cached.cached()); diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index 65153babcb..ee33b97fbd 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -276,6 +276,7 @@ impl ConnCfg { let stream = connection.stream.into_inner(); info!( + cold_start_info = ctx.cold_start_info.as_str(), "connected to compute node at {host} ({socket_addr}) sslmode={:?}", self.0.get_ssl_mode() ); diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 361c3ef519..fc490c7348 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -5,6 +5,7 @@ use crate::{ }; use anyhow::{bail, ensure, Context, Ok}; use itertools::Itertools; +use remote_storage::RemoteStorageConfig; use rustls::{ crypto::ring::sign, pki_types::{CertificateDer, PrivateKeyDer}, @@ -39,6 +40,7 @@ pub struct ProxyConfig { pub struct MetricCollectionConfig { pub endpoint: reqwest::Url, pub interval: Duration, + pub backup_metric_collection_config: MetricBackupCollectionConfig, } pub struct TlsConfig { @@ -311,6 +313,21 @@ impl CertResolver { } } +#[derive(Debug)] +pub struct MetricBackupCollectionConfig { + pub interval: Duration, + pub remote_storage_config: OptRemoteStorageConfig, + pub chunk_size: usize, +} + +/// Hack to avoid clap being smarter. If you don't use this type alias, clap assumes more about the optional state and you get +/// runtime type errors from the value parser we use. +pub type OptRemoteStorageConfig = Option; + +pub fn remote_storage_from_toml(s: &str) -> anyhow::Result { + RemoteStorageConfig::from_toml(&s.parse()?) +} + /// Helper for cmdline cache options parsing. #[derive(Debug)] pub struct CacheOptions { diff --git a/proxy/src/console/messages.rs b/proxy/src/console/messages.rs index 102076f2c6..45161f5ac8 100644 --- a/proxy/src/console/messages.rs +++ b/proxy/src/console/messages.rs @@ -3,7 +3,7 @@ use std::fmt; use crate::auth::IpPattern; -use crate::{BranchId, EndpointId, ProjectId}; +use crate::intern::{BranchIdInt, EndpointIdInt, ProjectIdInt}; /// Generic error response with human-readable description. /// Note that we can't always present it to user as is. @@ -18,7 +18,7 @@ pub struct ConsoleError { pub struct GetRoleSecret { pub role_secret: Box, pub allowed_ips: Option>, - pub project_id: Option, + pub project_id: Option, } // Manually implement debug to omit sensitive info. @@ -93,22 +93,47 @@ impl fmt::Debug for DatabaseInfo { /// Various labels for prometheus metrics. /// Also known as `ProxyMetricsAuxInfo` in the console. -#[derive(Debug, Deserialize, Clone, Default)] +#[derive(Debug, Deserialize, Clone)] pub struct MetricsAuxInfo { - pub endpoint_id: EndpointId, - pub project_id: ProjectId, - pub branch_id: BranchId, - pub cold_start_info: Option, + pub endpoint_id: EndpointIdInt, + pub project_id: ProjectIdInt, + pub branch_id: BranchIdInt, + #[serde(default)] + pub cold_start_info: ColdStartInfo, } -#[derive(Debug, Default, Serialize, Deserialize, Clone)] +#[derive(Debug, Default, Serialize, Deserialize, Clone, Copy)] #[serde(rename_all = "snake_case")] pub enum ColdStartInfo { #[default] - Unknown = 0, - Warm = 1, - PoolHit = 2, - PoolMiss = 3, + Unknown, + /// Compute was already running + Warm, + #[serde(rename = "pool_hit")] + /// Compute was not running but there was an available VM + VmPoolHit, + #[serde(rename = "pool_miss")] + /// Compute was not running and there were no VMs available + VmPoolMiss, + + // not provided by control plane + /// Connection available from HTTP pool + HttpPoolHit, + /// Cached connection info + WarmCached, +} + +impl ColdStartInfo { + pub fn as_str(&self) -> &'static str { + match self { + ColdStartInfo::Unknown => "unknown", + ColdStartInfo::Warm => "warm", + ColdStartInfo::VmPoolHit => "pool_hit", + ColdStartInfo::VmPoolMiss => "pool_miss", + ColdStartInfo::HttpPoolHit => "http_pool_hit", + ColdStartInfo::WarmCached => "warm_cached", + } + } } #[cfg(test)] diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs index 69bfd6b045..f7d621fb12 100644 --- a/proxy/src/console/provider.rs +++ b/proxy/src/console/provider.rs @@ -12,7 +12,8 @@ use crate::{ compute, config::{CacheOptions, ProjectInfoCacheOptions}, context::RequestMonitoring, - scram, EndpointCacheKey, ProjectId, + intern::ProjectIdInt, + scram, EndpointCacheKey, }; use dashmap::DashMap; use std::{sync::Arc, time::Duration}; @@ -271,7 +272,7 @@ pub struct AuthInfo { /// List of IP addresses allowed for the autorization. pub allowed_ips: Vec, /// Project ID. This is used for cache invalidation. - pub project_id: Option, + pub project_id: Option, } /// Info for establishing a connection to a compute node. diff --git a/proxy/src/console/provider/mock.rs b/proxy/src/console/provider/mock.rs index b759c81373..cfe491f2aa 100644 --- a/proxy/src/console/provider/mock.rs +++ b/proxy/src/console/provider/mock.rs @@ -4,10 +4,16 @@ use super::{ errors::{ApiError, GetAuthInfoError, WakeComputeError}, AuthInfo, AuthSecret, CachedNodeInfo, NodeInfo, }; -use crate::console::provider::{CachedAllowedIps, CachedRoleSecret}; use crate::context::RequestMonitoring; use crate::{auth::backend::ComputeUserInfo, compute, error::io_error, scram, url::ApiUrl}; use crate::{auth::IpPattern, cache::Cached}; +use crate::{ + console::{ + messages::MetricsAuxInfo, + provider::{CachedAllowedIps, CachedRoleSecret}, + }, + BranchId, EndpointId, ProjectId, +}; use futures::TryFutureExt; use std::{str::FromStr, sync::Arc}; use thiserror::Error; @@ -114,7 +120,12 @@ impl Api { let node = NodeInfo { config, - aux: Default::default(), + aux: MetricsAuxInfo { + endpoint_id: (&EndpointId::from("endpoint")).into(), + project_id: (&ProjectId::from("project")).into(), + branch_id: (&BranchId::from("branch")).into(), + cold_start_info: crate::console::messages::ColdStartInfo::Warm, + }, allow_self_signed_compute: false, }; diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs index 289b0c08f7..1a3e2ca795 100644 --- a/proxy/src/console/provider/neon.rs +++ b/proxy/src/console/provider/neon.rs @@ -181,15 +181,16 @@ impl super::Api for Api { } let auth_info = self.do_get_auth_info(ctx, user_info).await?; if let Some(project_id) = auth_info.project_id { + let ep_int = ep.into(); self.caches.project_info.insert_role_secret( - &project_id, - ep, - user, + project_id, + ep_int, + user.into(), auth_info.secret.clone(), ); self.caches.project_info.insert_allowed_ips( - &project_id, - ep, + project_id, + ep_int, Arc::new(auth_info.allowed_ips), ); ctx.set_project_id(project_id); @@ -217,15 +218,16 @@ impl super::Api for Api { let allowed_ips = Arc::new(auth_info.allowed_ips); let user = &user_info.user; if let Some(project_id) = auth_info.project_id { + let ep_int = ep.into(); self.caches.project_info.insert_role_secret( - &project_id, - ep, - user, + project_id, + ep_int, + user.into(), auth_info.secret.clone(), ); self.caches .project_info - .insert_allowed_ips(&project_id, ep, allowed_ips.clone()); + .insert_allowed_ips(project_id, ep_int, allowed_ips.clone()); ctx.set_project_id(project_id); } Ok(( @@ -248,8 +250,7 @@ impl super::Api for Api { // which means that we might cache it to reduce the load and latency. if let Some(cached) = self.caches.node_info.get(&key) { info!(key = &*key, "found cached compute node info"); - info!("cold_start_info=warm"); - ctx.set_cold_start_info(ColdStartInfo::Warm); + ctx.set_project(cached.aux.clone()); return Ok(cached); } @@ -260,17 +261,21 @@ impl super::Api for Api { if permit.should_check_cache() { if let Some(cached) = self.caches.node_info.get(&key) { info!(key = &*key, "found cached compute node info"); - info!("cold_start_info=warm"); - ctx.set_cold_start_info(ColdStartInfo::Warm); + ctx.set_project(cached.aux.clone()); return Ok(cached); } } - let node = self.do_wake_compute(ctx, user_info).await?; + let mut node = self.do_wake_compute(ctx, user_info).await?; ctx.set_project(node.aux.clone()); - let cold_start_info = node.aux.cold_start_info.clone().unwrap_or_default(); - info!(?cold_start_info, "woken up a compute node"); - let (_, cached) = self.caches.node_info.insert(key.clone(), node); + let cold_start_info = node.aux.cold_start_info; + info!("woken up a compute node"); + + // store the cached node as 'warm' + node.aux.cold_start_info = ColdStartInfo::WarmCached; + let (_, mut cached) = self.caches.node_info.insert(key.clone(), node); + cached.aux.cold_start_info = cold_start_info; + info!(key = &*key, "created a cache entry for compute node info"); Ok(cached) diff --git a/proxy/src/context.rs b/proxy/src/context.rs index 7ca830cdb4..fec95f4722 100644 --- a/proxy/src/context.rs +++ b/proxy/src/context.rs @@ -11,8 +11,9 @@ use uuid::Uuid; use crate::{ console::messages::{ColdStartInfo, MetricsAuxInfo}, error::ErrorKind, + intern::{BranchIdInt, ProjectIdInt}, metrics::{LatencyTimer, ENDPOINT_ERRORS_BY_KIND, ERROR_BY_KIND}, - BranchId, DbName, EndpointId, ProjectId, RoleName, + DbName, EndpointId, RoleName, }; use self::parquet::RequestData; @@ -34,8 +35,8 @@ pub struct RequestMonitoring { pub span: Span, // filled in as they are discovered - project: Option, - branch: Option, + project: Option, + branch: Option, endpoint_id: Option, dbname: Option, user: Option, @@ -43,7 +44,7 @@ pub struct RequestMonitoring { error_kind: Option, pub(crate) auth_method: Option, success: bool, - cold_start_info: Option, + pub(crate) cold_start_info: ColdStartInfo, // extra // This sender is here to keep the request monitoring channel open while requests are taking place. @@ -92,7 +93,7 @@ impl RequestMonitoring { error_kind: None, auth_method: None, success: false, - cold_start_info: None, + cold_start_info: ColdStartInfo::Unknown, sender: LOG_CHAN.get().and_then(|tx| tx.upgrade()), latency_timer: LatencyTimer::new(protocol), @@ -113,26 +114,31 @@ impl RequestMonitoring { } pub fn set_cold_start_info(&mut self, info: ColdStartInfo) { - self.cold_start_info = Some(info); + self.cold_start_info = info; + self.latency_timer.cold_start_info(info); } pub fn set_project(&mut self, x: MetricsAuxInfo) { - self.set_endpoint_id(x.endpoint_id); + if self.endpoint_id.is_none() { + self.set_endpoint_id(x.endpoint_id.as_str().into()) + } self.branch = Some(x.branch_id); self.project = Some(x.project_id); - self.cold_start_info = x.cold_start_info; + self.set_cold_start_info(x.cold_start_info); } - pub fn set_project_id(&mut self, project_id: ProjectId) { + pub fn set_project_id(&mut self, project_id: ProjectIdInt) { self.project = Some(project_id); } pub fn set_endpoint_id(&mut self, endpoint_id: EndpointId) { - self.span.record("ep", display(&endpoint_id)); - crate::metrics::CONNECTING_ENDPOINTS - .with_label_values(&[self.protocol]) - .measure(&endpoint_id); - self.endpoint_id = Some(endpoint_id); + if self.endpoint_id.is_none() { + self.span.record("ep", display(&endpoint_id)); + crate::metrics::CONNECTING_ENDPOINTS + .with_label_values(&[self.protocol]) + .measure(&endpoint_id); + self.endpoint_id = Some(endpoint_id); + } } pub fn set_application(&mut self, app: Option) { diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index a2be1c4186..eb77409429 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -13,12 +13,14 @@ use parquet::{ }, record::RecordWriter, }; -use remote_storage::{GenericRemoteStorage, RemotePath, RemoteStorageConfig, TimeoutOrCancel}; +use remote_storage::{GenericRemoteStorage, RemotePath, TimeoutOrCancel}; use tokio::{sync::mpsc, time}; use tokio_util::sync::CancellationToken; use tracing::{debug, info, Span}; use utils::backoff; +use crate::config::{remote_storage_from_toml, OptRemoteStorageConfig}; + use super::{RequestMonitoring, LOG_CHAN}; #[derive(clap::Args, Clone, Debug)] @@ -50,21 +52,13 @@ pub struct ParquetUploadArgs { parquet_upload_compression: Compression, } -/// Hack to avoid clap being smarter. If you don't use this type alias, clap assumes more about the optional state and you get -/// runtime type errors from the value parser we use. -type OptRemoteStorageConfig = Option; - -fn remote_storage_from_toml(s: &str) -> anyhow::Result { - RemoteStorageConfig::from_toml(&s.parse()?) -} - // Occasional network issues and such can cause remote operations to fail, and // that's expected. If a upload fails, we log it at info-level, and retry. // But after FAILED_UPLOAD_WARN_THRESHOLD retries, we start to log it at WARN // level instead, as repeated failures can mean a more serious problem. If it // fails more than FAILED_UPLOAD_RETRIES times, we give up -pub(crate) const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3; -pub(crate) const FAILED_UPLOAD_MAX_RETRIES: u32 = 10; +pub const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3; +pub const FAILED_UPLOAD_MAX_RETRIES: u32 = 10; // the parquet crate leaves a lot to be desired... // what follows is an attempt to write parquet files with minimal allocs. @@ -93,7 +87,7 @@ pub struct RequestData { /// Or if we make it to proxy_pass success: bool, /// Indicates if the cplane started the new compute node for this request. - cold_start_info: Option<&'static str>, + cold_start_info: &'static str, /// Tracks time from session start (HTTP request/libpq TCP handshake) /// Through to success/failure duration_us: u64, @@ -121,12 +115,7 @@ impl From<&RequestMonitoring> for RequestData { region: value.region, error: value.error_kind.as_ref().map(|e| e.to_metric_label()), success: value.success, - cold_start_info: value.cold_start_info.as_ref().map(|x| match x { - crate::console::messages::ColdStartInfo::Unknown => "unknown", - crate::console::messages::ColdStartInfo::Warm => "warm", - crate::console::messages::ColdStartInfo::PoolHit => "pool_hit", - crate::console::messages::ColdStartInfo::PoolMiss => "pool_miss", - }), + cold_start_info: value.cold_start_info.as_str(), duration_us: SystemTime::from(value.first_packet) .elapsed() .unwrap_or_default() @@ -460,7 +449,7 @@ mod tests { region: "us-east-1", error: None, success: rng.gen(), - cold_start_info: Some("no"), + cold_start_info: "no", duration_us: rng.gen_range(0..30_000_000), } } @@ -530,15 +519,15 @@ mod tests { assert_eq!( file_stats, [ - (1314406, 3, 6000), - (1314399, 3, 6000), - (1314459, 3, 6000), - (1314416, 3, 6000), - (1314546, 3, 6000), - (1314388, 3, 6000), - (1314180, 3, 6000), - (1314416, 3, 6000), - (438359, 1, 2000) + (1314385, 3, 6000), + (1314378, 3, 6000), + (1314438, 3, 6000), + (1314395, 3, 6000), + (1314525, 3, 6000), + (1314367, 3, 6000), + (1314159, 3, 6000), + (1314395, 3, 6000), + (438352, 1, 2000) ] ); @@ -568,11 +557,11 @@ mod tests { assert_eq!( file_stats, [ - (1220668, 5, 10000), - (1226818, 5, 10000), - (1228612, 5, 10000), - (1227974, 5, 10000), - (1219252, 5, 10000) + (1220633, 5, 10000), + (1226783, 5, 10000), + (1228577, 5, 10000), + (1227939, 5, 10000), + (1219217, 5, 10000) ] ); @@ -604,11 +593,11 @@ mod tests { assert_eq!( file_stats, [ - (1206315, 5, 10000), - (1206046, 5, 10000), - (1206339, 5, 10000), - (1206327, 5, 10000), - (1206582, 5, 10000) + (1206280, 5, 10000), + (1206011, 5, 10000), + (1206304, 5, 10000), + (1206292, 5, 10000), + (1206547, 5, 10000) ] ); @@ -633,15 +622,15 @@ mod tests { assert_eq!( file_stats, [ - (1314406, 3, 6000), - (1314399, 3, 6000), - (1314459, 3, 6000), - (1314416, 3, 6000), - (1314546, 3, 6000), - (1314388, 3, 6000), - (1314180, 3, 6000), - (1314416, 3, 6000), - (438359, 1, 2000) + (1314385, 3, 6000), + (1314378, 3, 6000), + (1314438, 3, 6000), + (1314395, 3, 6000), + (1314525, 3, 6000), + (1314367, 3, 6000), + (1314159, 3, 6000), + (1314395, 3, 6000), + (438352, 1, 2000) ] ); @@ -678,7 +667,7 @@ mod tests { // files are smaller than the size threshold, but they took too long to fill so were flushed early assert_eq!( file_stats, - [(658837, 2, 3001), (658551, 2, 3000), (658347, 2, 2999)] + [(658823, 2, 3001), (658537, 2, 3000), (658333, 2, 2999)] ); tmpdir.close().unwrap(); diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index 4172dc19da..59ee899c08 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -12,6 +12,8 @@ use metrics::{ use once_cell::sync::Lazy; use tokio::time::{self, Instant}; +use crate::console::messages::ColdStartInfo; + pub static NUM_DB_CONNECTIONS_GAUGE: Lazy = Lazy::new(|| { register_int_counter_pair_vec!( "proxy_opened_db_connections_total", @@ -50,8 +52,8 @@ pub static COMPUTE_CONNECTION_LATENCY: Lazy = Lazy::new(|| { "proxy_compute_connection_latency_seconds", "Time it took for proxy to establish a connection to the compute endpoint", // http/ws/tcp, true/false, true/false, success/failure, client/client_and_cplane - // 3 * 2 * 2 * 2 * 2 = 48 counters - &["protocol", "cache_miss", "pool_miss", "outcome", "excluded"], + // 3 * 6 * 2 * 2 = 72 counters + &["protocol", "cold_start_info", "outcome", "excluded"], // largest bucket = 2^16 * 0.5ms = 32s exponential_buckets(0.0005, 2.0, 16).unwrap(), ) @@ -117,12 +119,15 @@ pub static ALLOWED_IPS_NUMBER: Lazy = Lazy::new(|| { .unwrap() }); -pub static HTTP_CONTENT_LENGTH: Lazy = Lazy::new(|| { - register_histogram!( +pub static HTTP_CONTENT_LENGTH: Lazy = Lazy::new(|| { + register_histogram_vec!( "proxy_http_conn_content_length_bytes", - "Time it took for proxy to establish a connection to the compute endpoint", - // largest bucket = 3^16 * 0.05ms = 2.15s - exponential_buckets(8.0, 2.0, 20).unwrap() + "Number of bytes the HTTP response content consumes", + // request/response + &["direction"], + // smallest bucket = 16 bytes + // largest bucket = 4^12 * 16 bytes = 256MB + exponential_buckets(16.0, 4.0, 12).unwrap() ) .unwrap() }); @@ -180,6 +185,20 @@ struct Accumulated { compute: time::Duration, } +enum Outcome { + Success, + Failed, +} + +impl Outcome { + fn as_str(&self) -> &'static str { + match self { + Outcome::Success => "success", + Outcome::Failed => "failed", + } + } +} + pub struct LatencyTimer { // time since the stopwatch was started start: time::Instant, @@ -189,9 +208,8 @@ pub struct LatencyTimer { accumulated: Accumulated, // label data protocol: &'static str, - cache_miss: bool, - pool_miss: bool, - outcome: &'static str, + cold_start_info: ColdStartInfo, + outcome: Outcome, } pub struct LatencyTimerPause<'a> { @@ -207,11 +225,9 @@ impl LatencyTimer { stop: None, accumulated: Accumulated::default(), protocol, - cache_miss: false, - // by default we don't do pooling - pool_miss: true, + cold_start_info: ColdStartInfo::Unknown, // assume failed unless otherwise specified - outcome: "failed", + outcome: Outcome::Failed, } } @@ -223,12 +239,8 @@ impl LatencyTimer { } } - pub fn cache_miss(&mut self) { - self.cache_miss = true; - } - - pub fn pool_hit(&mut self) { - self.pool_miss = false; + pub fn cold_start_info(&mut self, cold_start_info: ColdStartInfo) { + self.cold_start_info = cold_start_info; } pub fn success(&mut self) { @@ -236,7 +248,7 @@ impl LatencyTimer { self.stop = Some(time::Instant::now()); // success - self.outcome = "success"; + self.outcome = Outcome::Success; } } @@ -261,9 +273,8 @@ impl Drop for LatencyTimer { COMPUTE_CONNECTION_LATENCY .with_label_values(&[ self.protocol, - bool_to_str(self.cache_miss), - bool_to_str(self.pool_miss), - self.outcome, + self.cold_start_info.as_str(), + self.outcome.as_str(), "client", ]) .observe((duration.saturating_sub(self.accumulated.client)).as_secs_f64()); @@ -272,9 +283,8 @@ impl Drop for LatencyTimer { COMPUTE_CONNECTION_LATENCY .with_label_values(&[ self.protocol, - bool_to_str(self.cache_miss), - bool_to_str(self.pool_miss), - self.outcome, + self.cold_start_info.as_str(), + self.outcome.as_str(), "client_and_cplane", ]) .observe((duration.saturating_sub(accumulated_total)).as_secs_f64()); diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs index c76e2ff6d9..4c0d68ce0b 100644 --- a/proxy/src/proxy/connect_compute.rs +++ b/proxy/src/proxy/connect_compute.rs @@ -87,7 +87,6 @@ impl ConnectMechanism for TcpMechanism<'_> { } /// Try to connect to the compute node, retrying if necessary. -/// This function might update `node_info`, so we take it by `&mut`. #[tracing::instrument(skip_all)] pub async fn connect_to_compute( ctx: &mut RequestMonitoring, @@ -132,7 +131,6 @@ where } else { // if we failed to connect, it's likely that the compute node was suspended, wake a new compute node info!("compute node's state has likely changed; requesting a wake-up"); - ctx.latency_timer.cache_miss(); let old_node_info = invalidate_cache(node_info); let mut node_info = wake_compute(&mut num_retries, ctx, user_info).await?; node_info.reuse_settings(old_node_info); diff --git a/proxy/src/proxy/passthrough.rs b/proxy/src/proxy/passthrough.rs index f6d4314391..c81a1a8292 100644 --- a/proxy/src/proxy/passthrough.rs +++ b/proxy/src/proxy/passthrough.rs @@ -4,7 +4,7 @@ use crate::{ console::messages::MetricsAuxInfo, metrics::NUM_BYTES_PROXIED_COUNTER, stream::Stream, - usage_metrics::{Ids, USAGE_METRICS}, + usage_metrics::{Ids, MetricCounterRecorder, USAGE_METRICS}, }; use metrics::IntCounterPairGuard; use tokio::io::{AsyncRead, AsyncWrite}; @@ -19,8 +19,8 @@ pub async fn proxy_pass( aux: MetricsAuxInfo, ) -> anyhow::Result<()> { let usage = USAGE_METRICS.register(Ids { - endpoint_id: aux.endpoint_id.clone(), - branch_id: aux.branch_id.clone(), + endpoint_id: aux.endpoint_id, + branch_id: aux.branch_id, }); let m_sent = NUM_BYTES_PROXIED_COUNTER.with_label_values(&["tx"]); diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs index a4051447c1..71d85e106d 100644 --- a/proxy/src/proxy/tests.rs +++ b/proxy/src/proxy/tests.rs @@ -12,11 +12,12 @@ use crate::auth::backend::{ }; use crate::config::CertResolver; use crate::console::caches::NodeInfoCache; +use crate::console::messages::MetricsAuxInfo; use crate::console::provider::{CachedAllowedIps, CachedRoleSecret, ConsoleBackend}; use crate::console::{self, CachedNodeInfo, NodeInfo}; use crate::error::ErrorKind; use crate::proxy::retry::{retry_after, NUM_RETRIES_CONNECT}; -use crate::{http, sasl, scram}; +use crate::{http, sasl, scram, BranchId, EndpointId, ProjectId}; use anyhow::{bail, Context}; use async_trait::async_trait; use rstest::rstest; @@ -512,7 +513,12 @@ impl TestBackend for TestConnectMechanism { fn helper_create_cached_node_info(cache: &'static NodeInfoCache) -> CachedNodeInfo { let node = NodeInfo { config: compute::ConnCfg::new(), - aux: Default::default(), + aux: MetricsAuxInfo { + endpoint_id: (&EndpointId::from("endpoint")).into(), + project_id: (&ProjectId::from("project")).into(), + branch_id: (&BranchId::from("branch")).into(), + cold_start_info: crate::console::messages::ColdStartInfo::Warm, + }, allow_self_signed_compute: false, }; let (_, node) = cache.insert("key".into(), node); diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index f10779d7ba..8aa5ad4e8a 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -9,7 +9,6 @@ use crate::{ config::ProxyConfig, console::{ errors::{GetAuthInfoError, WakeComputeError}, - messages::ColdStartInfo, CachedNodeInfo, }, context::RequestMonitoring, @@ -57,7 +56,10 @@ impl PoolingBackend { let auth_outcome = crate::auth::validate_password_and_exchange(&conn_info.password, secret).await?; let res = match auth_outcome { - crate::sasl::Outcome::Success(key) => Ok(key), + crate::sasl::Outcome::Success(key) => { + info!("user successfully authenticated"); + Ok(key) + } crate::sasl::Outcome::Failure(reason) => { info!("auth backend failed with an error: {reason}"); Err(AuthError::auth_failed(&*conn_info.user_info.user)) @@ -89,8 +91,6 @@ impl PoolingBackend { }; if let Some(client) = maybe_client { - info!("cold_start_info=warm"); - ctx.set_cold_start_info(ColdStartInfo::Warm); return Ok(client); } let conn_id = uuid::Uuid::new_v4(); diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index c7e8eaef76..35311facb8 100644 --- a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -17,7 +17,7 @@ use tokio::time::Instant; use tokio_postgres::tls::NoTlsStream; use tokio_postgres::{AsyncMessage, ReadyForQueryStatus, Socket}; -use crate::console::messages::MetricsAuxInfo; +use crate::console::messages::{ColdStartInfo, MetricsAuxInfo}; use crate::metrics::{ENDPOINT_POOLS, GC_LATENCY, NUM_OPEN_CLIENTS_IN_HTTP_POOL}; use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS}; use crate::{ @@ -383,9 +383,12 @@ impl GlobalConnPool { "pid", &tracing::field::display(client.inner.get_process_id()), ); - info!("pool: reusing connection '{conn_info}'"); + info!( + cold_start_info = ColdStartInfo::HttpPoolHit.as_str(), + "pool: reusing connection '{conn_info}'" + ); client.session.send(ctx.session_id)?; - ctx.latency_timer.pool_hit(); + ctx.set_cold_start_info(ColdStartInfo::HttpPoolHit); ctx.latency_timer.success(); return Ok(Some(Client::new(client, conn_info.clone(), endpoint_pool))); } @@ -454,8 +457,9 @@ pub fn poll_client( let (tx, mut rx) = tokio::sync::watch::channel(session_id); let span = info_span!(parent: None, "connection", %conn_id); + let cold_start_info = ctx.cold_start_info; span.in_scope(|| { - info!(%conn_info, %session_id, "new connection"); + info!(cold_start_info = cold_start_info.as_str(), %conn_info, %session_id, "new connection"); }); let pool = match conn_info.endpoint_cache_key() { Some(endpoint) => Arc::downgrade(&global_pool.get_or_create_endpoint_pool(&endpoint)), @@ -565,8 +569,8 @@ impl Client { pub fn metrics(&self) -> Arc { let aux = &self.inner.as_ref().unwrap().aux; USAGE_METRICS.register(Ids { - endpoint_id: aux.endpoint_id.clone(), - branch_id: aux.branch_id.clone(), + endpoint_id: aux.endpoint_id, + branch_id: aux.branch_id, }) } } @@ -666,6 +670,8 @@ impl Drop for Client { mod tests { use std::{mem, sync::atomic::AtomicBool}; + use crate::{BranchId, EndpointId, ProjectId}; + use super::*; struct MockClient(Arc); @@ -691,7 +697,12 @@ mod tests { ClientInner { inner: client, session: tokio::sync::watch::Sender::new(uuid::Uuid::new_v4()), - aux: Default::default(), + aux: MetricsAuxInfo { + endpoint_id: (&EndpointId::from("endpoint")).into(), + project_id: (&ProjectId::from("project")).into(), + branch_id: (&BranchId::from("branch")).into(), + cold_start_info: crate::console::messages::ColdStartInfo::Warm, + }, conn_id: uuid::Uuid::new_v4(), } } diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index f675375ff1..00dffd5784 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -42,12 +42,15 @@ use crate::error::ReportableError; use crate::error::UserFacingError; use crate::metrics::HTTP_CONTENT_LENGTH; use crate::metrics::NUM_CONNECTION_REQUESTS_GAUGE; +use crate::proxy::run_until_cancelled; use crate::proxy::NeonOptions; use crate::serverless::backend::HttpConnError; +use crate::usage_metrics::MetricCounterRecorder; use crate::DbName; use crate::RoleName; use super::backend::PoolingBackend; +use super::conn_pool::Client; use super::conn_pool::ConnInfo; use super::json::json_to_pg_text; use super::json::pg_text_row_to_json; @@ -219,14 +222,7 @@ pub async fn handle( backend: Arc, cancel: CancellationToken, ) -> Result, ApiError> { - let cancel2 = cancel.clone(); - let handle = tokio::spawn(async move { - time::sleep(config.http_config.request_timeout).await; - cancel2.cancel(); - }); - let result = handle_inner(cancel, config, &mut ctx, request, backend).await; - handle.abort(); let mut response = match result { Ok(r) => { @@ -237,10 +233,7 @@ pub async fn handle( let error_kind = e.get_error_kind(); ctx.set_error_kind(error_kind); - let message = format!( - "Query cancelled, runtime exceeded. SQL queries over HTTP must not exceed {} seconds of runtime. Please consider using our websocket based connections", - config.http_config.request_timeout.as_secs_f64() - ); + let message = "Query cancelled, connection was terminated"; tracing::info!( kind=error_kind.to_metric_label(), @@ -434,6 +427,63 @@ impl ReportableError for SqlOverHttpCancel { } } +#[derive(Clone, Copy, Debug)] +struct HttpHeaders { + raw_output: bool, + default_array_mode: bool, + txn_isolation_level: Option, + txn_read_only: bool, + txn_deferrable: bool, +} + +impl HttpHeaders { + fn try_parse(headers: &hyper::http::HeaderMap) -> Result { + // Determine the output options. Default behaviour is 'false'. Anything that is not + // strictly 'true' assumed to be false. + let raw_output = headers.get(&RAW_TEXT_OUTPUT) == Some(&HEADER_VALUE_TRUE); + let default_array_mode = headers.get(&ARRAY_MODE) == Some(&HEADER_VALUE_TRUE); + + // isolation level, read only and deferrable + let txn_isolation_level = match headers.get(&TXN_ISOLATION_LEVEL) { + Some(x) => Some( + map_header_to_isolation_level(x).ok_or(SqlOverHttpError::InvalidIsolationLevel)?, + ), + None => None, + }; + + let txn_read_only = headers.get(&TXN_READ_ONLY) == Some(&HEADER_VALUE_TRUE); + let txn_deferrable = headers.get(&TXN_DEFERRABLE) == Some(&HEADER_VALUE_TRUE); + + Ok(Self { + raw_output, + default_array_mode, + txn_isolation_level, + txn_read_only, + txn_deferrable, + }) + } +} + +fn map_header_to_isolation_level(level: &HeaderValue) -> Option { + match level.as_bytes() { + b"Serializable" => Some(IsolationLevel::Serializable), + b"ReadUncommitted" => Some(IsolationLevel::ReadUncommitted), + b"ReadCommitted" => Some(IsolationLevel::ReadCommitted), + b"RepeatableRead" => Some(IsolationLevel::RepeatableRead), + _ => None, + } +} + +fn map_isolation_level_to_headers(level: IsolationLevel) -> Option { + match level { + IsolationLevel::ReadUncommitted => Some(HeaderValue::from_static("ReadUncommitted")), + IsolationLevel::ReadCommitted => Some(HeaderValue::from_static("ReadCommitted")), + IsolationLevel::RepeatableRead => Some(HeaderValue::from_static("RepeatableRead")), + IsolationLevel::Serializable => Some(HeaderValue::from_static("Serializable")), + _ => None, + } +} + async fn handle_inner( cancel: CancellationToken, config: &'static ProxyConfig, @@ -450,43 +500,26 @@ async fn handle_inner( // Determine the destination and connection params // let headers = request.headers(); + // TLS config should be there. let conn_info = get_conn_info(ctx, headers, config.tls_config.as_ref().unwrap())?; info!(user = conn_info.user_info.user.as_str(), "credentials"); - // Determine the output options. Default behaviour is 'false'. Anything that is not - // strictly 'true' assumed to be false. - let raw_output = headers.get(&RAW_TEXT_OUTPUT) == Some(&HEADER_VALUE_TRUE); - let default_array_mode = headers.get(&ARRAY_MODE) == Some(&HEADER_VALUE_TRUE); - // Allow connection pooling only if explicitly requested // or if we have decided that http pool is no longer opt-in let allow_pool = !config.http_config.pool_options.opt_in || headers.get(&ALLOW_POOL) == Some(&HEADER_VALUE_TRUE); - // isolation level, read only and deferrable - - let txn_isolation_level_raw = headers.get(&TXN_ISOLATION_LEVEL).cloned(); - let txn_isolation_level = match txn_isolation_level_raw { - Some(ref x) => Some(match x.as_bytes() { - b"Serializable" => IsolationLevel::Serializable, - b"ReadUncommitted" => IsolationLevel::ReadUncommitted, - b"ReadCommitted" => IsolationLevel::ReadCommitted, - b"RepeatableRead" => IsolationLevel::RepeatableRead, - _ => return Err(SqlOverHttpError::InvalidIsolationLevel), - }), - None => None, - }; - - let txn_read_only = headers.get(&TXN_READ_ONLY) == Some(&HEADER_VALUE_TRUE); - let txn_deferrable = headers.get(&TXN_DEFERRABLE) == Some(&HEADER_VALUE_TRUE); + let parsed_headers = HttpHeaders::try_parse(headers)?; let request_content_length = match request.body().size_hint().upper() { Some(v) => v, None => MAX_REQUEST_SIZE + 1, }; info!(request_content_length, "request size in bytes"); - HTTP_CONTENT_LENGTH.observe(request_content_length as f64); + HTTP_CONTENT_LENGTH + .with_label_values(&["request"]) + .observe(request_content_length as f64); // we don't have a streaming request support yet so this is to prevent OOM // from a malicious user sending an extremely large request body @@ -514,20 +547,18 @@ async fn handle_inner( } .map_err(SqlOverHttpError::from); - // Run both operations in parallel - let (payload, mut client) = match select( + let (payload, mut client) = match run_until_cancelled( + // Run both operations in parallel try_join( pin!(fetch_and_process_request), pin!(authenticate_and_connect), ), - pin!(cancel.cancelled()), + &cancel, ) .await { - Either::Left((result, _cancelled)) => result?, - Either::Right((_cancelled, _)) => { - return Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Connect)) - } + Some(result) => result?, + None => return Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Connect)), }; let mut response = Response::builder() @@ -537,95 +568,143 @@ async fn handle_inner( // // Now execute the query and return the result // - let mut size = 0; let result = match payload { - Payload::Single(stmt) => { - let mut size = 0; - let (inner, mut discard) = client.inner(); - let cancel_token = inner.cancel_token(); - let query = pin!(query_to_json( - &*inner, - stmt, - &mut size, - raw_output, - default_array_mode - )); - let cancelled = pin!(cancel.cancelled()); - let res = select(query, cancelled).await; - match res { - Either::Left((Ok((status, results)), _cancelled)) => { - discard.check_idle(status); - results - } - Either::Left((Err(e), _cancelled)) => { - discard.discard(); - return Err(e); - } - Either::Right((_cancelled, query)) => { - if let Err(err) = cancel_token.cancel_query(NoTls).await { - tracing::error!(?err, "could not cancel query"); - } - match time::timeout(time::Duration::from_millis(100), query).await { - Ok(Ok((status, results))) => { - discard.check_idle(status); - results - } - Ok(Err(error)) => { - let db_error = match &error { - SqlOverHttpError::ConnectCompute( - HttpConnError::ConnectionError(e), - ) - | SqlOverHttpError::Postgres(e) => e.as_db_error(), - _ => None, - }; - - // if errored for some other reason, it might not be safe to return - if !db_error.is_some_and(|e| *e.code() == SqlState::QUERY_CANCELED) { - discard.discard(); - } - - return Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres)); - } - Err(_timeout) => { - discard.discard(); - return Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres)); - } - } - } - } - } + Payload::Single(stmt) => stmt.process(cancel, &mut client, parsed_headers).await?, Payload::Batch(statements) => { - info!("starting transaction"); - let (inner, mut discard) = client.inner(); - let cancel_token = inner.cancel_token(); - let mut builder = inner.build_transaction(); - if let Some(isolation_level) = txn_isolation_level { - builder = builder.isolation_level(isolation_level); + if parsed_headers.txn_read_only { + response = response.header(TXN_READ_ONLY.clone(), &HEADER_VALUE_TRUE); } - if txn_read_only { - builder = builder.read_only(true); + if parsed_headers.txn_deferrable { + response = response.header(TXN_DEFERRABLE.clone(), &HEADER_VALUE_TRUE); } - if txn_deferrable { - builder = builder.deferrable(true); - } - - let transaction = builder.start().await.map_err(|e| { - // if we cannot start a transaction, we should return immediately - // and not return to the pool. connection is clearly broken - discard.discard(); - e - })?; - - let results = match query_batch( - cancel.child_token(), - &transaction, - statements, - &mut size, - raw_output, - default_array_mode, - ) - .await + if let Some(txn_isolation_level) = parsed_headers + .txn_isolation_level + .and_then(map_isolation_level_to_headers) { + response = response.header(TXN_ISOLATION_LEVEL.clone(), txn_isolation_level); + } + + statements + .process(cancel, &mut client, parsed_headers) + .await? + } + }; + + let metrics = client.metrics(); + + // how could this possibly fail + let body = serde_json::to_string(&result).expect("json serialization should not fail"); + let len = body.len(); + let response = response + .body(Body::from(body)) + // only fails if invalid status code or invalid header/values are given. + // these are not user configurable so it cannot fail dynamically + .expect("building response payload should not fail"); + + // count the egress bytes - we miss the TLS and header overhead but oh well... + // moving this later in the stack is going to be a lot of effort and ehhhh + metrics.record_egress(len as u64); + HTTP_CONTENT_LENGTH + .with_label_values(&["response"]) + .observe(len as f64); + + Ok(response) +} + +impl QueryData { + async fn process( + self, + cancel: CancellationToken, + client: &mut Client, + parsed_headers: HttpHeaders, + ) -> Result { + let (inner, mut discard) = client.inner(); + let cancel_token = inner.cancel_token(); + + let res = match select( + pin!(query_to_json(&*inner, self, &mut 0, parsed_headers)), + pin!(cancel.cancelled()), + ) + .await + { + // The query successfully completed. + Either::Left((Ok((status, results)), __not_yet_cancelled)) => { + discard.check_idle(status); + Ok(results) + } + // The query failed with an error + Either::Left((Err(e), __not_yet_cancelled)) => { + discard.discard(); + return Err(e); + } + // The query was cancelled. + Either::Right((_cancelled, query)) => { + if let Err(err) = cancel_token.cancel_query(NoTls).await { + tracing::error!(?err, "could not cancel query"); + } + // wait for the query cancellation + match time::timeout(time::Duration::from_millis(100), query).await { + // query successed before it was cancelled. + Ok(Ok((status, results))) => { + discard.check_idle(status); + Ok(results) + } + // query failed or was cancelled. + Ok(Err(error)) => { + let db_error = match &error { + SqlOverHttpError::ConnectCompute(HttpConnError::ConnectionError(e)) + | SqlOverHttpError::Postgres(e) => e.as_db_error(), + _ => None, + }; + + // if errored for some other reason, it might not be safe to return + if !db_error.is_some_and(|e| *e.code() == SqlState::QUERY_CANCELED) { + discard.discard(); + } + + Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres)) + } + Err(_timeout) => { + discard.discard(); + Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres)) + } + } + } + }; + res + } +} + +impl BatchQueryData { + async fn process( + self, + cancel: CancellationToken, + client: &mut Client, + parsed_headers: HttpHeaders, + ) -> Result { + info!("starting transaction"); + let (inner, mut discard) = client.inner(); + let cancel_token = inner.cancel_token(); + let mut builder = inner.build_transaction(); + if let Some(isolation_level) = parsed_headers.txn_isolation_level { + builder = builder.isolation_level(isolation_level); + } + if parsed_headers.txn_read_only { + builder = builder.read_only(true); + } + if parsed_headers.txn_deferrable { + builder = builder.deferrable(true); + } + + let transaction = builder.start().await.map_err(|e| { + // if we cannot start a transaction, we should return immediately + // and not return to the pool. connection is clearly broken + discard.discard(); + e + })?; + + let results = + match query_batch(cancel.child_token(), &transaction, self, parsed_headers).await { Ok(results) => { info!("commit"); let status = transaction.commit().await.map_err(|e| { @@ -659,44 +738,15 @@ async fn handle_inner( } }; - if txn_read_only { - response = response.header(TXN_READ_ONLY.clone(), &HEADER_VALUE_TRUE); - } - if txn_deferrable { - response = response.header(TXN_DEFERRABLE.clone(), &HEADER_VALUE_TRUE); - } - if let Some(txn_isolation_level) = txn_isolation_level_raw { - response = response.header(TXN_ISOLATION_LEVEL.clone(), txn_isolation_level); - } - json!({ "results": results }) - } - }; - - let metrics = client.metrics(); - - // how could this possibly fail - let body = serde_json::to_string(&result).expect("json serialization should not fail"); - let len = body.len(); - let response = response - .body(Body::from(body)) - // only fails if invalid status code or invalid header/values are given. - // these are not user configurable so it cannot fail dynamically - .expect("building response payload should not fail"); - - // count the egress bytes - we miss the TLS and header overhead but oh well... - // moving this later in the stack is going to be a lot of effort and ehhhh - metrics.record_egress(len as u64); - - Ok(response) + Ok(json!({ "results": results })) + } } async fn query_batch( cancel: CancellationToken, transaction: &Transaction<'_>, queries: BatchQueryData, - total_size: &mut usize, - raw_output: bool, - array_mode: bool, + parsed_headers: HttpHeaders, ) -> Result, SqlOverHttpError> { let mut results = Vec::with_capacity(queries.queries.len()); let mut current_size = 0; @@ -705,8 +755,7 @@ async fn query_batch( transaction, stmt, &mut current_size, - raw_output, - array_mode + parsed_headers, )); let cancelled = pin!(cancel.cancelled()); let res = select(query, cancelled).await; @@ -723,7 +772,6 @@ async fn query_batch( } } } - *total_size += current_size; Ok(results) } @@ -731,8 +779,7 @@ async fn query_to_json( client: &T, data: QueryData, current_size: &mut usize, - raw_output: bool, - default_array_mode: bool, + parsed_headers: HttpHeaders, ) -> Result<(ReadyForQueryStatus, Value), SqlOverHttpError> { info!("executing query"); let query_params = data.params; @@ -792,12 +839,12 @@ async fn query_to_json( columns.push(client.get_type(c.type_oid()).await?); } - let array_mode = data.array_mode.unwrap_or(default_array_mode); + let array_mode = data.array_mode.unwrap_or(parsed_headers.default_array_mode); // convert rows to JSON let rows = rows .iter() - .map(|row| pg_text_row_to_json(row, &columns, raw_output, array_mode)) + .map(|row| pg_text_row_to_json(row, &columns, parsed_headers.raw_output, array_mode)) .collect::, _>>()?; // resulting JSON format is based on the format of node-postgres result diff --git a/proxy/src/usage_metrics.rs b/proxy/src/usage_metrics.rs index d75aedf89b..5ffbf95c07 100644 --- a/proxy/src/usage_metrics.rs +++ b/proxy/src/usage_metrics.rs @@ -1,20 +1,35 @@ //! Periodically collect proxy consumption metrics //! and push them to a HTTP endpoint. -use crate::{config::MetricCollectionConfig, http, BranchId, EndpointId}; -use chrono::{DateTime, Utc}; +use crate::{ + config::{MetricBackupCollectionConfig, MetricCollectionConfig}, + context::parquet::{FAILED_UPLOAD_MAX_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD}, + http, + intern::{BranchIdInt, EndpointIdInt}, +}; +use anyhow::Context; +use async_compression::tokio::write::GzipEncoder; +use bytes::Bytes; +use chrono::{DateTime, Datelike, Timelike, Utc}; use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_SIZE}; use dashmap::{mapref::entry::Entry, DashMap}; +use futures::future::select; use once_cell::sync::Lazy; +use remote_storage::{GenericRemoteStorage, RemotePath, TimeoutOrCancel}; use serde::{Deserialize, Serialize}; use std::{ convert::Infallible, + pin::pin, sync::{ atomic::{AtomicU64, AtomicUsize, Ordering}, Arc, }, time::Duration, }; +use tokio::io::AsyncWriteExt; +use tokio_util::sync::CancellationToken; use tracing::{error, info, instrument, trace}; +use utils::backoff; +use uuid::{NoContext, Timestamp}; const PROXY_IO_BYTES_PER_CLIENT: &str = "proxy_io_bytes_per_client"; @@ -29,23 +44,97 @@ const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60); /// because we enrich the event with project_id in the control-plane endpoint. #[derive(Eq, Hash, PartialEq, Serialize, Deserialize, Debug, Clone)] pub struct Ids { - pub endpoint_id: EndpointId, - pub branch_id: BranchId, + pub endpoint_id: EndpointIdInt, + pub branch_id: BranchIdInt, +} + +pub trait MetricCounterRecorder { + /// Record that some bytes were sent from the proxy to the client + fn record_egress(&self, bytes: u64); + /// Record that some connections were opened + fn record_connection(&self, count: usize); +} + +trait MetricCounterReporter { + fn get_metrics(&mut self) -> (u64, usize); + fn move_metrics(&self) -> (u64, usize); +} + +#[derive(Debug)] +struct MetricBackupCounter { + transmitted: AtomicU64, + opened_connections: AtomicUsize, +} + +impl MetricCounterRecorder for MetricBackupCounter { + fn record_egress(&self, bytes: u64) { + self.transmitted.fetch_add(bytes, Ordering::AcqRel); + } + + fn record_connection(&self, count: usize) { + self.opened_connections.fetch_add(count, Ordering::AcqRel); + } +} + +impl MetricCounterReporter for MetricBackupCounter { + fn get_metrics(&mut self) -> (u64, usize) { + ( + *self.transmitted.get_mut(), + *self.opened_connections.get_mut(), + ) + } + fn move_metrics(&self) -> (u64, usize) { + ( + self.transmitted.swap(0, Ordering::AcqRel), + self.opened_connections.swap(0, Ordering::AcqRel), + ) + } } #[derive(Debug)] pub struct MetricCounter { transmitted: AtomicU64, opened_connections: AtomicUsize, + backup: Arc, } -impl MetricCounter { +impl MetricCounterRecorder for MetricCounter { /// Record that some bytes were sent from the proxy to the client - pub fn record_egress(&self, bytes: u64) { + fn record_egress(&self, bytes: u64) { self.transmitted.fetch_add(bytes, Ordering::AcqRel); + self.backup.record_egress(bytes); } + /// Record that some connections were opened + fn record_connection(&self, count: usize) { + self.opened_connections.fetch_add(count, Ordering::AcqRel); + self.backup.record_connection(count); + } +} + +impl MetricCounterReporter for MetricCounter { + fn get_metrics(&mut self) -> (u64, usize) { + ( + *self.transmitted.get_mut(), + *self.opened_connections.get_mut(), + ) + } + fn move_metrics(&self) -> (u64, usize) { + ( + self.transmitted.swap(0, Ordering::AcqRel), + self.opened_connections.swap(0, Ordering::AcqRel), + ) + } +} + +trait Clearable { /// extract the value that should be reported + fn should_report(self: &Arc) -> Option; + /// Determine whether the counter should be cleared from the global map. + fn should_clear(self: &mut Arc) -> bool; +} + +impl Clearable for C { fn should_report(self: &Arc) -> Option { // heuristic to see if the branch is still open // if a clone happens while we are observing, the heuristic will be incorrect. @@ -54,13 +143,12 @@ impl MetricCounter { // However, for the strong count to be 1 it must have occured that at one instant // all the endpoints were closed, so missing a report because the endpoints are closed is valid. let is_open = Arc::strong_count(self) > 1; - let opened = self.opened_connections.swap(0, Ordering::AcqRel); // update cached metrics eagerly, even if they can't get sent // (to avoid sending the same metrics twice) // see the relevant discussion on why to do so even if the status is not success: // https://github.com/neondatabase/neon/pull/4563#discussion_r1246710956 - let value = self.transmitted.swap(0, Ordering::AcqRel); + let (value, opened) = self.move_metrics(); // Our only requirement is that we report in every interval if there was an open connection // if there were no opened connections since, then we don't need to report @@ -70,15 +158,12 @@ impl MetricCounter { Some(value) } } - - /// Determine whether the counter should be cleared from the global map. fn should_clear(self: &mut Arc) -> bool { // we can't clear this entry if it's acquired elsewhere let Some(counter) = Arc::get_mut(self) else { return false; }; - let opened = *counter.opened_connections.get_mut(); - let value = *counter.transmitted.get_mut(); + let (opened, value) = counter.get_metrics(); // clear if there's no data to report value == 0 && opened == 0 } @@ -90,11 +175,26 @@ type FastHasher = std::hash::BuildHasherDefault; #[derive(Default)] pub struct Metrics { endpoints: DashMap, FastHasher>, + backup_endpoints: DashMap, FastHasher>, } impl Metrics { /// Register a new byte metrics counter for this endpoint pub fn register(&self, ids: Ids) -> Arc { + let backup = if let Some(entry) = self.backup_endpoints.get(&ids) { + entry.clone() + } else { + self.backup_endpoints + .entry(ids.clone()) + .or_insert_with(|| { + Arc::new(MetricBackupCounter { + transmitted: AtomicU64::new(0), + opened_connections: AtomicUsize::new(0), + }) + }) + .clone() + }; + let entry = if let Some(entry) = self.endpoints.get(&ids) { entry.clone() } else { @@ -104,12 +204,13 @@ impl Metrics { Arc::new(MetricCounter { transmitted: AtomicU64::new(0), opened_connections: AtomicUsize::new(0), + backup: backup.clone(), }) }) .clone() }; - entry.opened_connections.fetch_add(1, Ordering::AcqRel); + entry.record_connection(1); entry } } @@ -132,7 +233,7 @@ pub async fn task_main(config: &MetricCollectionConfig) -> anyhow::Result anyhow::Result, - now: DateTime, -) { - info!( - "starting collect_metrics_iteration. metric_collection_endpoint: {}", - metric_collection_endpoint - ); - +fn collect_and_clear_metrics( + endpoints: &DashMap, FastHasher>, +) -> Vec<(Ids, u64)> { let mut metrics_to_clear = Vec::new(); - let metrics_to_send: Vec<(Ids, u64)> = metrics - .endpoints + let metrics_to_send: Vec<(Ids, u64)> = endpoints .iter() .filter_map(|counter| { let key = counter.key().clone(); @@ -173,33 +262,71 @@ async fn collect_metrics_iteration( }) .collect(); + for metric in metrics_to_clear { + match endpoints.entry(metric) { + Entry::Occupied(mut counter) => { + if counter.get_mut().should_clear() { + counter.remove_entry(); + } + } + Entry::Vacant(_) => {} + } + } + metrics_to_send +} + +fn create_event_chunks<'a>( + metrics_to_send: &'a [(Ids, u64)], + hostname: &'a str, + prev: DateTime, + now: DateTime, + chunk_size: usize, +) -> impl Iterator>> + 'a { + // Split into chunks of 1000 metrics to avoid exceeding the max request size + metrics_to_send + .chunks(chunk_size) + .map(move |chunk| EventChunk { + events: chunk + .iter() + .map(|(ids, value)| Event { + kind: EventType::Incremental { + start_time: prev, + stop_time: now, + }, + metric: PROXY_IO_BYTES_PER_CLIENT, + idempotency_key: idempotency_key(hostname), + value: *value, + extra: ids.clone(), + }) + .collect(), + }) +} + +#[instrument(skip_all)] +async fn collect_metrics_iteration( + endpoints: &DashMap, FastHasher>, + client: &http::ClientWithMiddleware, + metric_collection_endpoint: &reqwest::Url, + hostname: &str, + prev: DateTime, + now: DateTime, +) { + info!( + "starting collect_metrics_iteration. metric_collection_endpoint: {}", + metric_collection_endpoint + ); + + let metrics_to_send = collect_and_clear_metrics(endpoints); + if metrics_to_send.is_empty() { trace!("no new metrics to send"); } // Send metrics. - // Split into chunks of 1000 metrics to avoid exceeding the max request size - for chunk in metrics_to_send.chunks(CHUNK_SIZE) { - let events = chunk - .iter() - .map(|(ids, value)| Event { - kind: EventType::Incremental { - start_time: prev, - stop_time: now, - }, - metric: PROXY_IO_BYTES_PER_CLIENT, - idempotency_key: idempotency_key(hostname), - value: *value, - extra: Ids { - endpoint_id: ids.endpoint_id.clone(), - branch_id: ids.branch_id.clone(), - }, - }) - .collect(); - + for chunk in create_event_chunks(&metrics_to_send, hostname, prev, now, CHUNK_SIZE) { let res = client .post(metric_collection_endpoint.clone()) - .json(&EventChunk { events }) + .json(&chunk) .send() .await; @@ -213,23 +340,142 @@ async fn collect_metrics_iteration( if !res.status().is_success() { error!("metrics endpoint refused the sent metrics: {:?}", res); - for metric in chunk.iter().filter(|(_, value)| *value > (1u64 << 40)) { + for metric in chunk.events.iter().filter(|e| e.value > (1u64 << 40)) { // Report if the metric value is suspiciously large error!("potentially abnormal metric value: {:?}", metric); } } } +} - for metric in metrics_to_clear { - match metrics.endpoints.entry(metric) { - Entry::Occupied(mut counter) => { - if counter.get_mut().should_clear() { - counter.remove_entry(); - } - } - Entry::Vacant(_) => {} +pub async fn task_backup( + backup_config: &MetricBackupCollectionConfig, + cancellation_token: CancellationToken, +) -> anyhow::Result<()> { + info!("metrics backup config: {backup_config:?}"); + scopeguard::defer! { + info!("metrics backup has shut down"); + } + // Even if the remote storage is not configured, we still want to clear the metrics. + let storage = backup_config + .remote_storage_config + .as_ref() + .map(|config| GenericRemoteStorage::from_config(config).context("remote storage init")) + .transpose()?; + let mut ticker = tokio::time::interval(backup_config.interval); + let mut prev = Utc::now(); + let hostname = hostname::get()?.as_os_str().to_string_lossy().into_owned(); + loop { + select(pin!(ticker.tick()), pin!(cancellation_token.cancelled())).await; + let now = Utc::now(); + collect_metrics_backup_iteration( + &USAGE_METRICS.backup_endpoints, + &storage, + &hostname, + prev, + now, + backup_config.chunk_size, + ) + .await; + + prev = now; + if cancellation_token.is_cancelled() { + info!("metrics backup has been cancelled"); + break; } } + Ok(()) +} + +#[instrument(skip_all)] +async fn collect_metrics_backup_iteration( + endpoints: &DashMap, FastHasher>, + storage: &Option, + hostname: &str, + prev: DateTime, + now: DateTime, + chunk_size: usize, +) { + let year = now.year(); + let month = now.month(); + let day = now.day(); + let hour = now.hour(); + let minute = now.minute(); + let second = now.second(); + let cancel = CancellationToken::new(); + + info!("starting collect_metrics_backup_iteration"); + + let metrics_to_send = collect_and_clear_metrics(endpoints); + + if metrics_to_send.is_empty() { + trace!("no new metrics to send"); + } + + // Send metrics. + for chunk in create_event_chunks(&metrics_to_send, hostname, prev, now, chunk_size) { + let real_now = Utc::now(); + let id = uuid::Uuid::new_v7(Timestamp::from_unix( + NoContext, + real_now.second().into(), + real_now.nanosecond(), + )); + let path = format!("year={year:04}/month={month:02}/day={day:02}/{hour:02}:{minute:02}:{second:02}Z_{id}.json.gz"); + let remote_path = match RemotePath::from_string(&path) { + Ok(remote_path) => remote_path, + Err(e) => { + error!("failed to create remote path from str {path}: {:?}", e); + continue; + } + }; + + let res = upload_events_chunk(storage, chunk, &remote_path, &cancel).await; + + if let Err(e) = res { + error!( + "failed to upload consumption events to remote storage: {:?}", + e + ); + } + } +} + +async fn upload_events_chunk( + storage: &Option, + chunk: EventChunk<'_, Event>, + remote_path: &RemotePath, + cancel: &CancellationToken, +) -> anyhow::Result<()> { + let storage = match storage { + Some(storage) => storage, + None => { + error!("no remote storage configured"); + return Ok(()); + } + }; + let data = serde_json::to_vec(&chunk).context("serialize metrics")?; + let mut encoder = GzipEncoder::new(Vec::new()); + encoder.write_all(&data).await.context("compress metrics")?; + encoder.shutdown().await.context("compress metrics")?; + let compressed_data: Bytes = encoder.get_ref().clone().into(); + backoff::retry( + || async { + let stream = futures::stream::once(futures::future::ready(Ok(compressed_data.clone()))); + storage + .upload(stream, compressed_data.len(), remote_path, None, cancel) + .await + }, + TimeoutOrCancel::caused_by_cancel, + FAILED_UPLOAD_WARN_THRESHOLD, + FAILED_UPLOAD_MAX_RETRIES, + "request_data_upload", + cancel, + ) + .await + .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel)) + .and_then(|x| x) + .context("request_data_upload")?; + Ok(()) } #[cfg(test)] @@ -248,8 +494,8 @@ mod tests { }; use url::Url; - use super::{collect_metrics_iteration, Ids, Metrics}; - use crate::{http, rate_limiter::RateLimiterConfig}; + use super::*; + use crate::{http, rate_limiter::RateLimiterConfig, BranchId, EndpointId}; #[tokio::test] async fn metrics() { @@ -284,18 +530,19 @@ mod tests { let now = Utc::now(); // no counters have been registered - collect_metrics_iteration(&metrics, &client, &endpoint, "foo", now, now).await; + collect_metrics_iteration(&metrics.endpoints, &client, &endpoint, "foo", now, now).await; let r = std::mem::take(&mut *reports2.lock().unwrap()); assert!(r.is_empty()); // register a new counter + let counter = metrics.register(Ids { - endpoint_id: "e1".into(), - branch_id: "b1".into(), + endpoint_id: (&EndpointId::from("e1")).into(), + branch_id: (&BranchId::from("b1")).into(), }); // the counter should be observed despite 0 egress - collect_metrics_iteration(&metrics, &client, &endpoint, "foo", now, now).await; + collect_metrics_iteration(&metrics.endpoints, &client, &endpoint, "foo", now, now).await; let r = std::mem::take(&mut *reports2.lock().unwrap()); assert_eq!(r.len(), 1); assert_eq!(r[0].events.len(), 1); @@ -305,7 +552,7 @@ mod tests { counter.record_egress(1); // egress should be observered - collect_metrics_iteration(&metrics, &client, &endpoint, "foo", now, now).await; + collect_metrics_iteration(&metrics.endpoints, &client, &endpoint, "foo", now, now).await; let r = std::mem::take(&mut *reports2.lock().unwrap()); assert_eq!(r.len(), 1); assert_eq!(r[0].events.len(), 1); @@ -315,11 +562,19 @@ mod tests { drop(counter); // we do not observe the counter - collect_metrics_iteration(&metrics, &client, &endpoint, "foo", now, now).await; + collect_metrics_iteration(&metrics.endpoints, &client, &endpoint, "foo", now, now).await; let r = std::mem::take(&mut *reports2.lock().unwrap()); assert!(r.is_empty()); // counter is unregistered assert!(metrics.endpoints.is_empty()); + + collect_metrics_backup_iteration(&metrics.backup_endpoints, &None, "foo", now, now, 1000) + .await; + assert!(!metrics.backup_endpoints.is_empty()); + collect_metrics_backup_iteration(&metrics.backup_endpoints, &None, "foo", now, now, 1000) + .await; + // backup counter is unregistered after the second iteration + assert!(metrics.backup_endpoints.is_empty()); } } diff --git a/pyproject.toml b/pyproject.toml index e347d47cbf..156f135062 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -94,4 +94,5 @@ select = [ "I", # isort "W", # pycodestyle "B", # bugbear + "UP032", # f-string ] diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index cb4a1def1f..c8b732fee1 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -33,6 +33,7 @@ once_cell.workspace = true parking_lot.workspace = true postgres.workspace = true postgres-protocol.workspace = true +rand.workspace = true regex.workspace = true scopeguard.workspace = true reqwest = { workspace = true, features = ["json"] } diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 3c4c81e499..e53ccaeb3d 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -28,7 +28,7 @@ use utils::pid_file; use metrics::set_build_info_metric; use safekeeper::defaults::{ DEFAULT_HEARTBEAT_TIMEOUT, DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_MAX_OFFLOADER_LAG_BYTES, - DEFAULT_PG_LISTEN_ADDR, + DEFAULT_PARTIAL_BACKUP_TIMEOUT, DEFAULT_PG_LISTEN_ADDR, }; use safekeeper::wal_service; use safekeeper::GlobalTimelines; @@ -170,6 +170,13 @@ struct Args { /// still needed for existing replication connection. #[arg(long)] walsenders_keep_horizon: bool, + /// Enable partial backup. If disabled, safekeeper will not upload partial + /// segments to remote storage. + #[arg(long)] + partial_backup_enabled: bool, + /// Controls how long backup will wait until uploading the partial segment. + #[arg(long, value_parser = humantime::parse_duration, default_value = DEFAULT_PARTIAL_BACKUP_TIMEOUT, verbatim_doc_comment)] + partial_backup_timeout: Duration, } // Like PathBufValueParser, but allows empty string. @@ -300,6 +307,8 @@ async fn main() -> anyhow::Result<()> { http_auth, current_thread_runtime: args.current_thread_runtime, walsenders_keep_horizon: args.walsenders_keep_horizon, + partial_backup_enabled: args.partial_backup_enabled, + partial_backup_timeout: args.partial_backup_timeout, }; // initialize sentry if SENTRY_DSN is provided @@ -365,6 +374,8 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> { let (wal_backup_launcher_tx, wal_backup_launcher_rx) = mpsc::channel(100); + wal_backup::init_remote_storage(&conf); + // Keep handles to main tasks to die if any of them disappears. let mut tasks_handles: FuturesUnordered> = FuturesUnordered::new(); diff --git a/safekeeper/src/control_file.rs b/safekeeper/src/control_file.rs index d822c87c0e..fe9f2e6899 100644 --- a/safekeeper/src/control_file.rs +++ b/safekeeper/src/control_file.rs @@ -20,7 +20,7 @@ use utils::{bin_ser::LeSer, id::TenantTimelineId}; use crate::SafeKeeperConf; pub const SK_MAGIC: u32 = 0xcafeceefu32; -pub const SK_FORMAT_VERSION: u32 = 7; +pub const SK_FORMAT_VERSION: u32 = 8; // contains persistent metadata for safekeeper const CONTROL_FILE_NAME: &str = "safekeeper.control"; diff --git a/safekeeper/src/control_file_upgrade.rs b/safekeeper/src/control_file_upgrade.rs index 2fd719326d..8f4dfe9b43 100644 --- a/safekeeper/src/control_file_upgrade.rs +++ b/safekeeper/src/control_file_upgrade.rs @@ -2,6 +2,7 @@ use crate::{ safekeeper::{AcceptorState, PgUuid, ServerInfo, Term, TermHistory, TermLsn}, state::{PersistedPeers, TimelinePersistentState}, + wal_backup_partial, }; use anyhow::{bail, Result}; use pq_proto::SystemId; @@ -138,6 +139,50 @@ pub struct SafeKeeperStateV4 { pub peers: PersistedPeers, } +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct SafeKeeperStateV7 { + #[serde(with = "hex")] + pub tenant_id: TenantId, + #[serde(with = "hex")] + pub timeline_id: TimelineId, + /// persistent acceptor state + pub acceptor_state: AcceptorState, + /// information about server + pub server: ServerInfo, + /// Unique id of the last *elected* proposer we dealt with. Not needed + /// for correctness, exists for monitoring purposes. + #[serde(with = "hex")] + pub proposer_uuid: PgUuid, + /// Since which LSN this timeline generally starts. Safekeeper might have + /// joined later. + pub timeline_start_lsn: Lsn, + /// Since which LSN safekeeper has (had) WAL for this timeline. + /// All WAL segments next to one containing local_start_lsn are + /// filled with data from the beginning. + pub local_start_lsn: Lsn, + /// Part of WAL acknowledged by quorum *and available locally*. Always points + /// to record boundary. + pub commit_lsn: Lsn, + /// LSN that points to the end of the last backed up segment. Useful to + /// persist to avoid finding out offloading progress on boot. + pub backup_lsn: Lsn, + /// Minimal LSN which may be needed for recovery of some safekeeper (end_lsn + /// of last record streamed to everyone). Persisting it helps skipping + /// recovery in walproposer, generally we compute it from peers. In + /// walproposer proto called 'truncate_lsn'. Updates are currently drived + /// only by walproposer. + pub peer_horizon_lsn: Lsn, + /// LSN of the oldest known checkpoint made by pageserver and successfully + /// pushed to s3. We don't remove WAL beyond it. Persisted only for + /// informational purposes, we receive it from pageserver (or broker). + pub remote_consistent_lsn: Lsn, + // Peers and their state as we remember it. Knowing peers themselves is + // fundamental; but state is saved here only for informational purposes and + // obviously can be stale. (Currently not saved at all, but let's provision + // place to have less file version upgrades). + pub peers: PersistedPeers, +} + pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result { // migrate to storing full term history if version == 1 { @@ -167,6 +212,7 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result Result Result Result Result>, pub current_thread_runtime: bool, pub walsenders_keep_horizon: bool, + pub partial_backup_enabled: bool, + pub partial_backup_timeout: Duration, } impl SafeKeeperConf { @@ -123,6 +127,8 @@ impl SafeKeeperConf { max_offloader_lag_bytes: defaults::DEFAULT_MAX_OFFLOADER_LAG_BYTES, current_thread_runtime: false, walsenders_keep_horizon: false, + partial_backup_enabled: false, + partial_backup_timeout: Duration::from_secs(0), } } } diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs index e541527b6a..28ae042bb3 100644 --- a/safekeeper/src/metrics.rs +++ b/safekeeper/src/metrics.rs @@ -147,6 +147,21 @@ pub static RECEIVED_PS_FEEDBACKS: Lazy = Lazy::new(|| { ) .expect("Failed to register safekeeper_received_ps_feedbacks_total counter") }); +pub static PARTIAL_BACKUP_UPLOADS: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "safekeeper_partial_backup_uploads_total", + "Number of partial backup uploads to the S3", + &["result"] + ) + .expect("Failed to register safekeeper_partial_backup_uploads_total counter") +}); +pub static PARTIAL_BACKUP_UPLOADED_BYTES: Lazy = Lazy::new(|| { + register_int_counter!( + "safekeeper_partial_backup_uploaded_bytes_total", + "Number of bytes uploaded to the S3 during partial backup" + ) + .expect("Failed to register safekeeper_partial_backup_uploaded_bytes_total counter") +}); pub const LABEL_UNKNOWN: &str = "unknown"; diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index d7c8fa6955..f2ee0403eb 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -1221,6 +1221,7 @@ mod tests { commit_lsn: Lsn(1234567600), }, )]), + partial_backup: crate::wal_backup_partial::State::default(), }; let ser = state.ser().unwrap(); @@ -1266,6 +1267,8 @@ mod tests { 0x2a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x70, 0x02, 0x96, 0x49, 0x00, 0x00, 0x00, 0x00, 0xb0, 0x01, 0x96, 0x49, 0x00, 0x00, 0x00, 0x00, + // partial_backup + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, ]; assert_eq!(Hex(&ser), Hex(&expected)); diff --git a/safekeeper/src/state.rs b/safekeeper/src/state.rs index 82f7954051..be5e516296 100644 --- a/safekeeper/src/state.rs +++ b/safekeeper/src/state.rs @@ -13,6 +13,7 @@ use utils::{ use crate::{ control_file, safekeeper::{AcceptorState, PersistedPeerInfo, PgUuid, ServerInfo, TermHistory}, + wal_backup_partial::{self}, }; /// Persistent information stored on safekeeper node about timeline. @@ -54,11 +55,14 @@ pub struct TimelinePersistentState { /// pushed to s3. We don't remove WAL beyond it. Persisted only for /// informational purposes, we receive it from pageserver (or broker). pub remote_consistent_lsn: Lsn, - // Peers and their state as we remember it. Knowing peers themselves is - // fundamental; but state is saved here only for informational purposes and - // obviously can be stale. (Currently not saved at all, but let's provision - // place to have less file version upgrades). + /// Peers and their state as we remember it. Knowing peers themselves is + /// fundamental; but state is saved here only for informational purposes and + /// obviously can be stale. (Currently not saved at all, but let's provision + /// place to have less file version upgrades). pub peers: PersistedPeers, + /// Holds names of partial segments uploaded to remote storage. Used to + /// clean up old objects without leaving garbage in remote storage. + pub partial_backup: wal_backup_partial::State, } #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] @@ -93,6 +97,7 @@ impl TimelinePersistentState { .map(|p| (*p, PersistedPeerInfo::new())) .collect(), ), + partial_backup: wal_backup_partial::State::default(), } } diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 4901b86acf..64f764f191 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -38,7 +38,7 @@ use crate::{control_file, safekeeper::UNKNOWN_SERVER_VERSION}; use crate::metrics::FullTimelineInfo; use crate::wal_storage::Storage as wal_storage_iface; -use crate::{debug_dump, wal_storage}; +use crate::{debug_dump, wal_backup_partial, wal_storage}; use crate::{GlobalTimelines, SafeKeeperConf}; /// Things safekeeper should know about timeline state on peers. @@ -503,6 +503,9 @@ impl Timeline { if conf.peer_recovery_enabled { tokio::spawn(recovery_main(self.clone(), conf.clone())); } + if conf.is_wal_backup_enabled() && conf.partial_backup_enabled { + tokio::spawn(wal_backup_partial::main_task(self.clone(), conf.clone())); + } } /// Delete timeline from disk completely, by removing timeline directory. @@ -667,8 +670,8 @@ impl Timeline { term_flush_lsn = TermLsn::from((shared_state.sk.get_term(), shared_state.sk.flush_lsn())); } - self.commit_lsn_watch_tx.send(commit_lsn)?; self.term_flush_lsn_watch_tx.send(term_flush_lsn)?; + self.commit_lsn_watch_tx.send(commit_lsn)?; Ok(rmsg) } diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index 944d80f777..e3f6a606a0 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -18,7 +18,7 @@ use std::time::Duration; use postgres_ffi::v14::xlog_utils::XLogSegNoOffsetToRecPtr; use postgres_ffi::XLogFileName; use postgres_ffi::{XLogSegNo, PG_TLI}; -use remote_storage::{GenericRemoteStorage, RemotePath}; +use remote_storage::{GenericRemoteStorage, RemotePath, StorageMetadata}; use tokio::fs::File; use tokio::select; @@ -180,6 +180,16 @@ fn get_configured_remote_storage() -> &'static GenericRemoteStorage { .unwrap() } +pub fn init_remote_storage(conf: &SafeKeeperConf) { + // TODO: refactor REMOTE_STORAGE to avoid using global variables, and provide + // dependencies to all tasks instead. + REMOTE_STORAGE.get_or_init(|| { + conf.remote_storage + .as_ref() + .map(|c| GenericRemoteStorage::from_config(c).expect("failed to create remote storage")) + }); +} + const CHECK_TASKS_INTERVAL_MSEC: u64 = 1000; /// Sits on wal_backup_launcher_rx and starts/stops per timeline wal backup @@ -194,14 +204,6 @@ pub async fn wal_backup_launcher_task_main( conf.remote_storage ); - let conf_ = conf.clone(); - REMOTE_STORAGE.get_or_init(|| { - conf_ - .remote_storage - .as_ref() - .map(|c| GenericRemoteStorage::from_config(c).expect("failed to create remote storage")) - }); - // Presence in this map means launcher is aware s3 offloading is needed for // the timeline, but task is started only if it makes sense for to offload // from this safekeeper. @@ -518,6 +520,35 @@ async fn backup_object( .await } +pub(crate) async fn backup_partial_segment( + source_file: &Utf8Path, + target_file: &RemotePath, + size: usize, +) -> Result<()> { + let storage = get_configured_remote_storage(); + + let file = File::open(&source_file) + .await + .with_context(|| format!("Failed to open file {source_file:?} for wal backup"))?; + + // limiting the file to read only the first `size` bytes + let limited_file = tokio::io::AsyncReadExt::take(file, size as u64); + + let file = tokio_util::io::ReaderStream::with_capacity(limited_file, BUFFER_SIZE); + + let cancel = CancellationToken::new(); + + storage + .upload( + file, + size, + target_file, + Some(StorageMetadata::from([("sk_type", "partial_segment")])), + &cancel, + ) + .await +} + pub async fn read_object( file_path: &RemotePath, offset: u64, @@ -604,6 +635,13 @@ pub async fn delete_timeline(ttid: &TenantTimelineId) -> Result<()> { Ok(()) } +/// Used by wal_backup_partial. +pub async fn delete_objects(paths: &[RemotePath]) -> Result<()> { + let cancel = CancellationToken::new(); // not really used + let storage = get_configured_remote_storage(); + storage.delete_objects(paths, &cancel).await +} + /// Copy segments from one timeline to another. Used in copy_timeline. pub async fn copy_s3_segments( wal_seg_size: usize, diff --git a/safekeeper/src/wal_backup_partial.rs b/safekeeper/src/wal_backup_partial.rs new file mode 100644 index 0000000000..200096ac5c --- /dev/null +++ b/safekeeper/src/wal_backup_partial.rs @@ -0,0 +1,407 @@ +//! Safekeeper timeline has a background task which is subscribed to `commit_lsn` +//! and `flush_lsn` updates. After the partial segment was updated (`flush_lsn` +//! was changed), the segment will be uploaded to S3 in about 15 minutes. +//! +//! The filename format for partial segments is +//! `Segment_Term_Flush_Commit_skNN.partial`, where: +//! - `Segment` – the segment name, like `000000010000000000000001` +//! - `Term` – current term +//! - `Flush` – flush_lsn in hex format `{:016X}`, e.g. `00000000346BC568` +//! - `Commit` – commit_lsn in the same hex format +//! - `NN` – safekeeper_id, like `1` +//! +//! The full object name example: +//! `000000010000000000000002_2_0000000002534868_0000000002534410_sk1.partial` +//! +//! Each safekeeper will keep info about remote partial segments in its control +//! file. Code updates state in the control file before doing any S3 operations. +//! This way control file stores information about all potentially existing +//! remote partial segments and can clean them up after uploading a newer version. + +use std::sync::Arc; + +use camino::Utf8PathBuf; +use postgres_ffi::{XLogFileName, XLogSegNo, PG_TLI}; +use rand::Rng; +use remote_storage::RemotePath; +use serde::{Deserialize, Serialize}; + +use tracing::{debug, error, info, instrument}; +use utils::lsn::Lsn; + +use crate::{ + metrics::{PARTIAL_BACKUP_UPLOADED_BYTES, PARTIAL_BACKUP_UPLOADS}, + safekeeper::Term, + timeline::Timeline, + wal_backup, SafeKeeperConf, +}; + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub enum UploadStatus { + /// Upload is in progress + InProgress, + /// Upload is finished + Uploaded, + /// Deletion is in progress + Deleting, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct PartialRemoteSegment { + pub status: UploadStatus, + pub name: String, + pub commit_lsn: Lsn, + pub flush_lsn: Lsn, + pub term: Term, +} + +impl PartialRemoteSegment { + fn eq_without_status(&self, other: &Self) -> bool { + self.name == other.name + && self.commit_lsn == other.commit_lsn + && self.flush_lsn == other.flush_lsn + && self.term == other.term + } +} + +// NB: these structures are a part of a control_file, you can't change them without +// changing the control file format version. +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Default)] +pub struct State { + pub segments: Vec, +} + +impl State { + /// Find an Uploaded segment. There should be only one Uploaded segment at a time. + fn uploaded_segment(&self) -> Option { + self.segments + .iter() + .find(|seg| seg.status == UploadStatus::Uploaded) + .cloned() + } +} + +struct PartialBackup { + wal_seg_size: usize, + tli: Arc, + conf: SafeKeeperConf, + local_prefix: Utf8PathBuf, + remote_prefix: Utf8PathBuf, + + state: State, +} + +// Read-only methods for getting segment names +impl PartialBackup { + fn segno(&self, lsn: Lsn) -> XLogSegNo { + lsn.segment_number(self.wal_seg_size) + } + + fn segment_name(&self, segno: u64) -> String { + XLogFileName(PG_TLI, segno, self.wal_seg_size) + } + + fn remote_segment_name( + &self, + segno: u64, + term: u64, + commit_lsn: Lsn, + flush_lsn: Lsn, + ) -> String { + format!( + "{}_{}_{:016X}_{:016X}_sk{}.partial", + self.segment_name(segno), + term, + flush_lsn.0, + commit_lsn.0, + self.conf.my_id.0, + ) + } + + fn local_segment_name(&self, segno: u64) -> String { + format!("{}.partial", self.segment_name(segno)) + } +} + +impl PartialBackup { + /// Takes a lock to read actual safekeeper state and returns a segment that should be uploaded. + async fn prepare_upload(&self) -> PartialRemoteSegment { + // this operation takes a lock to get the actual state + let sk_info = self.tli.get_safekeeper_info(&self.conf).await; + let flush_lsn = Lsn(sk_info.flush_lsn); + let commit_lsn = Lsn(sk_info.commit_lsn); + let term = sk_info.term; + let segno = self.segno(flush_lsn); + + let name = self.remote_segment_name(segno, term, commit_lsn, flush_lsn); + + PartialRemoteSegment { + status: UploadStatus::InProgress, + name, + commit_lsn, + flush_lsn, + term, + } + } + + /// Reads segment from disk and uploads it to the remote storage. + async fn upload_segment(&mut self, prepared: PartialRemoteSegment) -> anyhow::Result<()> { + let flush_lsn = prepared.flush_lsn; + let segno = self.segno(flush_lsn); + + // We're going to backup bytes from the start of the segment up to flush_lsn. + let backup_bytes = flush_lsn.segment_offset(self.wal_seg_size); + + let local_path = self.local_prefix.join(self.local_segment_name(segno)); + let remote_path = RemotePath::new(self.remote_prefix.join(&prepared.name).as_ref())?; + + // Upload first `backup_bytes` bytes of the segment to the remote storage. + wal_backup::backup_partial_segment(&local_path, &remote_path, backup_bytes).await?; + PARTIAL_BACKUP_UPLOADED_BYTES.inc_by(backup_bytes as u64); + + // We uploaded the segment, now let's verify that the data is still actual. + // If the term changed, we cannot guarantee the validity of the uploaded data. + // If the term is the same, we know the data is not corrupted. + let sk_info = self.tli.get_safekeeper_info(&self.conf).await; + if sk_info.term != prepared.term { + anyhow::bail!("term changed during upload"); + } + assert!(prepared.commit_lsn <= Lsn(sk_info.commit_lsn)); + assert!(prepared.flush_lsn <= Lsn(sk_info.flush_lsn)); + + Ok(()) + } + + /// Write new state to disk. If in-memory and on-disk states diverged, returns an error. + async fn commit_state(&mut self, new_state: State) -> anyhow::Result<()> { + self.tli + .map_control_file(|cf| { + if cf.partial_backup != self.state { + let memory = self.state.clone(); + self.state = cf.partial_backup.clone(); + anyhow::bail!( + "partial backup state diverged, memory={:?}, disk={:?}", + memory, + cf.partial_backup + ); + } + + cf.partial_backup = new_state.clone(); + Ok(()) + }) + .await?; + // update in-memory state + self.state = new_state; + Ok(()) + } + + /// Upload the latest version of the partial segment and garbage collect older versions. + #[instrument(name = "upload", skip_all, fields(name = %prepared.name))] + async fn do_upload(&mut self, prepared: &PartialRemoteSegment) -> anyhow::Result<()> { + info!("starting upload {:?}", prepared); + + let state_0 = self.state.clone(); + let state_1 = { + let mut state = state_0.clone(); + state.segments.push(prepared.clone()); + state + }; + + // we're going to upload a new segment, let's write it to disk to make GC later + self.commit_state(state_1).await?; + + self.upload_segment(prepared.clone()).await?; + + let state_2 = { + let mut state = state_0.clone(); + for seg in state.segments.iter_mut() { + seg.status = UploadStatus::Deleting; + } + let mut actual_remote_segment = prepared.clone(); + actual_remote_segment.status = UploadStatus::Uploaded; + state.segments.push(actual_remote_segment); + state + }; + + // we've uploaded new segment, it's actual, all other segments should be GCed + self.commit_state(state_2).await?; + self.gc().await?; + + Ok(()) + } + + /// Delete all non-Uploaded segments from the remote storage. There should be only one + /// Uploaded segment at a time. + #[instrument(name = "gc", skip_all)] + async fn gc(&mut self) -> anyhow::Result<()> { + let mut segments_to_delete = vec![]; + + let new_segments: Vec = self + .state + .segments + .iter() + .filter_map(|seg| { + if seg.status == UploadStatus::Uploaded { + Some(seg.clone()) + } else { + segments_to_delete.push(seg.name.clone()); + None + } + }) + .collect(); + + info!("deleting objects: {:?}", segments_to_delete); + let mut objects_to_delete = vec![]; + for seg in segments_to_delete.iter() { + let remote_path = RemotePath::new(self.remote_prefix.join(seg).as_ref())?; + objects_to_delete.push(remote_path); + } + + // removing segments from remote storage + wal_backup::delete_objects(&objects_to_delete).await?; + + // now we can update the state on disk + let new_state = { + let mut state = self.state.clone(); + state.segments = new_segments; + state + }; + self.commit_state(new_state).await?; + + Ok(()) + } +} + +#[instrument(name = "Partial backup", skip_all, fields(ttid = %tli.ttid))] +pub async fn main_task(tli: Arc, conf: SafeKeeperConf) { + debug!("started"); + let await_duration = conf.partial_backup_timeout; + + let mut cancellation_rx = match tli.get_cancellation_rx() { + Ok(rx) => rx, + Err(_) => { + info!("timeline canceled during task start"); + return; + } + }; + + // sleep for random time to avoid thundering herd + { + let randf64 = rand::thread_rng().gen_range(0.0..1.0); + let sleep_duration = await_duration.mul_f64(randf64); + tokio::time::sleep(sleep_duration).await; + } + + let (_, persistent_state) = tli.get_state().await; + let mut commit_lsn_rx = tli.get_commit_lsn_watch_rx(); + let mut flush_lsn_rx = tli.get_term_flush_lsn_watch_rx(); + let wal_seg_size = tli.get_wal_seg_size().await; + + let local_prefix = tli.timeline_dir.clone(); + let remote_prefix = match tli.timeline_dir.strip_prefix(&conf.workdir) { + Ok(path) => path.to_owned(), + Err(e) => { + error!("failed to strip workspace dir prefix: {:?}", e); + return; + } + }; + + let mut backup = PartialBackup { + wal_seg_size, + tli, + state: persistent_state.partial_backup, + conf, + local_prefix, + remote_prefix, + }; + + debug!("state: {:?}", backup.state); + + 'outer: loop { + // wait until we have something to upload + let uploaded_segment = backup.state.uploaded_segment(); + if let Some(seg) = &uploaded_segment { + // if we already uploaded something, wait until we have something new + while flush_lsn_rx.borrow().lsn == seg.flush_lsn + && *commit_lsn_rx.borrow() == seg.commit_lsn + && flush_lsn_rx.borrow().term == seg.term + { + tokio::select! { + _ = cancellation_rx.changed() => { + info!("timeline canceled"); + return; + } + _ = commit_lsn_rx.changed() => {} + _ = flush_lsn_rx.changed() => {} + } + } + } + + // if we don't have any data and zero LSNs, wait for something + while flush_lsn_rx.borrow().lsn == Lsn(0) { + tokio::select! { + _ = cancellation_rx.changed() => { + info!("timeline canceled"); + return; + } + _ = flush_lsn_rx.changed() => {} + } + } + + // fixing the segno and waiting some time to prevent reuploading the same segment too often + let pending_segno = backup.segno(flush_lsn_rx.borrow().lsn); + let timeout = tokio::time::sleep(await_duration); + tokio::pin!(timeout); + let mut timeout_expired = false; + + // waiting until timeout expires OR segno changes + 'inner: loop { + tokio::select! { + _ = cancellation_rx.changed() => { + info!("timeline canceled"); + return; + } + _ = commit_lsn_rx.changed() => {} + _ = flush_lsn_rx.changed() => { + let segno = backup.segno(flush_lsn_rx.borrow().lsn); + if segno != pending_segno { + // previous segment is no longer partial, aborting the wait + break 'inner; + } + } + _ = &mut timeout => { + // timeout expired, now we are ready for upload + timeout_expired = true; + break 'inner; + } + } + } + + if !timeout_expired { + // likely segno has changed, let's try again in the next iteration + continue 'outer; + } + + let prepared = backup.prepare_upload().await; + if let Some(seg) = &uploaded_segment { + if seg.eq_without_status(&prepared) { + // we already uploaded this segment, nothing to do + continue 'outer; + } + } + + match backup.do_upload(&prepared).await { + Ok(()) => { + debug!( + "uploaded {} up to flush_lsn {}", + prepared.name, prepared.flush_lsn + ); + PARTIAL_BACKUP_UPLOADS.with_label_values(&["ok"]).inc(); + } + Err(e) => { + info!("failed to upload {}: {:#}", prepared.name, e); + PARTIAL_BACKUP_UPLOADS.with_label_values(&["error"]).inc(); + } + } + } +} diff --git a/safekeeper/tests/walproposer_sim/safekeeper.rs b/safekeeper/tests/walproposer_sim/safekeeper.rs index e3aaf5d391..bc21c4d765 100644 --- a/safekeeper/tests/walproposer_sim/safekeeper.rs +++ b/safekeeper/tests/walproposer_sim/safekeeper.rs @@ -176,6 +176,8 @@ pub fn run_server(os: NodeOs, disk: Arc) -> Result<()> { http_auth: None, current_thread_runtime: false, walsenders_keep_horizon: false, + partial_backup_enabled: false, + partial_backup_timeout: Duration::from_secs(0), }; let mut global = GlobalMap::new(disk, conf.clone())?; diff --git a/scripts/export_import_between_pageservers.py b/scripts/export_import_between_pageservers.py index 980f343047..84b69cb36a 100755 --- a/scripts/export_import_between_pageservers.py +++ b/scripts/export_import_between_pageservers.py @@ -64,14 +64,14 @@ def subprocess_capture(capture_dir: str, cmd: List[str], **kwargs: Any) -> str: Returns basepath for files with captured output. """ assert isinstance(cmd, list) - base = os.path.basename(cmd[0]) + "_{}".format(global_counter()) + base = f"{os.path.basename(cmd[0])}_{global_counter()}" basepath = os.path.join(capture_dir, base) stdout_filename = basepath + ".stdout" stderr_filename = basepath + ".stderr" with open(stdout_filename, "w") as stdout_f: with open(stderr_filename, "w") as stderr_f: - print('(capturing output to "{}.stdout")'.format(base)) + print(f'(capturing output to "{base}.stdout")') subprocess.run(cmd, **kwargs, stdout=stdout_f, stderr=stderr_f) return basepath @@ -82,11 +82,9 @@ class PgBin: def __init__(self, log_dir: Path, pg_distrib_dir, pg_version): self.log_dir = log_dir - self.pg_bin_path = os.path.join(str(pg_distrib_dir), "v{}".format(pg_version), "bin") + self.pg_bin_path = os.path.join(str(pg_distrib_dir), f"v{pg_version}", "bin") self.env = os.environ.copy() - self.env["LD_LIBRARY_PATH"] = os.path.join( - str(pg_distrib_dir), "v{}".format(pg_version), "lib" - ) + self.env["LD_LIBRARY_PATH"] = os.path.join(str(pg_distrib_dir), f"v{pg_version}", "lib") def _fixpath(self, command: List[str]): if "/" not in command[0]: @@ -110,7 +108,7 @@ class PgBin: """ self._fixpath(command) - print('Running command "{}"'.format(" ".join(command))) + print(f'Running command "{" ".join(command)}"') env = self._build_env(env) subprocess.run(command, env=env, cwd=cwd, check=True) @@ -128,7 +126,7 @@ class PgBin: """ self._fixpath(command) - print('Running command "{}"'.format(" ".join(command))) + print(f'Running command "{" ".join(command)}"') env = self._build_env(env) return subprocess_capture( str(self.log_dir), command, env=env, cwd=cwd, check=True, **kwargs @@ -300,7 +298,7 @@ class NeonPageserverHttpClient(requests.Session): def lsn_to_hex(num: int) -> str: """Convert lsn from int to standard hex notation.""" - return "{:X}/{:X}".format(num >> 32, num & 0xFFFFFFFF) + return f"{num >> 32:X}/{num & 0xFFFFFFFF:X}" def lsn_from_hex(lsn_hex: str) -> int: @@ -331,16 +329,12 @@ def wait_for_upload( if current_lsn >= lsn: return print( - "waiting for remote_consistent_lsn to reach {}, now {}, iteration {}".format( - lsn_to_hex(lsn), lsn_to_hex(current_lsn), i + 1 - ) + f"waiting for remote_consistent_lsn to reach {lsn_to_hex(lsn)}, now {lsn_to_hex(current_lsn)}, iteration {i + 1}" ) time.sleep(1) raise Exception( - "timed out while waiting for remote_consistent_lsn to reach {}, was {}".format( - lsn_to_hex(lsn), lsn_to_hex(current_lsn) - ) + f"timed out while waiting for remote_consistent_lsn to reach {lsn_to_hex(lsn)}, was {lsn_to_hex(current_lsn)}" ) diff --git a/control_plane/attachment_service/Cargo.toml b/storage_controller/Cargo.toml similarity index 81% rename from control_plane/attachment_service/Cargo.toml rename to storage_controller/Cargo.toml index 0201e0ed86..165cafaf4e 100644 --- a/control_plane/attachment_service/Cargo.toml +++ b/storage_controller/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "attachment_service" +name = "storage_controller" version = "0.1.0" edition.workspace = true license.workspace = true @@ -25,6 +25,7 @@ git-version.workspace = true hex.workspace = true hyper.workspace = true humantime.workspace = true +itertools.workspace = true lasso.workspace = true once_cell.workspace = true pageserver_api.workspace = true @@ -44,8 +45,8 @@ diesel = { version = "2.1.4", features = ["serde_json", "postgres", "r2d2"] } diesel_migrations = { version = "2.1.0" } r2d2 = { version = "0.8.10" } -utils = { path = "../../libs/utils/" } -metrics = { path = "../../libs/metrics/" } -control_plane = { path = ".." } -workspace_hack = { version = "0.1", path = "../../workspace_hack" } +utils = { path = "../libs/utils/" } +metrics = { path = "../libs/metrics/" } +control_plane = { path = "../control_plane" } +workspace_hack = { version = "0.1", path = "../workspace_hack" } diff --git a/control_plane/attachment_service/migrations/.keep b/storage_controller/migrations/.keep similarity index 100% rename from control_plane/attachment_service/migrations/.keep rename to storage_controller/migrations/.keep diff --git a/control_plane/attachment_service/migrations/00000000000000_diesel_initial_setup/down.sql b/storage_controller/migrations/00000000000000_diesel_initial_setup/down.sql similarity index 100% rename from control_plane/attachment_service/migrations/00000000000000_diesel_initial_setup/down.sql rename to storage_controller/migrations/00000000000000_diesel_initial_setup/down.sql diff --git a/control_plane/attachment_service/migrations/00000000000000_diesel_initial_setup/up.sql b/storage_controller/migrations/00000000000000_diesel_initial_setup/up.sql similarity index 100% rename from control_plane/attachment_service/migrations/00000000000000_diesel_initial_setup/up.sql rename to storage_controller/migrations/00000000000000_diesel_initial_setup/up.sql diff --git a/control_plane/attachment_service/migrations/2024-01-07-211257_create_tenant_shards/down.sql b/storage_controller/migrations/2024-01-07-211257_create_tenant_shards/down.sql similarity index 100% rename from control_plane/attachment_service/migrations/2024-01-07-211257_create_tenant_shards/down.sql rename to storage_controller/migrations/2024-01-07-211257_create_tenant_shards/down.sql diff --git a/control_plane/attachment_service/migrations/2024-01-07-211257_create_tenant_shards/up.sql b/storage_controller/migrations/2024-01-07-211257_create_tenant_shards/up.sql similarity index 100% rename from control_plane/attachment_service/migrations/2024-01-07-211257_create_tenant_shards/up.sql rename to storage_controller/migrations/2024-01-07-211257_create_tenant_shards/up.sql diff --git a/control_plane/attachment_service/migrations/2024-01-07-212945_create_nodes/down.sql b/storage_controller/migrations/2024-01-07-212945_create_nodes/down.sql similarity index 100% rename from control_plane/attachment_service/migrations/2024-01-07-212945_create_nodes/down.sql rename to storage_controller/migrations/2024-01-07-212945_create_nodes/down.sql diff --git a/control_plane/attachment_service/migrations/2024-01-07-212945_create_nodes/up.sql b/storage_controller/migrations/2024-01-07-212945_create_nodes/up.sql similarity index 100% rename from control_plane/attachment_service/migrations/2024-01-07-212945_create_nodes/up.sql rename to storage_controller/migrations/2024-01-07-212945_create_nodes/up.sql diff --git a/control_plane/attachment_service/migrations/2024-02-29-094122_generations_null/down.sql b/storage_controller/migrations/2024-02-29-094122_generations_null/down.sql similarity index 100% rename from control_plane/attachment_service/migrations/2024-02-29-094122_generations_null/down.sql rename to storage_controller/migrations/2024-02-29-094122_generations_null/down.sql diff --git a/control_plane/attachment_service/migrations/2024-02-29-094122_generations_null/up.sql b/storage_controller/migrations/2024-02-29-094122_generations_null/up.sql similarity index 100% rename from control_plane/attachment_service/migrations/2024-02-29-094122_generations_null/up.sql rename to storage_controller/migrations/2024-02-29-094122_generations_null/up.sql diff --git a/control_plane/attachment_service/migrations/2024-03-18-184429_rename_policy/down.sql b/storage_controller/migrations/2024-03-18-184429_rename_policy/down.sql similarity index 100% rename from control_plane/attachment_service/migrations/2024-03-18-184429_rename_policy/down.sql rename to storage_controller/migrations/2024-03-18-184429_rename_policy/down.sql diff --git a/control_plane/attachment_service/migrations/2024-03-18-184429_rename_policy/up.sql b/storage_controller/migrations/2024-03-18-184429_rename_policy/up.sql similarity index 100% rename from control_plane/attachment_service/migrations/2024-03-18-184429_rename_policy/up.sql rename to storage_controller/migrations/2024-03-18-184429_rename_policy/up.sql diff --git a/control_plane/attachment_service/migrations/2024-03-27-133204_tenant_policies/down.sql b/storage_controller/migrations/2024-03-27-133204_tenant_policies/down.sql similarity index 100% rename from control_plane/attachment_service/migrations/2024-03-27-133204_tenant_policies/down.sql rename to storage_controller/migrations/2024-03-27-133204_tenant_policies/down.sql diff --git a/control_plane/attachment_service/migrations/2024-03-27-133204_tenant_policies/up.sql b/storage_controller/migrations/2024-03-27-133204_tenant_policies/up.sql similarity index 100% rename from control_plane/attachment_service/migrations/2024-03-27-133204_tenant_policies/up.sql rename to storage_controller/migrations/2024-03-27-133204_tenant_policies/up.sql diff --git a/control_plane/attachment_service/src/auth.rs b/storage_controller/src/auth.rs similarity index 100% rename from control_plane/attachment_service/src/auth.rs rename to storage_controller/src/auth.rs diff --git a/control_plane/attachment_service/src/compute_hook.rs b/storage_controller/src/compute_hook.rs similarity index 61% rename from control_plane/attachment_service/src/compute_hook.rs rename to storage_controller/src/compute_hook.rs index 1a8dc6b86d..eb0c4472e4 100644 --- a/control_plane/attachment_service/src/compute_hook.rs +++ b/storage_controller/src/compute_hook.rs @@ -1,3 +1,4 @@ +use std::sync::Arc; use std::{collections::HashMap, time::Duration}; use control_plane::endpoint::{ComputeControlPlane, EndpointStatus}; @@ -18,14 +19,26 @@ const SLOWDOWN_DELAY: Duration = Duration::from_secs(5); pub(crate) const API_CONCURRENCY: usize = 32; +struct UnshardedComputeHookTenant { + // Which node is this tenant attached to + node_id: NodeId, + + // Must hold this lock to send a notification. + send_lock: Arc>>, +} struct ShardedComputeHookTenant { stripe_size: ShardStripeSize, shard_count: ShardCount, shards: Vec<(ShardNumber, NodeId)>, + + // Must hold this lock to send a notification. The contents represent + // the last successfully sent notification, and are used to coalesce multiple + // updates by only sending when there is a chance since our last successful send. + send_lock: Arc>>, } enum ComputeHookTenant { - Unsharded(NodeId), + Unsharded(UnshardedComputeHookTenant), Sharded(ShardedComputeHookTenant), } @@ -37,9 +50,20 @@ impl ComputeHookTenant { shards: vec![(tenant_shard_id.shard_number, node_id)], stripe_size, shard_count: tenant_shard_id.shard_count, + send_lock: Arc::default(), }) } else { - Self::Unsharded(node_id) + Self::Unsharded(UnshardedComputeHookTenant { + node_id, + send_lock: Arc::default(), + }) + } + } + + fn get_send_lock(&self) -> &Arc>> { + match self { + Self::Unsharded(unsharded_tenant) => &unsharded_tenant.send_lock, + Self::Sharded(sharded_tenant) => &sharded_tenant.send_lock, } } @@ -52,8 +76,8 @@ impl ComputeHookTenant { node_id: NodeId, ) { match self { - Self::Unsharded(existing_node_id) if tenant_shard_id.shard_count.count() == 1 => { - *existing_node_id = node_id + Self::Unsharded(unsharded_tenant) if tenant_shard_id.shard_count.count() == 1 => { + unsharded_tenant.node_id = node_id } Self::Sharded(sharded_tenant) if sharded_tenant.stripe_size == stripe_size @@ -80,14 +104,14 @@ impl ComputeHookTenant { } } -#[derive(Serialize, Deserialize, Debug)] +#[derive(Serialize, Deserialize, Debug, Eq, PartialEq)] struct ComputeHookNotifyRequestShard { node_id: NodeId, shard_number: ShardNumber, } /// Request body that we send to the control plane to notify it of where a tenant is attached -#[derive(Serialize, Deserialize, Debug)] +#[derive(Serialize, Deserialize, Debug, Eq, PartialEq)] struct ComputeHookNotifyRequest { tenant_id: TenantId, stripe_size: Option, @@ -120,14 +144,44 @@ pub(crate) enum NotifyError { Fatal(StatusCode), } +enum MaybeSendResult { + // Please send this request while holding the lock, and if you succeed then write + // the request into the lock. + Transmit( + ( + ComputeHookNotifyRequest, + tokio::sync::OwnedMutexGuard>, + ), + ), + // Something requires sending, but you must wait for a current sender then call again + AwaitLock(Arc>>), + // Nothing requires sending + Noop, +} + impl ComputeHookTenant { - fn maybe_reconfigure(&self, tenant_id: TenantId) -> Option { - match self { - Self::Unsharded(node_id) => Some(ComputeHookNotifyRequest { + fn maybe_send( + &self, + tenant_id: TenantId, + lock: Option>>, + ) -> MaybeSendResult { + let locked = match lock { + Some(already_locked) => already_locked, + None => { + // Lock order: this _must_ be only a try_lock, because we are called inside of the [`ComputeHook::state`] lock. + let Ok(locked) = self.get_send_lock().clone().try_lock_owned() else { + return MaybeSendResult::AwaitLock(self.get_send_lock().clone()); + }; + locked + } + }; + + let request = match self { + Self::Unsharded(unsharded_tenant) => Some(ComputeHookNotifyRequest { tenant_id, shards: vec![ComputeHookNotifyRequestShard { shard_number: ShardNumber(0), - node_id: *node_id, + node_id: unsharded_tenant.node_id, }], stripe_size: None, }), @@ -151,12 +205,25 @@ impl ComputeHookTenant { // Sharded tenant doesn't yet have information for all its shards tracing::info!( - "ComputeHookTenant::maybe_reconfigure: not enough shards ({}/{})", + "ComputeHookTenant::maybe_send: not enough shards ({}/{})", sharded_tenant.shards.len(), sharded_tenant.shard_count.count() ); None } + }; + + match request { + None => { + // Not yet ready to emit a notification + tracing::info!("Tenant isn't yet ready to emit a notification"); + MaybeSendResult::Noop + } + Some(request) if Some(&request) == locked.as_ref() => { + // No change from the last value successfully sent + MaybeSendResult::Noop + } + Some(request) => MaybeSendResult::Transmit((request, locked)), } } } @@ -166,8 +233,15 @@ impl ComputeHookTenant { /// the compute connection string. pub(super) struct ComputeHook { config: Config, - state: tokio::sync::Mutex>, + state: std::sync::Mutex>, authorization_header: Option, + + // Concurrency limiter, so that we do not overload the cloud control plane when updating + // large numbers of tenants (e.g. when failing over after a node failure) + api_concurrency: tokio::sync::Semaphore, + + // This lock is only used in testing enviroments, to serialize calls into neon_lock + neon_local_lock: tokio::sync::Mutex<()>, } impl ComputeHook { @@ -181,14 +255,20 @@ impl ComputeHook { state: Default::default(), config, authorization_header, + neon_local_lock: Default::default(), + api_concurrency: tokio::sync::Semaphore::new(API_CONCURRENCY), } } /// For test environments: use neon_local's LocalEnv to update compute async fn do_notify_local( &self, - reconfigure_request: ComputeHookNotifyRequest, + reconfigure_request: &ComputeHookNotifyRequest, ) -> anyhow::Result<()> { + // neon_local updates are not safe to call concurrently, use a lock to serialize + // all calls to this function + let _locked = self.neon_local_lock.lock().await; + let env = match LocalEnv::load_config() { Ok(e) => e, Err(e) => { @@ -205,7 +285,7 @@ impl ComputeHook { } = reconfigure_request; let compute_pageservers = shards - .into_iter() + .iter() .map(|shard| { let ps_conf = env .get_pageserver_conf(shard.node_id) @@ -217,10 +297,10 @@ impl ComputeHook { .collect::>(); for (endpoint_name, endpoint) in &cplane.endpoints { - if endpoint.tenant_id == tenant_id && endpoint.status() == EndpointStatus::Running { + if endpoint.tenant_id == *tenant_id && endpoint.status() == EndpointStatus::Running { tracing::info!("Reconfiguring endpoint {}", endpoint_name,); endpoint - .reconfigure(compute_pageservers.clone(), stripe_size) + .reconfigure(compute_pageservers.clone(), *stripe_size) .await?; } } @@ -298,12 +378,23 @@ impl ComputeHook { async fn do_notify( &self, url: &String, - reconfigure_request: ComputeHookNotifyRequest, + reconfigure_request: &ComputeHookNotifyRequest, cancel: &CancellationToken, ) -> Result<(), NotifyError> { let client = reqwest::Client::new(); + + // We hold these semaphore units across all retries, rather than only across each + // HTTP request: this is to preserve fairness and avoid a situation where a retry might + // time out waiting for a semaphore. + let _units = self + .api_concurrency + .acquire() + .await + // Interpret closed semaphore as shutdown + .map_err(|_| NotifyError::ShuttingDown)?; + backoff::retry( - || self.do_notify_iteration(&client, url, &reconfigure_request, cancel), + || self.do_notify_iteration(&client, url, reconfigure_request, cancel), |e| { matches!( e, @@ -343,42 +434,70 @@ impl ComputeHook { stripe_size: ShardStripeSize, cancel: &CancellationToken, ) -> Result<(), NotifyError> { - let mut locked = self.state.lock().await; + let maybe_send_result = { + let mut state_locked = self.state.lock().unwrap(); - use std::collections::hash_map::Entry; - let tenant = match locked.entry(tenant_shard_id.tenant_id) { - Entry::Vacant(e) => e.insert(ComputeHookTenant::new( - tenant_shard_id, - stripe_size, - node_id, - )), - Entry::Occupied(e) => { - let tenant = e.into_mut(); - tenant.update(tenant_shard_id, stripe_size, node_id); - tenant + use std::collections::hash_map::Entry; + let tenant = match state_locked.entry(tenant_shard_id.tenant_id) { + Entry::Vacant(e) => e.insert(ComputeHookTenant::new( + tenant_shard_id, + stripe_size, + node_id, + )), + Entry::Occupied(e) => { + let tenant = e.into_mut(); + tenant.update(tenant_shard_id, stripe_size, node_id); + tenant + } + }; + tenant.maybe_send(tenant_shard_id.tenant_id, None) + }; + + // Process result: we may get an update to send, or we may have to wait for a lock + // before trying again. + let (request, mut send_lock_guard) = match maybe_send_result { + MaybeSendResult::Noop => { + return Ok(()); } + MaybeSendResult::AwaitLock(send_lock) => { + let send_locked = send_lock.lock_owned().await; + + // Lock order: maybe_send is called within the `[Self::state]` lock, and takes the send lock, but here + // we have acquired the send lock and take `[Self::state]` lock. This is safe because maybe_send only uses + // try_lock. + let state_locked = self.state.lock().unwrap(); + let Some(tenant) = state_locked.get(&tenant_shard_id.tenant_id) else { + return Ok(()); + }; + match tenant.maybe_send(tenant_shard_id.tenant_id, Some(send_locked)) { + MaybeSendResult::AwaitLock(_) => { + unreachable!("We supplied lock guard") + } + MaybeSendResult::Noop => { + return Ok(()); + } + MaybeSendResult::Transmit((request, lock)) => (request, lock), + } + } + MaybeSendResult::Transmit((request, lock)) => (request, lock), }; - let reconfigure_request = tenant.maybe_reconfigure(tenant_shard_id.tenant_id); - let Some(reconfigure_request) = reconfigure_request else { - // The tenant doesn't yet have pageservers for all its shards: we won't notify anything - // until it does. - tracing::info!("Tenant isn't yet ready to emit a notification"); - return Ok(()); - }; - - if let Some(notify_url) = &self.config.compute_hook_url { - self.do_notify(notify_url, reconfigure_request, cancel) - .await + let result = if let Some(notify_url) = &self.config.compute_hook_url { + self.do_notify(notify_url, &request, cancel).await } else { - self.do_notify_local(reconfigure_request) - .await - .map_err(|e| { - // This path is for testing only, so munge the error into our prod-style error type. - tracing::error!("Local notification hook failed: {e}"); - NotifyError::Fatal(StatusCode::INTERNAL_SERVER_ERROR) - }) + self.do_notify_local(&request).await.map_err(|e| { + // This path is for testing only, so munge the error into our prod-style error type. + tracing::error!("Local notification hook failed: {e}"); + NotifyError::Fatal(StatusCode::INTERNAL_SERVER_ERROR) + }) + }; + + if result.is_ok() { + // Before dropping the send lock, stash the request we just sent so that + // subsequent callers can avoid redundantly re-sending the same thing. + *send_lock_guard = Some(request); } + result } } @@ -402,21 +521,22 @@ pub(crate) mod tests { NodeId(1), ); - // An unsharded tenant is always ready to emit a notification - assert!(tenant_state.maybe_reconfigure(tenant_id).is_some()); - assert_eq!( - tenant_state - .maybe_reconfigure(tenant_id) - .unwrap() - .shards - .len(), - 1 - ); - assert!(tenant_state - .maybe_reconfigure(tenant_id) - .unwrap() - .stripe_size - .is_none()); + // An unsharded tenant is always ready to emit a notification, but won't + // send the same one twice + let send_result = tenant_state.maybe_send(tenant_id, None); + let MaybeSendResult::Transmit((request, mut guard)) = send_result else { + anyhow::bail!("Wrong send result"); + }; + assert_eq!(request.shards.len(), 1); + assert!(request.stripe_size.is_none()); + + // Simulate successful send + *guard = Some(request); + drop(guard); + + // Try asking again: this should be a no-op + let send_result = tenant_state.maybe_send(tenant_id, None); + assert!(matches!(send_result, MaybeSendResult::Noop)); // Writing the first shard of a multi-sharded situation (i.e. in a split) // resets the tenant state and puts it in an non-notifying state (need to @@ -430,7 +550,10 @@ pub(crate) mod tests { ShardStripeSize(32768), NodeId(1), ); - assert!(tenant_state.maybe_reconfigure(tenant_id).is_none()); + assert!(matches!( + tenant_state.maybe_send(tenant_id, None), + MaybeSendResult::Noop + )); // Writing the second shard makes it ready to notify tenant_state.update( @@ -443,22 +566,16 @@ pub(crate) mod tests { NodeId(1), ); - assert!(tenant_state.maybe_reconfigure(tenant_id).is_some()); - assert_eq!( - tenant_state - .maybe_reconfigure(tenant_id) - .unwrap() - .shards - .len(), - 2 - ); - assert_eq!( - tenant_state - .maybe_reconfigure(tenant_id) - .unwrap() - .stripe_size, - Some(ShardStripeSize(32768)) - ); + let send_result = tenant_state.maybe_send(tenant_id, None); + let MaybeSendResult::Transmit((request, mut guard)) = send_result else { + anyhow::bail!("Wrong send result"); + }; + assert_eq!(request.shards.len(), 2); + assert_eq!(request.stripe_size, Some(ShardStripeSize(32768))); + + // Simulate successful send + *guard = Some(request); + drop(guard); Ok(()) } diff --git a/control_plane/attachment_service/src/heartbeater.rs b/storage_controller/src/heartbeater.rs similarity index 100% rename from control_plane/attachment_service/src/heartbeater.rs rename to storage_controller/src/heartbeater.rs diff --git a/control_plane/attachment_service/src/http.rs b/storage_controller/src/http.rs similarity index 96% rename from control_plane/attachment_service/src/http.rs rename to storage_controller/src/http.rs index 1f3f78bffa..c59bcaa174 100644 --- a/control_plane/attachment_service/src/http.rs +++ b/storage_controller/src/http.rs @@ -399,6 +399,15 @@ async fn handle_tenant_describe( json_response(StatusCode::OK, service.tenant_describe(tenant_id)?) } +async fn handle_tenant_list( + service: Arc, + req: Request, +) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + + json_response(StatusCode::OK, service.tenant_list()) +} + async fn handle_node_register(mut req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; @@ -412,7 +421,10 @@ async fn handle_node_list(req: Request) -> Result, ApiError check_permissions(&req, Scope::Admin)?; let state = get_state(&req); - json_response(StatusCode::OK, state.service.node_list().await?) + let nodes = state.service.node_list().await?; + let api_nodes = nodes.into_iter().map(|n| n.describe()).collect::>(); + + json_response(StatusCode::OK, api_nodes) } async fn handle_node_drop(req: Request) -> Result, ApiError> { @@ -590,9 +602,17 @@ where .await } +/// Check if the required scope is held in the request's token, or if the request has +/// a token with 'admin' scope then always permit it. fn check_permissions(request: &Request, required_scope: Scope) -> Result<(), ApiError> { check_permission_with(request, |claims| { - crate::auth::check_permission(claims, required_scope) + match crate::auth::check_permission(claims, required_scope) { + Err(e) => match crate::auth::check_permission(claims, Scope::Admin) { + Ok(()) => Ok(()), + Err(_) => Err(e), + }, + Ok(()) => Ok(()), + } }) } @@ -793,6 +813,9 @@ pub fn make_router( RequestName("control_v1_tenant_describe"), ) }) + .get("/control/v1/tenant", |r| { + tenant_service_handler(r, handle_tenant_list, RequestName("control_v1_tenant_list")) + }) .put("/control/v1/tenant/:tenant_id/policy", |r| { named_request_span( r, diff --git a/control_plane/attachment_service/src/id_lock_map.rs b/storage_controller/src/id_lock_map.rs similarity index 100% rename from control_plane/attachment_service/src/id_lock_map.rs rename to storage_controller/src/id_lock_map.rs diff --git a/control_plane/attachment_service/src/lib.rs b/storage_controller/src/lib.rs similarity index 98% rename from control_plane/attachment_service/src/lib.rs rename to storage_controller/src/lib.rs index 8bcd5c0ac4..2ea490a14b 100644 --- a/control_plane/attachment_service/src/lib.rs +++ b/storage_controller/src/lib.rs @@ -14,7 +14,7 @@ mod reconciler; mod scheduler; mod schema; pub mod service; -mod tenant_state; +mod tenant_shard; #[derive(Ord, PartialOrd, Eq, PartialEq, Copy, Clone, Serialize)] struct Sequence(u64); diff --git a/control_plane/attachment_service/src/main.rs b/storage_controller/src/main.rs similarity index 96% rename from control_plane/attachment_service/src/main.rs rename to storage_controller/src/main.rs index bd8d7f5c59..3c03d6efe8 100644 --- a/control_plane/attachment_service/src/main.rs +++ b/storage_controller/src/main.rs @@ -1,18 +1,19 @@ use anyhow::{anyhow, Context}; -use attachment_service::http::make_router; -use attachment_service::metrics::preinitialize_metrics; -use attachment_service::persistence::Persistence; -use attachment_service::service::{Config, Service, MAX_UNAVAILABLE_INTERVAL_DEFAULT}; use camino::Utf8PathBuf; use clap::Parser; use diesel::Connection; use metrics::launch_timestamp::LaunchTimestamp; use std::sync::Arc; +use storage_controller::http::make_router; +use storage_controller::metrics::preinitialize_metrics; +use storage_controller::persistence::Persistence; +use storage_controller::service::{Config, Service, MAX_UNAVAILABLE_INTERVAL_DEFAULT}; use tokio::signal::unix::SignalKind; use tokio_util::sync::CancellationToken; use utils::auth::{JwtAuth, SwappableJwtAuth}; use utils::logging::{self, LogFormat}; +use utils::sentry_init::init_sentry; use utils::{project_build_tag, project_git_version, tcp_listener}; project_git_version!(GIT_VERSION); @@ -50,7 +51,7 @@ struct Cli { #[arg(short, long)] path: Option, - /// URL to connect to postgres, like postgresql://localhost:1234/attachment_service + /// URL to connect to postgres, like postgresql://localhost:1234/storage_controller #[arg(long)] database_url: Option, @@ -158,6 +159,8 @@ fn main() -> anyhow::Result<()> { std::process::exit(1); })); + let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]); + tokio::runtime::Builder::new_current_thread() // We use spawn_blocking for database operations, so require approximately // as many blocking threads as we will open database connections. diff --git a/control_plane/attachment_service/src/metrics.rs b/storage_controller/src/metrics.rs similarity index 100% rename from control_plane/attachment_service/src/metrics.rs rename to storage_controller/src/metrics.rs diff --git a/control_plane/attachment_service/src/node.rs b/storage_controller/src/node.rs similarity index 93% rename from control_plane/attachment_service/src/node.rs rename to storage_controller/src/node.rs index df40bff66f..7ba6828deb 100644 --- a/control_plane/attachment_service/src/node.rs +++ b/storage_controller/src/node.rs @@ -3,7 +3,8 @@ use std::{str::FromStr, time::Duration}; use hyper::StatusCode; use pageserver_api::{ controller_api::{ - NodeAvailability, NodeRegisterRequest, NodeSchedulingPolicy, TenantLocateResponseShard, + NodeAvailability, NodeDescribeResponse, NodeRegisterRequest, NodeSchedulingPolicy, + TenantLocateResponseShard, }, shard::TenantShardId, }; @@ -256,6 +257,19 @@ impl Node { ) .await } + + /// Generate the simplified API-friendly description of a node's state + pub(crate) fn describe(&self) -> NodeDescribeResponse { + NodeDescribeResponse { + id: self.id, + availability: self.availability.into(), + scheduling: self.scheduling, + listen_http_addr: self.listen_http_addr.clone(), + listen_http_port: self.listen_http_port, + listen_pg_addr: self.listen_pg_addr.clone(), + listen_pg_port: self.listen_pg_port, + } + } } impl std::fmt::Display for Node { diff --git a/control_plane/attachment_service/src/pageserver_client.rs b/storage_controller/src/pageserver_client.rs similarity index 100% rename from control_plane/attachment_service/src/pageserver_client.rs rename to storage_controller/src/pageserver_client.rs diff --git a/control_plane/attachment_service/src/persistence.rs b/storage_controller/src/persistence.rs similarity index 99% rename from control_plane/attachment_service/src/persistence.rs rename to storage_controller/src/persistence.rs index d60392bdbc..55fbfd10bc 100644 --- a/control_plane/attachment_service/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -696,7 +696,7 @@ impl Persistence { } } -/// Parts of [`crate::tenant_state::TenantState`] that are stored durably +/// Parts of [`crate::tenant_shard::TenantShard`] that are stored durably #[derive(Queryable, Selectable, Insertable, Serialize, Deserialize, Clone, Eq, PartialEq)] #[diesel(table_name = crate::schema::tenant_shards)] pub(crate) struct TenantShardPersistence { diff --git a/control_plane/attachment_service/src/persistence/split_state.rs b/storage_controller/src/persistence/split_state.rs similarity index 100% rename from control_plane/attachment_service/src/persistence/split_state.rs rename to storage_controller/src/persistence/split_state.rs diff --git a/control_plane/attachment_service/src/reconciler.rs b/storage_controller/src/reconciler.rs similarity index 99% rename from control_plane/attachment_service/src/reconciler.rs rename to storage_controller/src/reconciler.rs index 72eb8faccb..49cfaad569 100644 --- a/control_plane/attachment_service/src/reconciler.rs +++ b/storage_controller/src/reconciler.rs @@ -18,14 +18,14 @@ use utils::sync::gate::GateGuard; use crate::compute_hook::{ComputeHook, NotifyError}; use crate::node::Node; -use crate::tenant_state::{IntentState, ObservedState, ObservedStateLocation}; +use crate::tenant_shard::{IntentState, ObservedState, ObservedStateLocation}; const DEFAULT_HEATMAP_PERIOD: &str = "60s"; /// Object with the lifetime of the background reconcile task that is created /// for tenants which have a difference between their intent and observed states. pub(super) struct Reconciler { - /// See [`crate::tenant_state::TenantState`] for the meanings of these fields: they are a snapshot + /// See [`crate::tenant_shard::TenantShard`] for the meanings of these fields: they are a snapshot /// of a tenant's state from when we spawned a reconcile task. pub(super) tenant_shard_id: TenantShardId, pub(crate) shard: ShardIdentity, @@ -48,11 +48,11 @@ pub(super) struct Reconciler { /// To avoid stalling if the cloud control plane is unavailable, we may proceed /// past failures in [`ComputeHook::notify`], but we _must_ remember that we failed - /// so that we can set [`crate::tenant_state::TenantState::pending_compute_notification`] to ensure a later retry. + /// so that we can set [`crate::tenant_shard::TenantShard::pending_compute_notification`] to ensure a later retry. pub(crate) compute_notify_failure: bool, /// A means to abort background reconciliation: it is essential to - /// call this when something changes in the original TenantState that + /// call this when something changes in the original TenantShard that /// will make this reconciliation impossible or unnecessary, for /// example when a pageserver node goes offline, or the PlacementPolicy for /// the tenant is changed. @@ -66,7 +66,7 @@ pub(super) struct Reconciler { pub(crate) persistence: Arc, } -/// This is a snapshot of [`crate::tenant_state::IntentState`], but it does not do any +/// This is a snapshot of [`crate::tenant_shard::IntentState`], but it does not do any /// reference counting for Scheduler. The IntentState is what the scheduler works with, /// and the TargetState is just the instruction for a particular Reconciler run. #[derive(Debug)] diff --git a/control_plane/attachment_service/src/scheduler.rs b/storage_controller/src/scheduler.rs similarity index 98% rename from control_plane/attachment_service/src/scheduler.rs rename to storage_controller/src/scheduler.rs index 782189d11f..862ac0cbfe 100644 --- a/control_plane/attachment_service/src/scheduler.rs +++ b/storage_controller/src/scheduler.rs @@ -1,4 +1,4 @@ -use crate::{node::Node, tenant_state::TenantState}; +use crate::{node::Node, tenant_shard::TenantShard}; use pageserver_api::controller_api::UtilizationScore; use serde::Serialize; use std::collections::HashMap; @@ -27,7 +27,7 @@ pub enum MaySchedule { #[derive(Serialize)] struct SchedulerNode { - /// How many shards are currently scheduled on this node, via their [`crate::tenant_state::IntentState`]. + /// How many shards are currently scheduled on this node, via their [`crate::tenant_shard::IntentState`]. shard_count: usize, /// Whether this node is currently elegible to have new shards scheduled (this is derived @@ -84,7 +84,7 @@ impl std::ops::Add for AffinityScore { } } -// For carrying state between multiple calls to [`TenantState::schedule`], e.g. when calling +// For carrying state between multiple calls to [`TenantShard::schedule`], e.g. when calling // it for many shards in the same tenant. #[derive(Debug, Default)] pub(crate) struct ScheduleContext { @@ -147,7 +147,7 @@ impl Scheduler { pub(crate) fn consistency_check<'a>( &self, nodes: impl Iterator, - shards: impl Iterator, + shards: impl Iterator, ) -> anyhow::Result<()> { let mut expect_nodes: HashMap = HashMap::new(); for node in nodes { @@ -398,7 +398,7 @@ pub(crate) mod test_utils { mod tests { use super::*; - use crate::tenant_state::IntentState; + use crate::tenant_shard::IntentState; #[test] fn scheduler_basic() -> anyhow::Result<()> { let nodes = test_utils::make_test_nodes(2); diff --git a/control_plane/attachment_service/src/schema.rs b/storage_controller/src/schema.rs similarity index 100% rename from control_plane/attachment_service/src/schema.rs rename to storage_controller/src/schema.rs diff --git a/control_plane/attachment_service/src/service.rs b/storage_controller/src/service.rs similarity index 97% rename from control_plane/attachment_service/src/service.rs rename to storage_controller/src/service.rs index 7502d9d186..010558b797 100644 --- a/control_plane/attachment_service/src/service.rs +++ b/storage_controller/src/service.rs @@ -20,6 +20,7 @@ use control_plane::storage_controller::{ use diesel::result::DatabaseErrorKind; use futures::{stream::FuturesUnordered, StreamExt}; use hyper::StatusCode; +use itertools::Itertools; use pageserver_api::{ controller_api::{ NodeAvailability, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy, @@ -65,9 +66,9 @@ use crate::{ persistence::{split_state::SplitState, DatabaseError, Persistence, TenantShardPersistence}, reconciler::attached_location_conf, scheduler::Scheduler, - tenant_state::{ + tenant_shard::{ IntentState, ObservedState, ObservedStateLocation, ReconcileResult, ReconcileWaitError, - ReconcilerWaiter, TenantState, + ReconcilerWaiter, TenantShard, }, }; @@ -91,7 +92,7 @@ pub const MAX_UNAVAILABLE_INTERVAL_DEFAULT: Duration = Duration::from_secs(30); // Top level state available to all HTTP handlers struct ServiceState { - tenants: BTreeMap, + tenants: BTreeMap, nodes: Arc>, @@ -101,7 +102,7 @@ struct ServiceState { impl ServiceState { fn new( nodes: HashMap, - tenants: BTreeMap, + tenants: BTreeMap, scheduler: Scheduler, ) -> Self { Self { @@ -115,7 +116,7 @@ impl ServiceState { &mut self, ) -> ( &mut Arc>, - &mut BTreeMap, + &mut BTreeMap, &mut Scheduler, ) { (&mut self.nodes, &mut self.tenants, &mut self.scheduler) @@ -334,11 +335,11 @@ impl Service { for (tenant_shard_id, shard_observations) in observed { for (node_id, observed_loc) in shard_observations { - let Some(tenant_state) = tenants.get_mut(&tenant_shard_id) else { + let Some(tenant_shard) = tenants.get_mut(&tenant_shard_id) else { cleanup.push((tenant_shard_id, node_id)); continue; }; - tenant_state + tenant_shard .observed .locations .insert(node_id, ObservedStateLocation { conf: observed_loc }); @@ -347,14 +348,14 @@ impl Service { // Populate each tenant's intent state let mut schedule_context = ScheduleContext::default(); - for (tenant_shard_id, tenant_state) in tenants.iter_mut() { + for (tenant_shard_id, tenant_shard) in tenants.iter_mut() { if tenant_shard_id.shard_number == ShardNumber(0) { // Reset scheduling context each time we advance to the next Tenant schedule_context = ScheduleContext::default(); } - tenant_state.intent_from_observed(scheduler); - if let Err(e) = tenant_state.schedule(scheduler, &mut schedule_context) { + tenant_shard.intent_from_observed(scheduler); + if let Err(e) = tenant_shard.schedule(scheduler, &mut schedule_context) { // Non-fatal error: we are unable to properly schedule the tenant, perhaps because // not enough pageservers are available. The tenant may well still be available // to clients. @@ -363,11 +364,11 @@ impl Service { // If we're both intending and observed to be attached at a particular node, we will // emit a compute notification for this. In the case where our observed state does not // yet match our intent, we will eventually reconcile, and that will emit a compute notification. - if let Some(attached_at) = tenant_state.stably_attached() { + if let Some(attached_at) = tenant_shard.stably_attached() { compute_notifications.push(( *tenant_shard_id, attached_at, - tenant_state.shard.stripe_size, + tenant_shard.shard.stripe_size, )); } } @@ -742,7 +743,7 @@ impl Service { /// Apply the contents of a [`ReconcileResult`] to our in-memory state: if the reconciliation /// was successful, this will update the observed state of the tenant such that subsequent - /// calls to [`TenantState::maybe_reconcile`] will do nothing. + /// calls to [`TenantShard::maybe_reconcile`] will do nothing. #[instrument(skip_all, fields( tenant_id=%result.tenant_shard_id.tenant_id, shard_id=%result.tenant_shard_id.shard_slug(), sequence=%result.sequence @@ -760,10 +761,10 @@ impl Service { tenant.generation = std::cmp::max(tenant.generation, result.generation); // If the reconciler signals that it failed to notify compute, set this state on - // the shard so that a future [`TenantState::maybe_reconcile`] will try again. + // the shard so that a future [`TenantShard::maybe_reconcile`] will try again. tenant.pending_compute_notification = result.pending_compute_notification; - // Let the TenantState know it is idle. + // Let the TenantShard know it is idle. tenant.reconcile_complete(result.sequence); match result.result { @@ -978,7 +979,7 @@ impl Service { if let Some(generation_pageserver) = tsp.generation_pageserver { intent.set_attached(&mut scheduler, Some(NodeId(generation_pageserver as u64))); } - let new_tenant = TenantState::from_persistent(tsp, intent)?; + let new_tenant = TenantShard::from_persistent(tsp, intent)?; tenants.insert(tenant_shard_id, new_tenant); } @@ -1125,7 +1126,7 @@ impl Service { let mut locked = self.inner.write().unwrap(); locked.tenants.insert( attach_req.tenant_shard_id, - TenantState::new( + TenantShard::new( attach_req.tenant_shard_id, ShardIdentity::unsharded(), PlacementPolicy::Attached(0), @@ -1177,32 +1178,32 @@ impl Service { let mut locked = self.inner.write().unwrap(); let (_nodes, tenants, scheduler) = locked.parts_mut(); - let tenant_state = tenants + let tenant_shard = tenants .get_mut(&attach_req.tenant_shard_id) .expect("Checked for existence above"); if let Some(new_generation) = new_generation { - tenant_state.generation = Some(new_generation); - tenant_state.policy = PlacementPolicy::Attached(0); + tenant_shard.generation = Some(new_generation); + tenant_shard.policy = PlacementPolicy::Attached(0); } else { // This is a detach notification. We must update placement policy to avoid re-attaching // during background scheduling/reconciliation, or during storage controller restart. assert!(attach_req.node_id.is_none()); - tenant_state.policy = PlacementPolicy::Detached; + tenant_shard.policy = PlacementPolicy::Detached; } if let Some(attaching_pageserver) = attach_req.node_id.as_ref() { tracing::info!( tenant_id = %attach_req.tenant_shard_id, ps_id = %attaching_pageserver, - generation = ?tenant_state.generation, + generation = ?tenant_shard.generation, "issuing", ); - } else if let Some(ps_id) = tenant_state.intent.get_attached() { + } else if let Some(ps_id) = tenant_shard.intent.get_attached() { tracing::info!( tenant_id = %attach_req.tenant_shard_id, %ps_id, - generation = ?tenant_state.generation, + generation = ?tenant_shard.generation, "dropping", ); } else { @@ -1210,14 +1211,14 @@ impl Service { tenant_id = %attach_req.tenant_shard_id, "no-op: tenant already has no pageserver"); } - tenant_state + tenant_shard .intent .set_attached(scheduler, attach_req.node_id); tracing::info!( "attach_hook: tenant {} set generation {:?}, pageserver {}", attach_req.tenant_shard_id, - tenant_state.generation, + tenant_shard.generation, // TODO: this is an odd number of 0xf's attach_req.node_id.unwrap_or(utils::id::NodeId(0xfffffff)) ); @@ -1229,36 +1230,36 @@ impl Service { #[cfg(feature = "testing")] { if let Some(node_id) = attach_req.node_id { - tenant_state.observed.locations = HashMap::from([( + tenant_shard.observed.locations = HashMap::from([( node_id, ObservedStateLocation { conf: Some(attached_location_conf( - tenant_state.generation.unwrap(), - &tenant_state.shard, - &tenant_state.config, + tenant_shard.generation.unwrap(), + &tenant_shard.shard, + &tenant_shard.config, false, )), }, )]); } else { - tenant_state.observed.locations.clear(); + tenant_shard.observed.locations.clear(); } } Ok(AttachHookResponse { gen: attach_req .node_id - .map(|_| tenant_state.generation.expect("Test hook, not used on tenants that are mid-onboarding with a NULL generation").into().unwrap()), + .map(|_| tenant_shard.generation.expect("Test hook, not used on tenants that are mid-onboarding with a NULL generation").into().unwrap()), }) } pub(crate) fn inspect(&self, inspect_req: InspectRequest) -> InspectResponse { let locked = self.inner.read().unwrap(); - let tenant_state = locked.tenants.get(&inspect_req.tenant_shard_id); + let tenant_shard = locked.tenants.get(&inspect_req.tenant_shard_id); InspectResponse { - attachment: tenant_state.and_then(|s| { + attachment: tenant_shard.and_then(|s| { s.intent .get_attached() .map(|ps| (s.generation.expect("Test hook, not used on tenants that are mid-onboarding with a NULL generation").into().unwrap(), ps)) @@ -1320,11 +1321,11 @@ impl Service { let mut locked = self.inner.write().unwrap(); for (tenant_shard_id, observed_loc) in configs.tenant_shards { - let Some(tenant_state) = locked.tenants.get_mut(&tenant_shard_id) else { + let Some(tenant_shard) = locked.tenants.get_mut(&tenant_shard_id) else { cleanup.push(tenant_shard_id); continue; }; - tenant_state + tenant_shard .observed .locations .insert(node.get_id(), ObservedStateLocation { conf: observed_loc }); @@ -1495,13 +1496,13 @@ impl Service { }; for req_tenant in validate_req.tenants { - if let Some(tenant_state) = locked.tenants.get(&req_tenant.id) { - let valid = tenant_state.generation == Some(Generation::new(req_tenant.gen)); + if let Some(tenant_shard) = locked.tenants.get(&req_tenant.id) { + let valid = tenant_shard.generation == Some(Generation::new(req_tenant.gen)); tracing::info!( "handle_validate: {}(gen {}): valid={valid} (latest {:?})", req_tenant.id, req_tenant.gen, - tenant_state.generation + tenant_shard.generation ); response.tenants.push(ValidateResponseTenant { id: req_tenant.id, @@ -1687,7 +1688,7 @@ impl Service { continue; } Entry::Vacant(entry) => { - let state = entry.insert(TenantState::new( + let state = entry.insert(TenantShard::new( tenant_shard_id, ShardIdentity::from_params( tenant_shard_id.shard_number, @@ -1762,6 +1763,9 @@ impl Service { /// Part of [`Self::tenant_location_config`]: dissect an incoming location config request, /// and transform it into either a tenant creation of a series of shard updates. + /// + /// If the incoming request makes no changes, a [`TenantCreateOrUpdate::Update`] result will + /// still be returned. fn tenant_location_config_prepare( &self, tenant_id: TenantId, @@ -1809,17 +1813,12 @@ impl Service { _ => None, }; - if shard.policy != placement_policy - || shard.config != req.config.tenant_conf - || set_generation.is_some() - { - updates.push(ShardUpdate { - tenant_shard_id: *shard_id, - placement_policy: placement_policy.clone(), - tenant_config: req.config.tenant_conf.clone(), - generation: set_generation, - }); - } + updates.push(ShardUpdate { + tenant_shard_id: *shard_id, + placement_policy: placement_policy.clone(), + tenant_config: req.config.tenant_conf.clone(), + generation: set_generation, + }); } if create { @@ -1848,6 +1847,7 @@ impl Service { }, ) } else { + assert!(!updates.is_empty()); TenantCreateOrUpdate::Update(updates) } } @@ -2735,47 +2735,73 @@ impl Service { }) } - pub(crate) fn tenant_describe( + /// Returns None if the input iterator of shards does not include a shard with number=0 + fn tenant_describe_impl<'a>( &self, - tenant_id: TenantId, - ) -> Result { - let locked = self.inner.read().unwrap(); - + shards: impl Iterator, + ) -> Option { let mut shard_zero = None; - let mut shards = Vec::new(); + let mut describe_shards = Vec::new(); - for (tenant_shard_id, shard) in locked.tenants.range(TenantShardId::tenant_range(tenant_id)) - { - if tenant_shard_id.is_zero() { + for shard in shards { + if shard.tenant_shard_id.is_zero() { shard_zero = Some(shard); } - let response_shard = TenantDescribeResponseShard { - tenant_shard_id: *tenant_shard_id, + describe_shards.push(TenantDescribeResponseShard { + tenant_shard_id: shard.tenant_shard_id, node_attached: *shard.intent.get_attached(), node_secondary: shard.intent.get_secondary().to_vec(), last_error: shard.last_error.lock().unwrap().clone(), is_reconciling: shard.reconciler.is_some(), is_pending_compute_notification: shard.pending_compute_notification, is_splitting: matches!(shard.splitting, SplitState::Splitting), - }; - shards.push(response_shard); + scheduling_policy: *shard.get_scheduling_policy(), + }) } - let Some(shard_zero) = shard_zero else { - return Err(ApiError::NotFound( - anyhow::anyhow!("Tenant {tenant_id} not found").into(), - )); - }; + let shard_zero = shard_zero?; - Ok(TenantDescribeResponse { - shards, + Some(TenantDescribeResponse { + tenant_id: shard_zero.tenant_shard_id.tenant_id, + shards: describe_shards, stripe_size: shard_zero.shard.stripe_size, policy: shard_zero.policy.clone(), config: shard_zero.config.clone(), }) } + pub(crate) fn tenant_describe( + &self, + tenant_id: TenantId, + ) -> Result { + let locked = self.inner.read().unwrap(); + + self.tenant_describe_impl( + locked + .tenants + .range(TenantShardId::tenant_range(tenant_id)) + .map(|(_k, v)| v), + ) + .ok_or_else(|| ApiError::NotFound(anyhow::anyhow!("Tenant {tenant_id} not found").into())) + } + + pub(crate) fn tenant_list(&self) -> Vec { + let locked = self.inner.read().unwrap(); + + let mut result = Vec::new(); + for (_tenant_id, tenant_shards) in + &locked.tenants.iter().group_by(|(id, _shard)| id.tenant_id) + { + result.push( + self.tenant_describe_impl(tenant_shards.map(|(_k, v)| v)) + .expect("Groups are always non-empty"), + ); + } + + result + } + #[instrument(skip_all, fields(tenant_id=%op.tenant_id))] async fn abort_tenant_shard_split( &self, @@ -3012,7 +3038,7 @@ impl Service { }, ); - let mut child_state = TenantState::new(child, child_shard, policy.clone()); + let mut child_state = TenantShard::new(child, child_shard, policy.clone()); child_state.intent = IntentState::single(scheduler, Some(pageserver)); child_state.observed = ObservedState { locations: child_observed, @@ -3020,7 +3046,7 @@ impl Service { child_state.generation = Some(generation); child_state.config = config.clone(); - // The child's TenantState::splitting is intentionally left at the default value of Idle, + // The child's TenantShard::splitting is intentionally left at the default value of Idle, // as at this point in the split process we have succeeded and this part is infallible: // we will never need to do any special recovery from this state. @@ -3569,8 +3595,8 @@ impl Service { Ok(()) } - /// For debug/support: a full JSON dump of TenantStates. Returns a response so that - /// we don't have to make TenantState clonable in the return path. + /// For debug/support: a full JSON dump of TenantShards. Returns a response so that + /// we don't have to make TenantShard clonable in the return path. pub(crate) fn tenants_dump(&self) -> Result, ApiError> { let serialized = { let locked = self.inner.read().unwrap(); @@ -3674,7 +3700,7 @@ impl Service { } /// For debug/support: a JSON dump of the [`Scheduler`]. Returns a response so that - /// we don't have to make TenantState clonable in the return path. + /// we don't have to make TenantShard clonable in the return path. pub(crate) fn scheduler_dump(&self) -> Result, ApiError> { let serialized = { let locked = self.inner.read().unwrap(); @@ -3891,8 +3917,8 @@ impl Service { tracing::info!("Node {} transition to offline", node_id); let mut tenants_affected: usize = 0; - for (tenant_shard_id, tenant_state) in tenants { - if let Some(observed_loc) = tenant_state.observed.locations.get_mut(&node_id) { + for (tenant_shard_id, tenant_shard) in tenants { + if let Some(observed_loc) = tenant_shard.observed.locations.get_mut(&node_id) { // When a node goes offline, we set its observed configuration to None, indicating unknown: we will // not assume our knowledge of the node's configuration is accurate until it comes back online observed_loc.conf = None; @@ -3905,24 +3931,24 @@ impl Service { continue; } - if tenant_state.intent.demote_attached(node_id) { - tenant_state.sequence = tenant_state.sequence.next(); + if tenant_shard.intent.demote_attached(node_id) { + tenant_shard.sequence = tenant_shard.sequence.next(); // TODO: populate a ScheduleContext including all shards in the same tenant_id (only matters // for tenants without secondary locations: if they have a secondary location, then this // schedule() call is just promoting an existing secondary) let mut schedule_context = ScheduleContext::default(); - match tenant_state.schedule(scheduler, &mut schedule_context) { + match tenant_shard.schedule(scheduler, &mut schedule_context) { Err(e) => { // It is possible that some tenants will become unschedulable when too many pageservers // go offline: in this case there isn't much we can do other than make the issue observable. - // TODO: give TenantState a scheduling error attribute to be queried later. + // TODO: give TenantShard a scheduling error attribute to be queried later. tracing::warn!(%tenant_shard_id, "Scheduling error when marking pageserver {} offline: {e}", node_id); } Ok(()) => { if self - .maybe_reconcile_shard(tenant_state, &new_nodes) + .maybe_reconcile_shard(tenant_shard, &new_nodes) .is_some() { tenants_affected += 1; @@ -3941,10 +3967,10 @@ impl Service { tracing::info!("Node {} transition to active", node_id); // When a node comes back online, we must reconcile any tenant that has a None observed // location on the node. - for tenant_state in locked.tenants.values_mut() { - if let Some(observed_loc) = tenant_state.observed.locations.get_mut(&node_id) { + for tenant_shard in locked.tenants.values_mut() { + if let Some(observed_loc) = tenant_shard.observed.locations.get_mut(&node_id) { if observed_loc.conf.is_none() { - self.maybe_reconcile_shard(tenant_state, &new_nodes); + self.maybe_reconcile_shard(tenant_shard, &new_nodes); } } } @@ -4027,11 +4053,11 @@ impl Service { Ok(()) } - /// Convenience wrapper around [`TenantState::maybe_reconcile`] that provides + /// Convenience wrapper around [`TenantShard::maybe_reconcile`] that provides /// all the references to parts of Self that are needed fn maybe_reconcile_shard( &self, - shard: &mut TenantState, + shard: &mut TenantShard, nodes: &Arc>, ) -> Option { shard.maybe_reconcile( @@ -4097,7 +4123,7 @@ impl Service { let mut reconciles_spawned = 0; - let mut tenant_shards: Vec<&TenantState> = Vec::new(); + let mut tenant_shards: Vec<&TenantShard> = Vec::new(); // Limit on how many shards' optmizations each call to this function will execute. Combined // with the frequency of background calls, this acts as an implicit rate limit that runs a small @@ -4228,7 +4254,7 @@ impl Service { pub async fn shutdown(&self) { // Note that this already stops processing any results from reconciles: so - // we do not expect that our [`TenantState`] objects will reach a neat + // we do not expect that our [`TenantShard`] objects will reach a neat // final state. self.cancel.cancel(); diff --git a/control_plane/attachment_service/src/tenant_state.rs b/storage_controller/src/tenant_shard.rs similarity index 96% rename from control_plane/attachment_service/src/tenant_state.rs rename to storage_controller/src/tenant_shard.rs index 6717b8e178..58b8ef8d5d 100644 --- a/control_plane/attachment_service/src/tenant_state.rs +++ b/storage_controller/src/tenant_shard.rs @@ -50,7 +50,7 @@ where /// This struct implement Serialize for debugging purposes, but is _not_ persisted /// itself: see [`crate::persistence`] for the subset of tenant shard state that is persisted. #[derive(Serialize)] -pub(crate) struct TenantState { +pub(crate) struct TenantShard { pub(crate) tenant_shard_id: TenantShardId, pub(crate) shard: ShardIdentity, @@ -354,7 +354,7 @@ pub(crate) struct ReconcilerHandle { } /// When a reconcile task completes, it sends this result object -/// to be applied to the primary TenantState. +/// to be applied to the primary TenantShard. pub(crate) struct ReconcileResult { pub(crate) sequence: Sequence, /// On errors, `observed` should be treated as an incompleted description @@ -367,7 +367,7 @@ pub(crate) struct ReconcileResult { pub(crate) generation: Option, pub(crate) observed: ObservedState, - /// Set [`TenantState::pending_compute_notification`] from this flag + /// Set [`TenantShard::pending_compute_notification`] from this flag pub(crate) pending_compute_notification: bool, } @@ -379,7 +379,7 @@ impl ObservedState { } } -impl TenantState { +impl TenantShard { pub(crate) fn new( tenant_shard_id: TenantShardId, shard: ShardIdentity, @@ -1143,7 +1143,7 @@ pub(crate) mod tests { use super::*; - fn make_test_tenant_shard(policy: PlacementPolicy) -> TenantState { + fn make_test_tenant_shard(policy: PlacementPolicy) -> TenantShard { let tenant_id = TenantId::generate(); let shard_number = ShardNumber(0); let shard_count = ShardCount::new(1); @@ -1153,7 +1153,7 @@ pub(crate) mod tests { shard_number, shard_count, }; - TenantState::new( + TenantShard::new( tenant_shard_id, ShardIdentity::new( shard_number, @@ -1165,7 +1165,7 @@ pub(crate) mod tests { ) } - fn make_test_tenant(policy: PlacementPolicy, shard_count: ShardCount) -> Vec { + fn make_test_tenant(policy: PlacementPolicy, shard_count: ShardCount) -> Vec { let tenant_id = TenantId::generate(); (0..shard_count.count()) @@ -1177,7 +1177,7 @@ pub(crate) mod tests { shard_number, shard_count, }; - TenantState::new( + TenantShard::new( tenant_shard_id, ShardIdentity::new( shard_number, @@ -1202,24 +1202,24 @@ pub(crate) mod tests { let mut scheduler = Scheduler::new(nodes.values()); let mut context = ScheduleContext::default(); - let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Attached(1)); - tenant_state + let mut tenant_shard = make_test_tenant_shard(PlacementPolicy::Attached(1)); + tenant_shard .schedule(&mut scheduler, &mut context) .expect("we have enough nodes, scheduling should work"); // Expect to initially be schedule on to different nodes - assert_eq!(tenant_state.intent.secondary.len(), 1); - assert!(tenant_state.intent.attached.is_some()); + assert_eq!(tenant_shard.intent.secondary.len(), 1); + assert!(tenant_shard.intent.attached.is_some()); - let attached_node_id = tenant_state.intent.attached.unwrap(); - let secondary_node_id = *tenant_state.intent.secondary.iter().last().unwrap(); + let attached_node_id = tenant_shard.intent.attached.unwrap(); + let secondary_node_id = *tenant_shard.intent.secondary.iter().last().unwrap(); assert_ne!(attached_node_id, secondary_node_id); // Notifying the attached node is offline should demote it to a secondary - let changed = tenant_state.intent.demote_attached(attached_node_id); + let changed = tenant_shard.intent.demote_attached(attached_node_id); assert!(changed); - assert!(tenant_state.intent.attached.is_none()); - assert_eq!(tenant_state.intent.secondary.len(), 2); + assert!(tenant_shard.intent.attached.is_none()); + assert_eq!(tenant_shard.intent.secondary.len(), 2); // Update the scheduler state to indicate the node is offline nodes @@ -1229,18 +1229,18 @@ pub(crate) mod tests { scheduler.node_upsert(nodes.get(&attached_node_id).unwrap()); // Scheduling the node should promote the still-available secondary node to attached - tenant_state + tenant_shard .schedule(&mut scheduler, &mut context) .expect("active nodes are available"); - assert_eq!(tenant_state.intent.attached.unwrap(), secondary_node_id); + assert_eq!(tenant_shard.intent.attached.unwrap(), secondary_node_id); // The original attached node should have been retained as a secondary assert_eq!( - *tenant_state.intent.secondary.iter().last().unwrap(), + *tenant_shard.intent.secondary.iter().last().unwrap(), attached_node_id ); - tenant_state.intent.clear(&mut scheduler); + tenant_shard.intent.clear(&mut scheduler); Ok(()) } @@ -1250,48 +1250,48 @@ pub(crate) mod tests { let nodes = make_test_nodes(3); let mut scheduler = Scheduler::new(nodes.values()); - let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Attached(1)); + let mut tenant_shard = make_test_tenant_shard(PlacementPolicy::Attached(1)); - tenant_state.observed.locations.insert( + tenant_shard.observed.locations.insert( NodeId(3), ObservedStateLocation { conf: Some(LocationConfig { mode: LocationConfigMode::AttachedMulti, generation: Some(2), secondary_conf: None, - shard_number: tenant_state.shard.number.0, - shard_count: tenant_state.shard.count.literal(), - shard_stripe_size: tenant_state.shard.stripe_size.0, + shard_number: tenant_shard.shard.number.0, + shard_count: tenant_shard.shard.count.literal(), + shard_stripe_size: tenant_shard.shard.stripe_size.0, tenant_conf: TenantConfig::default(), }), }, ); - tenant_state.observed.locations.insert( + tenant_shard.observed.locations.insert( NodeId(2), ObservedStateLocation { conf: Some(LocationConfig { mode: LocationConfigMode::AttachedStale, generation: Some(1), secondary_conf: None, - shard_number: tenant_state.shard.number.0, - shard_count: tenant_state.shard.count.literal(), - shard_stripe_size: tenant_state.shard.stripe_size.0, + shard_number: tenant_shard.shard.number.0, + shard_count: tenant_shard.shard.count.literal(), + shard_stripe_size: tenant_shard.shard.stripe_size.0, tenant_conf: TenantConfig::default(), }), }, ); - tenant_state.intent_from_observed(&mut scheduler); + tenant_shard.intent_from_observed(&mut scheduler); // The highest generationed attached location gets used as attached - assert_eq!(tenant_state.intent.attached, Some(NodeId(3))); + assert_eq!(tenant_shard.intent.attached, Some(NodeId(3))); // Other locations get used as secondary - assert_eq!(tenant_state.intent.secondary, vec![NodeId(2)]); + assert_eq!(tenant_shard.intent.secondary, vec![NodeId(2)]); - scheduler.consistency_check(nodes.values(), [&tenant_state].into_iter())?; + scheduler.consistency_check(nodes.values(), [&tenant_shard].into_iter())?; - tenant_state.intent.clear(&mut scheduler); + tenant_shard.intent.clear(&mut scheduler); Ok(()) } @@ -1300,23 +1300,23 @@ pub(crate) mod tests { let nodes = make_test_nodes(3); let mut scheduler = Scheduler::new(nodes.values()); - let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Attached(1)); + let mut tenant_shard = make_test_tenant_shard(PlacementPolicy::Attached(1)); // In pause mode, schedule() shouldn't do anything - tenant_state.scheduling_policy = ShardSchedulingPolicy::Pause; - assert!(tenant_state + tenant_shard.scheduling_policy = ShardSchedulingPolicy::Pause; + assert!(tenant_shard .schedule(&mut scheduler, &mut ScheduleContext::default()) .is_ok()); - assert!(tenant_state.intent.all_pageservers().is_empty()); + assert!(tenant_shard.intent.all_pageservers().is_empty()); // In active mode, schedule() works - tenant_state.scheduling_policy = ShardSchedulingPolicy::Active; - assert!(tenant_state + tenant_shard.scheduling_policy = ShardSchedulingPolicy::Active; + assert!(tenant_shard .schedule(&mut scheduler, &mut ScheduleContext::default()) .is_ok()); - assert!(!tenant_state.intent.all_pageservers().is_empty()); + assert!(!tenant_shard.intent.all_pageservers().is_empty()); - tenant_state.intent.clear(&mut scheduler); + tenant_shard.intent.clear(&mut scheduler); Ok(()) } @@ -1429,7 +1429,7 @@ pub(crate) mod tests { fn optimize_til_idle( nodes: &HashMap, scheduler: &mut Scheduler, - shards: &mut [TenantState], + shards: &mut [TenantShard], ) { let mut loop_n = 0; loop { diff --git a/test_runner/fixtures/benchmark_fixture.py b/test_runner/fixtures/benchmark_fixture.py index e7959c1764..c32748f6f0 100644 --- a/test_runner/fixtures/benchmark_fixture.py +++ b/test_runner/fixtures/benchmark_fixture.py @@ -482,20 +482,18 @@ def pytest_terminal_summary( terminalreporter.section("Benchmark results", "-") is_header_printed = True - terminalreporter.write( - "{}.{}: ".format(test_report.head_line, recorded_property["name"]) - ) + terminalreporter.write(f"{test_report.head_line}.{recorded_property['name']}: ") unit = recorded_property["unit"] value = recorded_property["value"] if unit == "MB": - terminalreporter.write("{0:,.0f}".format(value), green=True) + terminalreporter.write(f"{value:,.0f}", green=True) elif unit in ("s", "ms") and isinstance(value, float): - terminalreporter.write("{0:,.3f}".format(value), green=True) + terminalreporter.write(f"{value:,.3f}", green=True) elif isinstance(value, float): - terminalreporter.write("{0:,.4f}".format(value), green=True) + terminalreporter.write(f"{value:,.4f}", green=True) else: terminalreporter.write(str(value), green=True) - terminalreporter.line(" {}".format(unit)) + terminalreporter.line(f" {unit}") result_entry.append(recorded_property) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index d0519d3406..0e4a58c099 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -520,9 +520,9 @@ class NeonEnvBuilder: self.env = NeonEnv(self) return self.env - def start(self, register_pageservers=False): + def start(self): assert self.env is not None, "environment is not already initialized, call init() first" - self.env.start(register_pageservers=register_pageservers) + self.env.start() def init_start( self, @@ -1115,8 +1115,8 @@ class NeonEnv: log.info(f"Config: {cfg}") self.neon_cli.init(cfg, force=config.config_init_force) - def start(self, register_pageservers=False): - # storage controller starts first, so that pageserver /re-attach calls don't + def start(self): + # Storage controller starts first, so that pageserver /re-attach calls don't # bounce through retries on startup self.storage_controller.start() @@ -1127,11 +1127,6 @@ class NeonEnv: # reconcile. wait_until(30, 1, storage_controller_ready) - if register_pageservers: - # Special case for forward compat tests, this can be removed later. - for pageserver in self.pageservers: - self.storage_controller.node_register(pageserver) - # Start up broker, pageserver and all safekeepers futs = [] with concurrent.futures.ThreadPoolExecutor( @@ -3605,7 +3600,7 @@ class Safekeeper: return self def stop(self, immediate: bool = False) -> "Safekeeper": - log.info("Stopping safekeeper {}".format(self.id)) + log.info(f"Stopping safekeeper {self.id}") self.env.neon_cli.safekeeper_stop(self.id, immediate) self.running = False return self @@ -4037,13 +4032,13 @@ def check_restored_datadir_content(test_output_dir: Path, env: NeonEnv, endpoint for f in mismatch: f1 = os.path.join(endpoint.pgdata_dir, f) f2 = os.path.join(restored_dir_path, f) - stdout_filename = "{}.filediff".format(f2) + stdout_filename = f"{f2}.filediff" with open(stdout_filename, "w") as stdout_f: - subprocess.run("xxd -b {} > {}.hex ".format(f1, f1), shell=True) - subprocess.run("xxd -b {} > {}.hex ".format(f2, f2), shell=True) + subprocess.run(f"xxd -b {f1} > {f1}.hex ", shell=True) + subprocess.run(f"xxd -b {f2} > {f2}.hex ", shell=True) - cmd = "diff {}.hex {}.hex".format(f1, f2) + cmd = f"diff {f1}.hex {f2}.hex" subprocess.run([cmd], stdout=stdout_f, shell=True) assert (mismatch, error) == ([], []) diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 6aebfbc99c..b899b0dac8 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -308,6 +308,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): params=params, ) self.verbose_error(res) + return res.json() def tenant_list_locations(self): res = self.get( @@ -341,8 +342,21 @@ class PageserverHttpClient(requests.Session, MetricsGetter): res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/ignore") self.verbose_error(res) - def tenant_status(self, tenant_id: Union[TenantId, TenantShardId]) -> Dict[Any, Any]: - res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}") + def tenant_status( + self, tenant_id: Union[TenantId, TenantShardId], activate: bool = False + ) -> Dict[Any, Any]: + """ + :activate: hint the server not to accelerate activation of this tenant in response + to this query. False by default for tests, because they generally want to observed the + system rather than interfering with it. This is true by default on the server side, + because in the field if the control plane is GET'ing a tenant it's a sign that it wants + to do something with it. + """ + params = {} + if not activate: + params["activate"] = "false" + + res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}", params=params) self.verbose_error(res) res_json = res.json() assert isinstance(res_json, dict) diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py index 693771dd3d..4b0dd7a815 100644 --- a/test_runner/fixtures/pageserver/utils.py +++ b/test_runner/fixtures/pageserver/utils.py @@ -204,13 +204,11 @@ def wait_for_last_record_lsn( return current_lsn if i % 10 == 0: log.info( - "{}/{} waiting for last_record_lsn to reach {}, now {}, iteration {}".format( - tenant, timeline, lsn, current_lsn, i + 1 - ) + f"{tenant}/{timeline} waiting for last_record_lsn to reach {lsn}, now {current_lsn}, iteration {i + 1}" ) time.sleep(0.1) raise Exception( - "timed out while waiting for last_record_lsn to reach {}, was {}".format(lsn, current_lsn) + f"timed out while waiting for last_record_lsn to reach {lsn}, was {current_lsn}" ) diff --git a/test_runner/fixtures/workload.py b/test_runner/fixtures/workload.py index ab8717de54..c44628ce06 100644 --- a/test_runner/fixtures/workload.py +++ b/test_runner/fixtures/workload.py @@ -81,9 +81,13 @@ class Workload: return self._endpoint - def __del__(self): + def stop(self): if self._endpoint is not None: self._endpoint.stop() + self._endpoint = None + + def __del__(self): + self.stop() def init(self, pageserver_id: Optional[int] = None): endpoint = self.endpoint(pageserver_id) diff --git a/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py index 324ef0d516..b66db4d0ab 100644 --- a/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py +++ b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py @@ -125,19 +125,19 @@ async def run_update_loop_worker(ep: Endpoint, n_txns: int, idx: int): await conn.execute(f"ALTER TABLE {table} SET (autovacuum_enabled = false)") await conn.execute(f"INSERT INTO {table} VALUES (1, 0)") await conn.execute( + f""" + CREATE PROCEDURE updating{table}() as + $$ + DECLARE + i integer; + BEGIN + FOR i IN 1..{n_txns} LOOP + UPDATE {table} SET x = x + 1 WHERE pk=1; + COMMIT; + END LOOP; + END + $$ LANGUAGE plpgsql """ - CREATE PROCEDURE updating{0}() as - $$ - DECLARE - i integer; - BEGIN - FOR i IN 1..{1} LOOP - UPDATE {0} SET x = x + 1 WHERE pk=1; - COMMIT; - END LOOP; - END - $$ LANGUAGE plpgsql - """.format(table, n_txns) ) await conn.execute("SET statement_timeout=0") await conn.execute(f"call updating{table}()") diff --git a/test_runner/performance/test_branch_creation.py b/test_runner/performance/test_branch_creation.py index 9777bf6748..54905759bd 100644 --- a/test_runner/performance/test_branch_creation.py +++ b/test_runner/performance/test_branch_creation.py @@ -78,7 +78,7 @@ def test_branch_creation_heavy_write(neon_compare: NeonCompare, n_branches: int) p = random.randint(0, i) timer = timeit.default_timer() - env.neon_cli.create_branch("b{}".format(i + 1), "b{}".format(p), tenant_id=tenant) + env.neon_cli.create_branch(f"b{i + 1}", f"b{p}", tenant_id=tenant) dur = timeit.default_timer() - timer log.info(f"Creating branch b{i+1} took {dur}s") diff --git a/test_runner/regress/test_branching.py b/test_runner/regress/test_branching.py index 2a7a3c41ac..5b69649007 100644 --- a/test_runner/regress/test_branching.py +++ b/test_runner/regress/test_branching.py @@ -84,11 +84,11 @@ def test_branching_with_pgbench( threads = [] if ty == "cascade": - env.neon_cli.create_branch("b{}".format(i + 1), "b{}".format(i), tenant_id=tenant) + env.neon_cli.create_branch(f"b{i + 1}", f"b{i}", tenant_id=tenant) else: - env.neon_cli.create_branch("b{}".format(i + 1), "b0", tenant_id=tenant) + env.neon_cli.create_branch(f"b{i + 1}", "b0", tenant_id=tenant) - endpoints.append(env.endpoints.create_start("b{}".format(i + 1), tenant_id=tenant)) + endpoints.append(env.endpoints.create_start(f"b{i + 1}", tenant_id=tenant)) threads.append( threading.Thread(target=run_pgbench, args=(endpoints[-1].connstr(),), daemon=True) diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index 5406acc005..208263a22a 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -192,6 +192,9 @@ def test_backward_compatibility( assert not breaking_changes_allowed, "Breaking changes are allowed by ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE, but the test has passed without any breakage" +# Forward compatibility is broken due to https://github.com/neondatabase/neon/pull/6530 +# The test is disabled until the next release deployment +@pytest.mark.xfail @check_ondisk_data_compatibility_if_enabled @pytest.mark.xdist_group("compatibility") @pytest.mark.order(after="test_create_snapshot") @@ -226,10 +229,6 @@ def test_forward_compatibility( ) try: - # TODO: remove this once the previous pageserrver version understands - # the 'get_vectored_impl' config - neon_env_builder.pageserver_get_vectored_impl = None - neon_env_builder.num_safekeepers = 3 neon_local_binpath = neon_env_builder.neon_binpath env = neon_env_builder.from_repo_dir( @@ -238,15 +237,11 @@ def test_forward_compatibility( pg_distrib_dir=compatibility_postgres_distrib_dir, ) - # TODO: remove this workaround after release-5090 is no longer the most recent release. - # There was a bug in that code that generates a warning in the storage controller log. - env.storage_controller.allowed_errors.append(".*no tenant_shard_id specified.*") - # Use current neon_local even though we're using old binaries for # everything else: our test code is written for latest CLI args. env.neon_local_binpath = neon_local_binpath - neon_env_builder.start(register_pageservers=True) + neon_env_builder.start() check_neon_works( env, diff --git a/test_runner/regress/test_large_schema.py b/test_runner/regress/test_large_schema.py index b6ac1aa41f..c5d5b5fe64 100644 --- a/test_runner/regress/test_large_schema.py +++ b/test_runner/regress/test_large_schema.py @@ -74,8 +74,8 @@ def test_large_schema(neon_env_builder: NeonEnvBuilder): cur.execute("select * from pg_depend order by refclassid, refobjid, refobjsubid") # Check layer file sizes - timeline_path = "{}/tenants/{}/timelines/{}/".format( - env.pageserver.workdir, env.initial_tenant, env.initial_timeline + timeline_path = ( + f"{env.pageserver.workdir}/tenants/{env.initial_tenant}/timelines/{env.initial_timeline}/" ) for filename in os.listdir(timeline_path): if filename.startswith("00000"): diff --git a/test_runner/regress/test_layer_bloating.py b/test_runner/regress/test_layer_bloating.py index 2fdee89389..77dc8a35b5 100644 --- a/test_runner/regress/test_layer_bloating.py +++ b/test_runner/regress/test_layer_bloating.py @@ -57,9 +57,7 @@ def test_layer_bloating(neon_simple_env: NeonEnv, vanilla_pg): time.sleep(10) # Check layer file sizes - timeline_path = "{}/tenants/{}/timelines/{}/".format( - env.pageserver.workdir, env.initial_tenant, timeline - ) + timeline_path = f"{env.pageserver.workdir}/tenants/{env.initial_tenant}/timelines/{timeline}/" log.info(f"Check {timeline_path}") for filename in os.listdir(timeline_path): if filename.startswith("00000"): diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index 41fa03cdf8..67f68a62af 100644 --- a/test_runner/regress/test_pageserver_generations.py +++ b/test_runner/regress/test_pageserver_generations.py @@ -9,7 +9,6 @@ of the pageserver are: - Updates to remote_consistent_lsn may only be made visible after validating generation """ - import enum import re import time @@ -23,6 +22,7 @@ from fixtures.neon_fixtures import ( NeonPageserver, PgBin, S3Scrubber, + flush_ep_to_pageserver, last_flush_lsn_upload, ) from fixtures.pageserver.http import PageserverApiException @@ -31,6 +31,7 @@ from fixtures.pageserver.utils import ( list_prefix, wait_for_last_record_lsn, wait_for_upload, + wait_for_upload_queue_empty, ) from fixtures.remote_storage import ( RemoteStorageKind, @@ -112,7 +113,6 @@ def generate_uploads_and_deletions( last_flush_lsn_upload( env, endpoint, tenant_id, timeline_id, pageserver_id=pageserver.id ) - ps_http.timeline_checkpoint(tenant_id, timeline_id) # Compaction should generate some GC-elegible layers for i in range(0, 2): @@ -122,6 +122,17 @@ def generate_uploads_and_deletions( print_gc_result(gc_result) assert gc_result["layers_removed"] > 0 + # Stop endpoint and flush all data to pageserver, then checkpoint it: this + # ensures that the pageserver is in a fully idle state: there will be no more + # background ingest, no more uploads pending, and therefore no non-determinism + # in subsequent actions like pageserver restarts. + final_lsn = flush_ep_to_pageserver(env, endpoint, tenant_id, timeline_id, pageserver.id) + ps_http.timeline_checkpoint(tenant_id, timeline_id) + # Finish uploads + wait_for_upload(ps_http, tenant_id, timeline_id, final_lsn) + # Finish all remote writes (including deletions) + wait_for_upload_queue_empty(ps_http, tenant_id, timeline_id) + def read_all( env: NeonEnv, tenant_id: Optional[TenantId] = None, timeline_id: Optional[TimelineId] = None @@ -386,9 +397,8 @@ def test_deletion_queue_recovery( if validate_before == ValidateBefore.NO_VALIDATE: failpoints.append( # Prevent deletion lists from being validated, we will test that they are - # dropped properly during recovery. 'pause' is okay here because we kill - # the pageserver with immediate=true - ("control-plane-client-validate", "pause") + # dropped properly during recovery. This is such a long sleep as to be equivalent to "never" + ("control-plane-client-validate", "return(3600000)") ) ps_http.configure_failpoints(failpoints) diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index ca6f77c75f..345abdc072 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -498,9 +498,19 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): ps_secondary.http_client().tenant_secondary_download(tenant_id) - assert list_layers(ps_attached, tenant_id, timeline_id) == list_layers( - ps_secondary, tenant_id, timeline_id - ) + try: + assert list_layers(ps_attached, tenant_id, timeline_id) == list_layers( + ps_secondary, tenant_id, timeline_id + ) + except: + # Do a full listing of the secondary location on errors, to help debug of + # https://github.com/neondatabase/neon/issues/6966 + timeline_path = ps_secondary.timeline_dir(tenant_id, timeline_id) + for path, _dirs, files in os.walk(timeline_path): + for f in files: + log.info(f"Secondary file: {os.path.join(path, f)}") + + raise # FIXME: this sleep is needed to avoid on-demand promotion of the layers we evict, while # walreceiver is still doing something. diff --git a/test_runner/regress/test_proxy.py b/test_runner/regress/test_proxy.py index 3e986a8f7b..f446f4f200 100644 --- a/test_runner/regress/test_proxy.py +++ b/test_runner/regress/test_proxy.py @@ -566,38 +566,6 @@ async def test_sql_over_http2(static_proxy: NeonProxy): assert resp["rows"] == [{"answer": 42}] -def test_sql_over_http_timeout_cancel(static_proxy: NeonProxy): - static_proxy.safe_psql("create role http with login password 'http' superuser") - - static_proxy.safe_psql("create table test_table ( id int primary key )") - - # insert into a table, with a unique constraint, after sleeping for n seconds - query = "WITH temp AS ( \ - SELECT pg_sleep($1) as sleep, $2::int as id \ - ) INSERT INTO test_table (id) SELECT id FROM temp" - - # expect to fail with timeout - res = static_proxy.http_query( - query, - [static_proxy.http_timeout_seconds + 1, 1], - user="http", - password="http", - expected_code=400, - ) - assert "Query cancelled, runtime exceeded" in res["message"], "HTTP query should time out" - - time.sleep(2) - - res = static_proxy.http_query(query, [1, 1], user="http", password="http", expected_code=200) - assert res["command"] == "INSERT", "HTTP query should insert" - assert res["rowCount"] == 1, "HTTP query should insert" - - res = static_proxy.http_query(query, [0, 1], user="http", password="http", expected_code=400) - assert ( - "duplicate key value violates unique constraint" in res["message"] - ), "HTTP query should conflict" - - def test_sql_over_http_connection_cancel(static_proxy: NeonProxy): static_proxy.safe_psql("create role http with login password 'http' superuser") diff --git a/test_runner/regress/test_read_validation.py b/test_runner/regress/test_read_validation.py index effb7e83f9..868b80a561 100644 --- a/test_runner/regress/test_read_validation.py +++ b/test_runner/regress/test_read_validation.py @@ -22,7 +22,7 @@ def test_read_validation(neon_simple_env: NeonEnv): with closing(endpoint.connect()) as con: with con.cursor() as c: for e in extensions: - c.execute("create extension if not exists {};".format(e)) + c.execute(f"create extension if not exists {e};") c.execute("create table foo (c int) with (autovacuum_enabled = false)") c.execute("insert into foo values (1)") @@ -42,14 +42,12 @@ def test_read_validation(neon_simple_env: NeonEnv): log.info("Test table is populated, validating buffer cache") cache_entries = query_scalar( - c, "select count(*) from pg_buffercache where relfilenode = {}".format(relfilenode) + c, f"select count(*) from pg_buffercache where relfilenode = {relfilenode}" ) assert cache_entries > 0, "No buffers cached for the test relation" c.execute( - "select reltablespace, reldatabase, relfilenode from pg_buffercache where relfilenode = {}".format( - relfilenode - ) + f"select reltablespace, reldatabase, relfilenode from pg_buffercache where relfilenode = {relfilenode}" ) reln = c.fetchone() assert reln is not None @@ -59,22 +57,20 @@ def test_read_validation(neon_simple_env: NeonEnv): c.execute("select clear_buffer_cache()") cache_entries = query_scalar( - c, "select count(*) from pg_buffercache where relfilenode = {}".format(relfilenode) + c, f"select count(*) from pg_buffercache where relfilenode = {relfilenode}" ) assert cache_entries == 0, "Failed to clear buffer cache" log.info("Cache is clear, reading stale page version") c.execute( - "select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', 0, '{}'))".format( - first[0] - ) + f"select lsn, lower, upper from page_header(get_raw_page_at_lsn('foo', 'main', 0, '{first[0]}'))" ) direct_first = c.fetchone() assert first == direct_first, "Failed fetch page at historic lsn" cache_entries = query_scalar( - c, "select count(*) from pg_buffercache where relfilenode = {}".format(relfilenode) + c, f"select count(*) from pg_buffercache where relfilenode = {relfilenode}" ) assert cache_entries == 0, "relation buffers detected after invalidation" @@ -87,7 +83,7 @@ def test_read_validation(neon_simple_env: NeonEnv): assert second == direct_latest, "Failed fetch page at latest lsn" cache_entries = query_scalar( - c, "select count(*) from pg_buffercache where relfilenode = {}".format(relfilenode) + c, f"select count(*) from pg_buffercache where relfilenode = {relfilenode}" ) assert cache_entries == 0, "relation buffers detected after invalidation" @@ -96,9 +92,7 @@ def test_read_validation(neon_simple_env: NeonEnv): ) c.execute( - "select lsn, lower, upper from page_header(get_raw_page_at_lsn( {}, {}, {}, 0, 0, '{}' ))".format( - reln[0], reln[1], reln[2], first[0] - ) + f"select lsn, lower, upper from page_header(get_raw_page_at_lsn({reln[0]}, {reln[1]}, {reln[2]}, 0, 0, '{first[0]}'))" ) direct_first = c.fetchone() assert first == direct_first, "Failed fetch page at historic lsn using oid" @@ -108,9 +102,7 @@ def test_read_validation(neon_simple_env: NeonEnv): ) c.execute( - "select lsn, lower, upper from page_header(get_raw_page_at_lsn( {}, {}, {}, 0, 0, NULL ))".format( - reln[0], reln[1], reln[2] - ) + f"select lsn, lower, upper from page_header(get_raw_page_at_lsn({reln[0]}, {reln[1]}, {reln[2]}, 0, 0, NULL))" ) direct_latest = c.fetchone() assert second == direct_latest, "Failed fetch page at latest lsn" @@ -122,9 +114,7 @@ def test_read_validation(neon_simple_env: NeonEnv): ) c.execute( - "select lsn, lower, upper from page_header(get_raw_page_at_lsn( {}, {}, {}, 0, 0, '{}' ))".format( - reln[0], reln[1], reln[2], first[0] - ) + f"select lsn, lower, upper from page_header(get_raw_page_at_lsn({reln[0]}, {reln[1]}, {reln[2]}, 0, 0, '{first[0]}'))" ) direct_first = c.fetchone() assert first == direct_first, "Failed fetch page at historic lsn using oid" @@ -134,7 +124,7 @@ def test_read_validation(neon_simple_env: NeonEnv): c.execute("select * from page_header(get_raw_page('foo', 'main', 0));") raise AssertionError("query should have failed") except UndefinedTable as e: - log.info("Caught an expected failure: {}".format(e)) + log.info(f"Caught an expected failure: {e}") def test_read_validation_neg(neon_simple_env: NeonEnv): @@ -148,7 +138,7 @@ def test_read_validation_neg(neon_simple_env: NeonEnv): with closing(endpoint.connect()) as con: with con.cursor() as c: for e in extensions: - c.execute("create extension if not exists {};".format(e)) + c.execute(f"create extension if not exists {e};") log.info("read a page of a missing relation") try: @@ -157,7 +147,7 @@ def test_read_validation_neg(neon_simple_env: NeonEnv): ) raise AssertionError("query should have failed") except UndefinedTable as e: - log.info("Caught an expected failure: {}".format(e)) + log.info(f"Caught an expected failure: {e}") c.execute("create table foo (c int) with (autovacuum_enabled = false)") c.execute("insert into foo values (1)") @@ -169,7 +159,7 @@ def test_read_validation_neg(neon_simple_env: NeonEnv): ) raise AssertionError("query should have failed") except IoError as e: - log.info("Caught an expected failure: {}".format(e)) + log.info(f"Caught an expected failure: {e}") log.info("Pass NULL as an input") expected = (None, None, None) diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 2699654f80..bfaab9125f 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -10,9 +10,13 @@ from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, + S3Scrubber, StorageControllerApiException, + last_flush_lsn_upload, tenant_get_shards, + wait_for_last_flush_lsn, ) +from fixtures.pageserver.utils import assert_prefix_empty, assert_prefix_not_empty from fixtures.remote_storage import s3_storage from fixtures.types import Lsn, TenantId, TenantShardId, TimelineId from fixtures.utils import wait_until @@ -67,6 +71,15 @@ def test_sharding_smoke( log.info(f"sizes = {sizes}") return sizes + # The imported initdb for timeline creation should + # not be fully imported on every shard. We use a 1MB strripe size so expect + # pretty good distribution: no one shard should have more than half the data + sizes = get_sizes() + physical_initdb_total = sum(sizes.values()) + expect_initdb_size = 20 * 1024 * 1024 + assert physical_initdb_total > expect_initdb_size + assert all(s < expect_initdb_size // 2 for s in sizes.values()) + # Test that timeline creation works on a sharded tenant timeline_b = env.neon_cli.create_branch("branch_b", tenant_id=tenant_id) @@ -99,6 +112,38 @@ def test_sharding_smoke( env.storage_controller.consistency_check() + # Validate that deleting a sharded tenant removes all files in the prefix + + # Before deleting, stop the client and check we have some objects to delete + workload.stop() + assert_prefix_not_empty( + neon_env_builder.pageserver_remote_storage, + prefix="/".join( + ( + "tenants", + str(tenant_id), + ) + ), + ) + + # Check the scrubber isn't confused by sharded content, then disable + # it during teardown because we'll have deleted by then + S3Scrubber(neon_env_builder).scan_metadata() + neon_env_builder.scrub_on_exit = False + + env.storage_controller.pageserver_api().tenant_delete(tenant_id) + assert_prefix_empty( + neon_env_builder.pageserver_remote_storage, + prefix="/".join( + ( + "tenants", + str(tenant_id), + ) + ), + ) + + env.storage_controller.consistency_check() + def test_sharding_split_unsharded( neon_env_builder: NeonEnvBuilder, @@ -466,13 +511,11 @@ def test_sharding_split_stripe_size( os.getenv("BUILD_TYPE") == "debug", reason="Avoid running bulkier ingest tests in debug mode", ) -def test_sharding_ingest( +def test_sharding_ingest_layer_sizes( neon_env_builder: NeonEnvBuilder, ): """ - Check behaviors related to ingest: - - That we generate properly sized layers - - TODO: that updates to remote_consistent_lsn are made correctly via safekeepers + Check that when ingesting data to a sharded tenant, we properly respect layer size limts. """ # Set a small stripe size and checkpoint distance, so that we can exercise rolling logic @@ -503,6 +546,7 @@ def test_sharding_ingest( workload.write_rows(4096, upload=False) workload.write_rows(4096, upload=False) workload.write_rows(4096, upload=False) + workload.validate() small_layer_count = 0 @@ -515,7 +559,9 @@ def test_sharding_ingest( shard_id = shard["shard_id"] layer_map = pageserver.http_client().layer_map_info(shard_id, timeline_id) - for layer in layer_map.historic_layers: + historic_layers = sorted(layer_map.historic_layers, key=lambda layer: layer.lsn_start) + + for layer in historic_layers: assert layer.layer_file_size is not None if layer.layer_file_size < expect_layer_size // 2: classification = "Small" @@ -552,6 +598,93 @@ def test_sharding_ingest( assert huge_layer_count <= shard_count +def test_sharding_ingest_gaps( + neon_env_builder: NeonEnvBuilder, +): + """ + Check ingest behavior when the incoming data results in some shards having gaps where + no data is ingested: they should advance their disk_consistent_lsn and remote_consistent_lsn + even if they aren't writing out layers. + """ + + # Set a small stripe size and checkpoint distance, so that we can exercise rolling logic + # without writing a lot of data. + expect_layer_size = 131072 + checkpoint_interval_secs = 5 + TENANT_CONF = { + # small checkpointing and compaction targets to ensure we generate many upload operations + "checkpoint_distance": f"{expect_layer_size}", + "compaction_target_size": f"{expect_layer_size}", + # Set a short checkpoint interval as we will wait for uploads to happen + "checkpoint_timeout": f"{checkpoint_interval_secs}s", + # Background checkpointing is done from compaction loop, so set that interval short too + "compaction_period": "1s", + } + shard_count = 4 + neon_env_builder.num_pageservers = shard_count + env = neon_env_builder.init_start( + initial_tenant_conf=TENANT_CONF, + initial_tenant_shard_count=shard_count, + initial_tenant_shard_stripe_size=128, + ) + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + # Just a few writes: we aim to produce a situation where some shards are skipping + # ingesting some records and thereby won't have layer files that advance their + # consistent LSNs, to exercise the code paths that explicitly handle this case by + # advancing consistent LSNs in the background if there is no open layer. + workload = Workload(env, tenant_id, timeline_id) + workload.init() + workload.write_rows(128, upload=False) + workload.churn_rows(128, upload=False) + + # Checkpoint, so that we won't get a background checkpoint happening during the next step + workload.endpoint().safe_psql("checkpoint") + # Freeze + flush, so that subsequent writes will start from a position of no open layers + last_flush_lsn_upload(env, workload.endpoint(), tenant_id, timeline_id) + + # This write is tiny: at least some of the shards should find they don't have any + # data to ingest. This will exercise how they handle that. + workload.churn_rows(1, upload=False) + + # The LSN that has reached pageservers, but may not have been flushed to historic layers yet + expect_lsn = wait_for_last_flush_lsn(env, workload.endpoint(), tenant_id, timeline_id) + + # Don't leave the endpoint running, we don't want it writing in the background + workload.stop() + + log.info(f"Waiting for shards' consistent LSNs to reach {expect_lsn}") + + shards = tenant_get_shards(env, tenant_id, None) + + def assert_all_disk_consistent(): + """ + Assert that all the shards' disk_consistent_lsns have reached expect_lsn + """ + for tenant_shard_id, pageserver in shards: + timeline_detail = pageserver.http_client().timeline_detail(tenant_shard_id, timeline_id) + log.info(f"{tenant_shard_id} (ps {pageserver.id}) detail: {timeline_detail}") + assert Lsn(timeline_detail["disk_consistent_lsn"]) >= expect_lsn + + # We set a short checkpoint timeout: expect things to get frozen+flushed within that + wait_until(checkpoint_interval_secs * 3, 1, assert_all_disk_consistent) + + def assert_all_remote_consistent(): + """ + Assert that all the shards' remote_consistent_lsns have reached expect_lsn + """ + for tenant_shard_id, pageserver in shards: + timeline_detail = pageserver.http_client().timeline_detail(tenant_shard_id, timeline_id) + log.info(f"{tenant_shard_id} (ps {pageserver.id}) detail: {timeline_detail}") + assert Lsn(timeline_detail["remote_consistent_lsn"]) >= expect_lsn + + # We set a short checkpoint timeout: expect things to get frozen+flushed within that + wait_until(checkpoint_interval_secs * 3, 1, assert_all_remote_consistent) + + workload.validate() + + class Failure: pageserver_id: Optional[int] @@ -795,6 +928,8 @@ def test_sharding_split_failures( ".*Reconcile error: receive body: error sending request for url.*", # Node offline cases will fail inside reconciler when detaching secondaries ".*Reconcile error on shard.*: receive body: error sending request for url.*", + # While parent shard's client is stopped during split, flush loop updating LSNs will emit this warning + ".*Failed to schedule metadata upload after updating disk_consistent_lsn.*", ] ) diff --git a/test_runner/regress/test_sharding_service.py b/test_runner/regress/test_storage_controller.py similarity index 88% rename from test_runner/regress/test_sharding_service.py rename to test_runner/regress/test_storage_controller.py index 5a86e03d2b..840f354142 100644 --- a/test_runner/regress/test_sharding_service.py +++ b/test_runner/regress/test_storage_controller.py @@ -1,3 +1,4 @@ +import json import time from collections import defaultdict from datetime import datetime, timezone @@ -24,7 +25,7 @@ from fixtures.pageserver.utils import ( from fixtures.pg_version import PgVersion from fixtures.remote_storage import RemoteStorageKind, s3_storage from fixtures.types import TenantId, TenantShardId, TimelineId -from fixtures.utils import run_pg_bench_small, wait_until +from fixtures.utils import run_pg_bench_small, subprocess_capture, wait_until from mypy_boto3_s3.type_defs import ( ObjectTypeDef, ) @@ -41,11 +42,11 @@ def get_node_shard_counts(env: NeonEnv, tenant_ids): return counts -def test_sharding_service_smoke( +def test_storage_controller_smoke( neon_env_builder: NeonEnvBuilder, ): """ - Test the basic lifecycle of a sharding service: + Test the basic lifecycle of a storage controller: - Restarting - Restarting a pageserver - Creating and deleting tenants and timelines @@ -203,7 +204,7 @@ def test_node_status_after_restart( env.storage_controller.consistency_check() -def test_sharding_service_passthrough( +def test_storage_controller_passthrough( neon_env_builder: NeonEnvBuilder, ): """ @@ -230,7 +231,7 @@ def test_sharding_service_passthrough( env.storage_controller.consistency_check() -def test_sharding_service_restart(neon_env_builder: NeonEnvBuilder): +def test_storage_controller_restart(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() tenant_a = env.initial_tenant tenant_b = TenantId.generate() @@ -265,7 +266,7 @@ def test_sharding_service_restart(neon_env_builder: NeonEnvBuilder): @pytest.mark.parametrize("warm_up", [True, False]) -def test_sharding_service_onboarding(neon_env_builder: NeonEnvBuilder, warm_up: bool): +def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up: bool): """ We onboard tenants to the sharding service by treating it as a 'virtual pageserver' which provides the /location_config API. This is similar to creating a tenant, @@ -302,7 +303,7 @@ def test_sharding_service_onboarding(neon_env_builder: NeonEnvBuilder, warm_up: origin_ps.http_client().tenant_create(tenant_id, generation=generation) # As if doing a live migration, first configure origin into stale mode - origin_ps.http_client().tenant_location_conf( + r = origin_ps.http_client().tenant_location_conf( tenant_id, { "mode": "AttachedStale", @@ -311,6 +312,7 @@ def test_sharding_service_onboarding(neon_env_builder: NeonEnvBuilder, warm_up: "generation": generation, }, ) + assert len(r["shards"]) == 1 if warm_up: origin_ps.http_client().tenant_heatmap_upload(tenant_id) @@ -331,7 +333,7 @@ def test_sharding_service_onboarding(neon_env_builder: NeonEnvBuilder, warm_up: # Call into storage controller to onboard the tenant generation += 1 - virtual_ps_http.tenant_location_conf( + r = virtual_ps_http.tenant_location_conf( tenant_id, { "mode": "AttachedMulti", @@ -340,6 +342,7 @@ def test_sharding_service_onboarding(neon_env_builder: NeonEnvBuilder, warm_up: "generation": generation, }, ) + assert len(r["shards"]) == 1 # As if doing a live migration, detach the original pageserver origin_ps.http_client().tenant_location_conf( @@ -356,7 +359,7 @@ def test_sharding_service_onboarding(neon_env_builder: NeonEnvBuilder, warm_up: # set it to AttachedSingle: this is a no-op, but we test it because the # cloud control plane may call this for symmetry with live migration to # an individual pageserver - virtual_ps_http.tenant_location_conf( + r = virtual_ps_http.tenant_location_conf( tenant_id, { "mode": "AttachedSingle", @@ -365,6 +368,7 @@ def test_sharding_service_onboarding(neon_env_builder: NeonEnvBuilder, warm_up: "generation": generation, }, ) + assert len(r["shards"]) == 1 # We should see the tenant is now attached to the pageserver managed # by the sharding service @@ -395,7 +399,7 @@ def test_sharding_service_onboarding(neon_env_builder: NeonEnvBuilder, warm_up: # The generation has moved on since we onboarded assert generation != dest_tenant_before_conf_change["generation"] - virtual_ps_http.tenant_location_conf( + r = virtual_ps_http.tenant_location_conf( tenant_id, { "mode": "AttachedSingle", @@ -405,6 +409,7 @@ def test_sharding_service_onboarding(neon_env_builder: NeonEnvBuilder, warm_up: "generation": generation, }, ) + assert len(r["shards"]) == 1 dest_tenant_after_conf_change = dest_ps.http_client().tenant_status(tenant_id) assert ( dest_tenant_after_conf_change["generation"] == dest_tenant_before_conf_change["generation"] @@ -415,7 +420,7 @@ def test_sharding_service_onboarding(neon_env_builder: NeonEnvBuilder, warm_up: env.storage_controller.consistency_check() -def test_sharding_service_compute_hook( +def test_storage_controller_compute_hook( httpserver: HTTPServer, neon_env_builder: NeonEnvBuilder, httpserver_listen_address, @@ -528,7 +533,7 @@ def test_sharding_service_compute_hook( env.storage_controller.consistency_check() -def test_sharding_service_debug_apis(neon_env_builder: NeonEnvBuilder): +def test_storage_controller_debug_apis(neon_env_builder: NeonEnvBuilder): """ Verify that occasional-use debug APIs work as expected. This is a lightweight test that just hits the endpoints to check that they don't bitrot. @@ -589,7 +594,7 @@ def test_sharding_service_debug_apis(neon_env_builder: NeonEnvBuilder): env.storage_controller.consistency_check() -def test_sharding_service_s3_time_travel_recovery( +def test_storage_controller_s3_time_travel_recovery( neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, ): @@ -699,7 +704,7 @@ def test_sharding_service_s3_time_travel_recovery( env.storage_controller.consistency_check() -def test_sharding_service_auth(neon_env_builder: NeonEnvBuilder): +def test_storage_controller_auth(neon_env_builder: NeonEnvBuilder): neon_env_builder.auth_enabled = True env = neon_env_builder.init_start() svc = env.storage_controller @@ -723,13 +728,18 @@ def test_sharding_service_auth(neon_env_builder: NeonEnvBuilder): StorageControllerApiException, match="Forbidden: JWT authentication error", ): - svc.request("POST", f"{api}/v1/tenant", json=body, headers=svc.headers(TokenScope.ADMIN)) + svc.request( + "POST", f"{api}/v1/tenant", json=body, headers=svc.headers(TokenScope.SAFEKEEPER_DATA) + ) # Token with correct scope svc.request( "POST", f"{api}/v1/tenant", json=body, headers=svc.headers(TokenScope.PAGE_SERVER_API) ) + # Token with admin scope should also be permitted + svc.request("POST", f"{api}/v1/tenant", json=body, headers=svc.headers(TokenScope.ADMIN)) + # No token with pytest.raises( StorageControllerApiException, @@ -763,7 +773,7 @@ def test_sharding_service_auth(neon_env_builder: NeonEnvBuilder): ) -def test_sharding_service_tenant_conf(neon_env_builder: NeonEnvBuilder): +def test_storage_controller_tenant_conf(neon_env_builder: NeonEnvBuilder): """ Validate the pageserver-compatible API endpoints for setting and getting tenant conf, without supplying the whole LocationConf. @@ -866,7 +876,7 @@ def build_node_to_tenants_map(env: NeonEnv) -> dict[int, list[TenantId]]: PageserverFailpoint(pageserver_id=1, failpoint="get-utilization-http-handler"), ], ) -def test_sharding_service_heartbeats( +def test_storage_controller_heartbeats( neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, failure: Failure ): neon_env_builder.num_pageservers = 2 @@ -976,7 +986,7 @@ def test_sharding_service_heartbeats( wait_until(10, 1, storage_controller_consistent) -def test_sharding_service_re_attach(neon_env_builder: NeonEnvBuilder): +def test_storage_controller_re_attach(neon_env_builder: NeonEnvBuilder): """ Exercise the behavior of the /re-attach endpoint on pageserver startup when pageservers have a mixture of attached and secondary locations @@ -1131,3 +1141,99 @@ def test_storage_controller_shard_scheduling_policy(neon_env_builder: NeonEnvBui # And indeed the tenant should be attached assert len(env.pageserver.http_client().tenant_list_locations()["tenant_shards"]) == 1 + + +def test_storcon_cli(neon_env_builder: NeonEnvBuilder): + """ + The storage controller command line interface (storcon-cli) is an internal tool. Most tests + just use the APIs directly: this test exercises some basics of the CLI as a regression test + that the client remains usable as the server evolves. + """ + output_dir = neon_env_builder.test_output_dir + shard_count = 4 + env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count) + base_args = [env.neon_binpath / "storcon_cli", "--api", env.storage_controller_api] + + def storcon_cli(args): + """ + CLI wrapper: returns stdout split into a list of non-empty strings + """ + (output_path, stdout, status_code) = subprocess_capture( + output_dir, + [str(s) for s in base_args + args], + echo_stderr=True, + echo_stdout=True, + env={}, + check=False, + capture_stdout=True, + timeout=10, + ) + if status_code: + log.warning(f"Command {args} failed") + log.warning(f"Output at: {output_path}") + + raise RuntimeError("CLI failure (check logs for stderr)") + + assert stdout is not None + return [line.strip() for line in stdout.split("\n") if line.strip()] + + # List nodes + node_lines = storcon_cli(["nodes"]) + # Table header, footer, and one line of data + assert len(node_lines) == 5 + assert "localhost" in node_lines[3] + + # Pause scheduling onto a node + storcon_cli(["node-configure", "--node-id", "1", "--scheduling", "pause"]) + assert "Pause" in storcon_cli(["nodes"])[3] + + # We will simulate a node death and then marking it offline + env.pageservers[0].stop(immediate=True) + # Sleep to make it unlikely that the controller's heartbeater will race handling + # a /utilization response internally, such that it marks the node back online. IRL + # there would always be a longer delay than this before a node failing and a human + # intervening. + time.sleep(2) + + storcon_cli(["node-configure", "--node-id", "1", "--availability", "offline"]) + assert "Offline" in storcon_cli(["nodes"])[3] + + # List tenants + tenant_lines = storcon_cli(["tenants"]) + assert len(tenant_lines) == 5 + assert str(env.initial_tenant) in tenant_lines[3] + + # Setting scheduling policies intentionally result in warnings, they're for rare use. + env.storage_controller.allowed_errors.extend( + [".*Skipping reconcile for policy.*", ".*Scheduling is disabled by policy.*"] + ) + + # Describe a tenant + tenant_lines = storcon_cli(["tenant-describe", "--tenant-id", str(env.initial_tenant)]) + assert len(tenant_lines) == 3 + shard_count * 2 + assert str(env.initial_tenant) in tenant_lines[3] + + # Pause changes on a tenant + storcon_cli(["tenant-policy", "--tenant-id", str(env.initial_tenant), "--scheduling", "stop"]) + assert "Stop" in storcon_cli(["tenants"])[3] + + # Change a tenant's placement + storcon_cli( + ["tenant-policy", "--tenant-id", str(env.initial_tenant), "--placement", "secondary"] + ) + assert "Secondary" in storcon_cli(["tenants"])[3] + + # Modify a tenant's config + storcon_cli( + [ + "tenant-config", + "--tenant-id", + str(env.initial_tenant), + "--config", + json.dumps({"pitr_interval": "1m"}), + ] + ) + + # Quiesce any background reconciliation before doing consistency check + env.storage_controller.reconcile_until_idle(timeout_secs=10) + env.storage_controller.consistency_check() diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py index 025cc930d7..4c8fd4b0e5 100644 --- a/test_runner/regress/test_tenant_size.py +++ b/test_runner/regress/test_tenant_size.py @@ -20,9 +20,10 @@ from fixtures.pg_version import PgVersion from fixtures.types import Lsn, TenantId, TimelineId -@pytest.mark.xfail -def test_empty_tenant_size(neon_simple_env: NeonEnv, test_output_dir: Path): - env = neon_simple_env +def test_empty_tenant_size(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_configs() + env.start() + (tenant_id, _) = env.neon_cli.create_tenant() http_client = env.pageserver.http_client() initial_size = http_client.tenant_size(tenant_id) @@ -35,66 +36,25 @@ def test_empty_tenant_size(neon_simple_env: NeonEnv, test_output_dir: Path): branch_name, main_timeline_id = env.neon_cli.list_timelines(tenant_id)[0] assert branch_name == main_branch_name - with env.endpoints.create_start( + endpoint = env.endpoints.create_start( main_branch_name, tenant_id=tenant_id, config_lines=["autovacuum=off", "checkpoint_timeout=10min"], - ) as endpoint: - with endpoint.cursor() as cur: - cur.execute("SELECT 1") - row = cur.fetchone() - assert row is not None - assert row[0] == 1 - size = http_client.tenant_size(tenant_id) - # we've disabled the autovacuum and checkpoint - # so background processes should not change the size. - # If this test will flake we should probably loosen the check - assert ( - size == initial_size - ), f"starting idle compute should not change the tenant size (Currently {size}, expected {initial_size})" + ) - # the size should be the same, until we increase the size over the - # gc_horizon - size, inputs = http_client.tenant_size_and_modelinputs(tenant_id) - assert ( - size == initial_size - ), f"tenant_size should not be affected by shutdown of compute (Currently {size}, expected {initial_size})" + with endpoint.cursor() as cur: + cur.execute("SELECT 1") + row = cur.fetchone() + assert row is not None + assert row[0] == 1 - expected_inputs = { - "segments": [ - { - "segment": {"parent": None, "lsn": 23694408, "size": 25362432, "needed": True}, - "timeline_id": f"{main_timeline_id}", - "kind": "BranchStart", - }, - { - "segment": {"parent": 0, "lsn": 23694528, "size": None, "needed": True}, - "timeline_id": f"{main_timeline_id}", - "kind": "BranchEnd", - }, - ], - "timeline_inputs": [ - { - "timeline_id": f"{main_timeline_id}", - "ancestor_id": None, - "ancestor_lsn": "0/0", - "last_record": "0/1698CC0", - "latest_gc_cutoff": "0/1698C48", - "horizon_cutoff": "0/0", - "pitr_cutoff": "0/0", - "next_gc_cutoff": "0/0", - "retention_param_cutoff": None, - } - ], - } - expected_inputs = mask_model_inputs(expected_inputs) - actual_inputs = mask_model_inputs(inputs) + # The transaction above will make the compute generate a checkpoint. + # In turn, the pageserver persists the checkpoint. This should only be + # one key with a size of a couple hundred bytes. + wait_for_last_flush_lsn(env, endpoint, tenant_id, main_timeline_id) + size = http_client.tenant_size(tenant_id) - assert expected_inputs == actual_inputs - - size_debug_file = open(test_output_dir / "size_debug.html", "w") - size_debug = http_client.tenant_size_debug(tenant_id) - size_debug_file.write(size_debug) + assert size >= initial_size and size - initial_size < 1024 def test_branched_empty_timeline_size(neon_simple_env: NeonEnv, test_output_dir: Path): @@ -190,7 +150,6 @@ def test_branched_from_many_empty_parents_size(neon_simple_env: NeonEnv, test_ou size_debug_file.write(size_debug) -@pytest.mark.skip("This should work, but is left out because assumed covered by other tests") def test_branch_point_within_horizon(neon_simple_env: NeonEnv, test_output_dir: Path): """ gc_horizon = 15 @@ -233,7 +192,6 @@ def test_branch_point_within_horizon(neon_simple_env: NeonEnv, test_output_dir: size_debug_file.write(size_debug) -@pytest.mark.skip("This should work, but is left out because assumed covered by other tests") def test_parent_within_horizon(neon_simple_env: NeonEnv, test_output_dir: Path): """ gc_horizon = 5 @@ -282,7 +240,6 @@ def test_parent_within_horizon(neon_simple_env: NeonEnv, test_output_dir: Path): size_debug_file.write(size_debug) -@pytest.mark.skip("This should work, but is left out because assumed covered by other tests") def test_only_heads_within_horizon(neon_simple_env: NeonEnv, test_output_dir: Path): """ gc_horizon = small diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 2cac58dc1a..ac1a747df3 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -103,9 +103,7 @@ def test_many_timelines(neon_env_builder: NeonEnvBuilder): n_timelines = 3 - branch_names = [ - "test_safekeepers_many_timelines_{}".format(tlin) for tlin in range(n_timelines) - ] + branch_names = [f"test_safekeepers_many_timelines_{tlin}" for tlin in range(n_timelines)] # pageserver, safekeeper operate timelines via their ids (can be represented in hex as 'ad50847381e248feaac9876cc71ae418') # that's not really human readable, so the branch names are introduced in Neon CLI. # Neon CLI stores its branch <-> timeline mapping in its internals, @@ -1136,13 +1134,13 @@ def cmp_sk_wal(sks: List[Safekeeper], tenant_id: TenantId, timeline_id: Timeline for f in mismatch: f1 = os.path.join(sk0.timeline_dir(tenant_id, timeline_id), f) f2 = os.path.join(sk.timeline_dir(tenant_id, timeline_id), f) - stdout_filename = "{}.filediff".format(f2) + stdout_filename = f"{f2}.filediff" with open(stdout_filename, "w") as stdout_f: - subprocess.run("xxd {} > {}.hex ".format(f1, f1), shell=True) - subprocess.run("xxd {} > {}.hex ".format(f2, f2), shell=True) + subprocess.run(f"xxd {f1} > {f1}.hex ", shell=True) + subprocess.run(f"xxd {f2} > {f2}.hex ", shell=True) - cmd = "diff {}.hex {}.hex".format(f1, f2) + cmd = f"diff {f1}.hex {f2}.hex" subprocess.run([cmd], stdout=stdout_f, shell=True) assert (mismatch, not_regular) == ( diff --git a/test_runner/regress/test_wal_acceptor_async.py b/test_runner/regress/test_wal_acceptor_async.py index 720633189e..5902eb3217 100644 --- a/test_runner/regress/test_wal_acceptor_async.py +++ b/test_runner/regress/test_wal_acceptor_async.py @@ -10,6 +10,7 @@ import pytest import toml from fixtures.log_helper import getLogger from fixtures.neon_fixtures import Endpoint, NeonEnv, NeonEnvBuilder, Safekeeper +from fixtures.remote_storage import RemoteStorageKind from fixtures.types import Lsn, TenantId, TimelineId log = getLogger("root.safekeeper_async") @@ -76,20 +77,20 @@ class WorkerStats(object): self.counters[worker_id] += 1 def check_progress(self): - log.debug("Workers progress: {}".format(self.counters)) + log.debug(f"Workers progress: {self.counters}") # every worker should finish at least one tx assert all(cnt > 0 for cnt in self.counters) progress = sum(self.counters) - log.info("All workers made {} transactions".format(progress)) + log.info(f"All workers made {progress} transactions") async def run_random_worker( stats: WorkerStats, endpoint: Endpoint, worker_id, n_accounts, max_transfer ): pg_conn = await endpoint.connect_async() - log.debug("Started worker {}".format(worker_id)) + log.debug(f"Started worker {worker_id}") while stats.running: from_uid = random.randint(0, n_accounts - 1) @@ -99,9 +100,9 @@ async def run_random_worker( await bank_transfer(pg_conn, from_uid, to_uid, amount) stats.inc_progress(worker_id) - log.debug("Executed transfer({}) {} => {}".format(amount, from_uid, to_uid)) + log.debug(f"Executed transfer({amount}) {from_uid} => {to_uid}") - log.debug("Finished worker {}".format(worker_id)) + log.debug(f"Finished worker {worker_id}") await pg_conn.close() @@ -199,7 +200,9 @@ async def run_restarts_under_load( # assert that at least one transaction has completed in every worker stats.check_progress() - victim.start() + # testing #6530, temporary here + # TODO: remove afer partial backup is enabled by default + victim.start(extra_opts=["--partial-backup-enabled", "--partial-backup-timeout=2s"]) log.info("Iterations are finished, exiting coroutines...") stats.running = False @@ -213,6 +216,7 @@ async def run_restarts_under_load( # Restart acceptors one by one, while executing and validating bank transactions def test_restarts_under_load(neon_env_builder: NeonEnvBuilder): neon_env_builder.num_safekeepers = 3 + neon_env_builder.enable_safekeeper_remote_storage(RemoteStorageKind.LOCAL_FS) env = neon_env_builder.init_start() env.neon_cli.create_branch("test_safekeepers_restarts_under_load")