From 7456e5b71cf5a09b1109a3d6c04eadda73457069 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Tue, 28 Mar 2023 17:04:02 +0300 Subject: [PATCH 01/25] Add script to collect state from safekeepers (#3835) Add an ansible script to collect https://github.com/neondatabase/neon/pull/3710 state JSON from all safekeeper nodes and upload them to a postgres table. --- scripts/sk_collect_dumps/.gitignore | 2 ++ scripts/sk_collect_dumps/readme.md | 25 +++++++++++++ scripts/sk_collect_dumps/remote.yaml | 18 ++++++++++ scripts/sk_collect_dumps/upload.sh | 52 ++++++++++++++++++++++++++++ 4 files changed, 97 insertions(+) create mode 100644 scripts/sk_collect_dumps/.gitignore create mode 100644 scripts/sk_collect_dumps/readme.md create mode 100644 scripts/sk_collect_dumps/remote.yaml create mode 100755 scripts/sk_collect_dumps/upload.sh diff --git a/scripts/sk_collect_dumps/.gitignore b/scripts/sk_collect_dumps/.gitignore new file mode 100644 index 0000000000..d9d4d0296a --- /dev/null +++ b/scripts/sk_collect_dumps/.gitignore @@ -0,0 +1,2 @@ +result +*.json diff --git a/scripts/sk_collect_dumps/readme.md b/scripts/sk_collect_dumps/readme.md new file mode 100644 index 0000000000..52b73e9495 --- /dev/null +++ b/scripts/sk_collect_dumps/readme.md @@ -0,0 +1,25 @@ +# Collect /v1/debug_dump from all safekeeper nodes + +1. Run ansible playbooks to collect .json dumps from all safekeepers and store them in `./result` directory. +2. Run `DB_CONNSTR=... ./upload.sh prod_feb30` to upload dumps to `prod_feb30` table in specified postgres database. + +## How to use ansible (staging) + +``` +AWS_DEFAULT_PROFILE=dev ansible-playbook -i ../../.github/ansible/staging.us-east-2.hosts.yaml -e @../../.github/ansible/ssm_config remote.yaml + +AWS_DEFAULT_PROFILE=dev ansible-playbook -i ../../.github/ansible/staging.eu-west-1.hosts.yaml -e @../../.github/ansible/ssm_config remote.yaml +``` + +## How to use ansible (prod) + +``` +AWS_DEFAULT_PROFILE=prod ansible-playbook -i ../../.github/ansible/prod.us-west-2.hosts.yaml -e @../../.github/ansible/ssm_config remote.yaml + +AWS_DEFAULT_PROFILE=prod ansible-playbook -i ../../.github/ansible/prod.us-east-2.hosts.yaml -e @../../.github/ansible/ssm_config remote.yaml + +AWS_DEFAULT_PROFILE=prod ansible-playbook -i ../../.github/ansible/prod.eu-central-1.hosts.yaml -e @../../.github/ansible/ssm_config remote.yaml + +AWS_DEFAULT_PROFILE=prod ansible-playbook -i ../../.github/ansible/prod.ap-southeast-1.hosts.yaml -e @../../.github/ansible/ssm_config remote.yaml +``` + diff --git a/scripts/sk_collect_dumps/remote.yaml b/scripts/sk_collect_dumps/remote.yaml new file mode 100644 index 0000000000..29ce83efde --- /dev/null +++ b/scripts/sk_collect_dumps/remote.yaml @@ -0,0 +1,18 @@ +- name: Fetch state dumps from safekeepers + hosts: safekeepers + gather_facts: False + remote_user: "{{ remote_user }}" + + tasks: + - name: Download file + get_url: + url: "http://{{ inventory_hostname }}:7676/v1/debug_dump?dump_all=true&dump_disk_content=false" + dest: "/tmp/{{ inventory_hostname }}.json" + + - name: Fetch file from remote hosts + fetch: + src: "/tmp/{{ inventory_hostname }}.json" + dest: "./result/{{ inventory_hostname }}.json" + flat: yes + fail_on_missing: no + diff --git a/scripts/sk_collect_dumps/upload.sh b/scripts/sk_collect_dumps/upload.sh new file mode 100755 index 0000000000..2e54ecba1c --- /dev/null +++ b/scripts/sk_collect_dumps/upload.sh @@ -0,0 +1,52 @@ +#!/bin/bash + +if [ -z "$DB_CONNSTR" ]; then + echo "DB_CONNSTR is not set" + exit 1 +fi + +# Create a temporary table for JSON data +psql $DB_CONNSTR -c 'DROP TABLE IF EXISTS tmp_json' +psql $DB_CONNSTR -c 'CREATE TABLE tmp_json (data jsonb)' + +for file in ./result/*.json; do + echo "$file" + SK_ID=$(jq '.config.id' $file) + echo "SK_ID: $SK_ID" + jq -c ".timelines[] | . + {\"sk_id\": $SK_ID}" $file | psql $DB_CONNSTR -c "\\COPY tmp_json (data) FROM STDIN" +done + +TABLE_NAME=$1 + +if [ -z "$TABLE_NAME" ]; then + echo "TABLE_NAME is not set, skipping conversion to table with typed columns" + echo "Usage: ./upload.sh TABLE_NAME" + exit 0 +fi + +psql $DB_CONNSTR <>'sk_id')::bigint AS sk_id, + (data->>'tenant_id') AS tenant_id, + (data->>'timeline_id') AS timeline_id, + (data->'memory'->>'active')::bool AS active, + (data->'memory'->>'flush_lsn')::bigint AS flush_lsn, + (data->'memory'->'mem_state'->>'backup_lsn')::bigint AS backup_lsn, + (data->'memory'->'mem_state'->>'commit_lsn')::bigint AS commit_lsn, + (data->'memory'->'mem_state'->>'peer_horizon_lsn')::bigint AS peer_horizon_lsn, + (data->'memory'->'mem_state'->>'remote_consistent_lsn')::bigint AS remote_consistent_lsn, + (data->'memory'->>'write_lsn')::bigint AS write_lsn, + (data->'memory'->>'num_computes')::bigint AS num_computes, + (data->'memory'->>'epoch_start_lsn')::bigint AS epoch_start_lsn, + (data->'memory'->>'last_removed_segno')::bigint AS last_removed_segno, + (data->'memory'->>'is_cancelled')::bool AS is_cancelled, + (data->'control_file'->>'backup_lsn')::bigint AS disk_backup_lsn, + (data->'control_file'->>'commit_lsn')::bigint AS disk_commit_lsn, + (data->'control_file'->'acceptor_state'->>'term')::bigint AS disk_term, + (data->'control_file'->>'local_start_lsn')::bigint AS local_start_lsn, + (data->'control_file'->>'peer_horizon_lsn')::bigint AS disk_peer_horizon_lsn, + (data->'control_file'->>'timeline_start_lsn')::bigint AS timeline_start_lsn, + (data->'control_file'->>'remote_consistent_lsn')::bigint AS disk_remote_consistent_lsn +FROM tmp_json +EOF From 5a123b56e5b94f07ed944cacdc09d4eac12bafdd Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 28 Mar 2023 13:46:13 +0300 Subject: [PATCH 02/25] Remove obsolete hack to rename neon-specific GUCs. I checked the console database, we don't have any of these left in production. --- compute_tools/src/pg_helpers.rs | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index 79f851ed13..01b192b2de 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -74,18 +74,9 @@ impl GenericOption { /// Represent `GenericOption` as configuration option. pub fn to_pg_setting(&self) -> String { if let Some(val) = &self.value { - // TODO: check in the console DB that we don't have these settings - // set for any non-deleted project and drop this override. - let name = match self.name.as_str() { - "safekeepers" => "neon.safekeepers", - "wal_acceptor_reconnect" => "neon.safekeeper_reconnect_timeout", - "wal_acceptor_connection_timeout" => "neon.safekeeper_connection_timeout", - it => it, - }; - match self.vartype.as_ref() { - "string" => format!("{} = '{}'", name, escape_conf_value(val)), - _ => format!("{} = {}", name, val), + "string" => format!("{} = '{}'", self.name, escape_conf_value(val)), + _ => format!("{} = {}", self.name, val), } } else { self.name.to_owned() From b52389f2280e082134643edf75f250ee19002c92 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Mon, 20 Mar 2023 22:29:52 +0400 Subject: [PATCH 03/25] Cleanly exit on any shutdown signal in storage_broker. neon_local sends SIGQUIT, which otherwise dumps core by default. Also, remove obsolete install_shutdown_handlers; in all binaries it was overridden by ShutdownSignals::handle later. ref https://github.com/neondatabase/neon/issues/3847 --- libs/utils/src/signals.rs | 23 +---------------------- pageserver/src/bin/pageserver.rs | 11 +++-------- safekeeper/src/bin/safekeeper.rs | 17 +++++++---------- storage_broker/src/bin/storage_broker.rs | 9 +++++++++ 4 files changed, 20 insertions(+), 40 deletions(-) diff --git a/libs/utils/src/signals.rs b/libs/utils/src/signals.rs index 6586da2339..c37e9aea58 100644 --- a/libs/utils/src/signals.rs +++ b/libs/utils/src/signals.rs @@ -1,25 +1,7 @@ -use signal_hook::flag; use signal_hook::iterator::Signals; -use std::sync::atomic::AtomicBool; -use std::sync::Arc; pub use signal_hook::consts::{signal::*, TERM_SIGNALS}; -pub fn install_shutdown_handlers() -> anyhow::Result { - let term_now = Arc::new(AtomicBool::new(false)); - for sig in TERM_SIGNALS { - // When terminated by a second term signal, exit with exit code 1. - // This will do nothing the first time (because term_now is false). - flag::register_conditional_shutdown(*sig, 1, Arc::clone(&term_now))?; - // But this will "arm" the above for the second time, by setting it to true. - // The order of registering these is important, if you put this one first, it will - // first arm and then terminate ‒ all in the first round. - flag::register(*sig, Arc::clone(&term_now))?; - } - - Ok(ShutdownSignals) -} - pub enum Signal { Quit, Interrupt, @@ -39,10 +21,7 @@ impl Signal { pub struct ShutdownSignals; impl ShutdownSignals { - pub fn handle( - self, - mut handler: impl FnMut(Signal) -> anyhow::Result<()>, - ) -> anyhow::Result<()> { + pub fn handle(mut handler: impl FnMut(Signal) -> anyhow::Result<()>) -> anyhow::Result<()> { for raw_signal in Signals::new(TERM_SIGNALS)?.into_iter() { let signal = match raw_signal { SIGINT => Signal::Interrupt, diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 14e86ddcb6..cbfd3e1165 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -24,11 +24,9 @@ use pageserver::{ virtual_file, }; use postgres_backend::AuthType; +use utils::signals::ShutdownSignals; use utils::{ - auth::JwtAuth, - logging, project_git_version, - sentry_init::init_sentry, - signals::{self, Signal}, + auth::JwtAuth, logging, project_git_version, sentry_init::init_sentry, signals::Signal, tcp_listener, }; @@ -263,9 +261,6 @@ fn start_pageserver( info!("Starting pageserver pg protocol handler on {pg_addr}"); let pageserver_listener = tcp_listener::bind(pg_addr)?; - // Install signal handlers - let signals = signals::install_shutdown_handlers()?; - // Launch broker client WALRECEIVER_RUNTIME.block_on(pageserver::broker_client::init_broker_client(conf))?; @@ -409,7 +404,7 @@ fn start_pageserver( } // All started up! Now just sit and wait for shutdown signal. - signals.handle(|signal| match signal { + ShutdownSignals::handle(|signal| match signal { Signal::Quit => { info!( "Got {}. Terminating in immediate shutdown mode", diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 8966e8c49b..ace921a26d 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -5,6 +5,7 @@ use anyhow::{bail, Context, Result}; use clap::Parser; use remote_storage::RemoteStorageConfig; use toml_edit::Document; +use utils::signals::ShutdownSignals; use std::fs::{self, File}; use std::io::{ErrorKind, Write}; @@ -39,7 +40,7 @@ use utils::{ logging::{self, LogFormat}, project_git_version, sentry_init::init_sentry, - signals, tcp_listener, + tcp_listener, }; const PID_FILE_NAME: &str = "safekeeper.pid"; @@ -216,7 +217,6 @@ fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> { let timeline_collector = safekeeper::metrics::TimelineCollector::new(); metrics::register_internal(Box::new(timeline_collector))?; - let signals = signals::install_shutdown_handlers()?; let mut threads = vec![]; let (wal_backup_launcher_tx, wal_backup_launcher_rx) = mpsc::channel(100); @@ -274,15 +274,12 @@ fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> { set_build_info_metric(GIT_VERSION); // TODO: put more thoughts into handling of failed threads - // We probably should restart them. + // We should catch & die if they are in trouble. - // NOTE: we still have to handle signals like SIGQUIT to prevent coredumps - signals.handle(|signal| { - // TODO: implement graceful shutdown with joining threads etc - info!( - "received {}, terminating in immediate shutdown mode", - signal.name() - ); + // On any shutdown signal, log receival and exit. Additionally, handling + // SIGQUIT prevents coredump. + ShutdownSignals::handle(|signal| { + info!("received {}, terminating", signal.name()); std::process::exit(0); }) } diff --git a/storage_broker/src/bin/storage_broker.rs b/storage_broker/src/bin/storage_broker.rs index 1a0d261184..57f975b0df 100644 --- a/storage_broker/src/bin/storage_broker.rs +++ b/storage_broker/src/bin/storage_broker.rs @@ -33,6 +33,7 @@ use tonic::transport::server::Connected; use tonic::Code; use tonic::{Request, Response, Status}; use tracing::*; +use utils::signals::ShutdownSignals; use metrics::{Encoder, TextEncoder}; use storage_broker::metrics::{NUM_PUBS, NUM_SUBS_ALL, NUM_SUBS_TIMELINE}; @@ -437,6 +438,14 @@ async fn main() -> Result<(), Box> { info!("version: {GIT_VERSION}"); ::metrics::set_build_info_metric(GIT_VERSION); + // On any shutdown signal, log receival and exit. + std::thread::spawn(move || { + ShutdownSignals::handle(|signal| { + info!("received {}, terminating", signal.name()); + std::process::exit(0); + }) + }); + let registry = Registry { shared_state: Arc::new(RwLock::new(SharedState::new(args.all_keys_chan_size))), timeline_chan_size: args.timeline_chan_size, From 018c8b0e2ba5a6875bd1e3ebfcfd7e75d3f908f0 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Fri, 17 Mar 2023 18:19:05 +0200 Subject: [PATCH 04/25] Use proper tokens and delimeters when listing S3 --- libs/remote_storage/src/s3_bucket.rs | 5 ++- test_runner/regress/test_remote_storage.py | 46 +++++++++++++++++++++- 2 files changed, 48 insertions(+), 3 deletions(-) diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index 93f5e0596e..a476ff32e0 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -291,6 +291,7 @@ impl RemoteStorage for S3Bucket { .list_objects_v2() .bucket(self.bucket_name.clone()) .set_prefix(self.prefix_in_bucket.clone()) + .delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string()) .set_continuation_token(continuation_token) .send() .await @@ -306,7 +307,7 @@ impl RemoteStorage for S3Bucket { .filter_map(|o| Some(self.s3_object_to_relative_path(o.key()?))), ); - match fetch_response.continuation_token { + match fetch_response.next_continuation_token { Some(new_token) => continuation_token = Some(new_token), None => break, } @@ -371,7 +372,7 @@ impl RemoteStorage for S3Bucket { .filter_map(|o| Some(self.s3_object_to_relative_path(o.prefix()?))), ); - match fetch_response.continuation_token { + match fetch_response.next_continuation_token { Some(new_token) => continuation_token = Some(new_token), None => break, } diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index f6600e8974..b9709d9b83 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -6,7 +6,7 @@ import shutil import threading import time from pathlib import Path -from typing import Dict, List, Tuple +from typing import Dict, List, Set, Tuple import pytest from fixtures.log_helper import log @@ -717,6 +717,50 @@ def test_empty_branch_remote_storage_upload_on_restart( ), f"New branch should have been reuploaded on pageserver restart to the remote storage path '{new_branch_on_remote_storage}'" +# Test creates >1000 timelines and upload them to the remote storage. +# AWS S3 does not return more than 1000 items and starts paginating, ensure that pageserver paginates correctly. +@pytest.mark.skip("Too slow to run, requires too much disk space to run") +@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.MOCK_S3]) +def test_thousands_of_branches( + neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind +): + neon_env_builder.enable_remote_storage( + remote_storage_kind=remote_storage_kind, + test_name="test_compaction_downloads_on_demand_without_image_creation", + ) + + env = neon_env_builder.init_start() + client = env.pageserver.http_client() + expected_timelines: Set[TimelineId] = set([]) + tenant_id = env.initial_tenant + pg = env.postgres.create_start("main", tenant_id=tenant_id) + + max_timelines = 1500 + for i in range(0, max_timelines): + new_timeline_id = TimelineId.generate() + log.info(f"Creating timeline {new_timeline_id}, {i + 1} out of {max_timelines}") + expected_timelines.add(new_timeline_id) + + client.timeline_create(tenant_id, new_timeline_id=new_timeline_id) + client.timeline_checkpoint(tenant_id, new_timeline_id) + wait_for_last_flush_lsn(env, pg, tenant_id, new_timeline_id) + with pg.cursor() as cur: + current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) + wait_for_upload(client, tenant_id, new_timeline_id, current_lsn) + + client.tenant_detach(tenant_id=tenant_id) + client.tenant_attach(tenant_id=tenant_id) + + timelines_after_reattach = set( + [timeline["timeline_id"] for timeline in client.timeline_list(tenant_id=tenant_id)] + ) + + assert ( + expected_timelines == timelines_after_reattach + ), f"Timelines after reattach do not match the ones created initially. \ + Missing timelines: {expected_timelines - timelines_after_reattach}, extra timelines: {timelines_after_reattach - expected_timelines}" + + def wait_upload_queue_empty( client: PageserverHttpClient, tenant_id: TenantId, timeline_id: TimelineId ): From 1300dc9239d3e844a1af74db136dabb7353c5776 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Mon, 20 Mar 2023 15:37:11 +0200 Subject: [PATCH 05/25] Replace Python IT test with the Rust one --- Cargo.lock | 22 ++ Cargo.toml | 1 + libs/remote_storage/Cargo.toml | 1 + libs/remote_storage/src/lib.rs | 18 ++ libs/remote_storage/src/s3_bucket.rs | 4 + libs/remote_storage/tests/pagination_tests.rs | 275 ++++++++++++++++++ pageserver/src/config.rs | 1 + test_runner/regress/test_remote_storage.py | 46 +-- 8 files changed, 323 insertions(+), 45 deletions(-) create mode 100644 libs/remote_storage/tests/pagination_tests.rs diff --git a/Cargo.lock b/Cargo.lock index 17aacd8ee7..a19a97a40d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3085,6 +3085,7 @@ dependencies = [ "serde", "serde_json", "tempfile", + "test-context", "tokio", "tokio-util", "toml_edit", @@ -3888,6 +3889,27 @@ dependencies = [ "winapi-util", ] +[[package]] +name = "test-context" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "055831a02a4f5aa28fede67f2902014273eb8c21b958ac5ebbd59b71ef30dbc3" +dependencies = [ + "async-trait", + "futures", + "test-context-macros", +] + +[[package]] +name = "test-context-macros" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8901a55b0a7a06ebc4a674dcca925170da8e613fa3b163a1df804ed10afb154d" +dependencies = [ + "quote", + "syn", +] + [[package]] name = "textwrap" version = "0.16.0" diff --git a/Cargo.toml b/Cargo.toml index e27a50a1cb..09cc150606 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -97,6 +97,7 @@ strum_macros = "0.24" svg_fmt = "0.4.1" sync_wrapper = "0.1.2" tar = "0.4" +test-context = "0.1" thiserror = "1.0" tls-listener = { version = "0.6", features = ["rustls", "hyper-h1"] } tokio = { version = "1.17", features = ["macros"] } diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml index 15812e8439..da15823b69 100644 --- a/libs/remote_storage/Cargo.toml +++ b/libs/remote_storage/Cargo.toml @@ -26,3 +26,4 @@ workspace_hack.workspace = true [dev-dependencies] tempfile.workspace = true +test-context.workspace = true diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index 901f849801..1d50a777f4 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -39,6 +39,9 @@ pub const DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS: u32 = 10; /// ~3500 PUT/COPY/POST/DELETE or 5500 GET/HEAD S3 requests /// https://aws.amazon.com/premiumsupport/knowledge-center/s3-request-limit-avoid-throttling/ pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100; +/// No limits on the client side, which currenltly means 1000 for AWS S3. +/// https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax +pub const DEFAULT_MAX_KEYS_PER_LIST_RESPONSE: Option = None; const REMOTE_STORAGE_PREFIX_SEPARATOR: char = '/'; @@ -64,6 +67,10 @@ impl RemotePath { pub fn object_name(&self) -> Option<&str> { self.0.file_name().and_then(|os_str| os_str.to_str()) } + + pub fn join(&self, segment: &Path) -> Self { + Self(self.0.join(segment)) + } } /// Storage (potentially remote) API to manage its state. @@ -266,6 +273,7 @@ pub struct S3Config { /// AWS S3 has various limits on its API calls, we need not to exceed those. /// See [`DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT`] for more details. pub concurrency_limit: NonZeroUsize, + pub max_keys_per_list_response: Option, } impl Debug for S3Config { @@ -275,6 +283,10 @@ impl Debug for S3Config { .field("bucket_region", &self.bucket_region) .field("prefix_in_bucket", &self.prefix_in_bucket) .field("concurrency_limit", &self.concurrency_limit) + .field( + "max_keys_per_list_response", + &self.max_keys_per_list_response, + ) .finish() } } @@ -303,6 +315,11 @@ impl RemoteStorageConfig { ) .context("Failed to parse 'concurrency_limit' as a positive integer")?; + let max_keys_per_list_response = + parse_optional_integer::("max_keys_per_list_response", toml) + .context("Failed to parse 'max_keys_per_list_response' as a positive integer")? + .or(DEFAULT_MAX_KEYS_PER_LIST_RESPONSE); + let storage = match (local_path, bucket_name, bucket_region) { // no 'local_path' nor 'bucket_name' options are provided, consider this remote storage disabled (None, None, None) => return Ok(None), @@ -324,6 +341,7 @@ impl RemoteStorageConfig { .map(|endpoint| parse_toml_string("endpoint", endpoint)) .transpose()?, concurrency_limit, + max_keys_per_list_response, }), (Some(local_path), None, None) => RemoteStorageKind::LocalFs(PathBuf::from( parse_toml_string("local_path", local_path)?, diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index a476ff32e0..d4eb7d9244 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -102,6 +102,7 @@ pub struct S3Bucket { client: Client, bucket_name: String, prefix_in_bucket: Option, + max_keys_per_list_response: Option, // Every request to S3 can be throttled or cancelled, if a certain number of requests per second is exceeded. // Same goes to IAM, which is queried before every S3 request, if enabled. IAM has even lower RPS threshold. // The helps to ensure we don't exceed the thresholds. @@ -164,6 +165,7 @@ impl S3Bucket { Ok(Self { client, bucket_name: aws_config.bucket_name.clone(), + max_keys_per_list_response: aws_config.max_keys_per_list_response, prefix_in_bucket, concurrency_limiter: Arc::new(Semaphore::new(aws_config.concurrency_limit.get())), }) @@ -293,6 +295,7 @@ impl RemoteStorage for S3Bucket { .set_prefix(self.prefix_in_bucket.clone()) .delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string()) .set_continuation_token(continuation_token) + .set_max_keys(self.max_keys_per_list_response) .send() .await .map_err(|e| { @@ -355,6 +358,7 @@ impl RemoteStorage for S3Bucket { .set_prefix(list_prefix.clone()) .set_continuation_token(continuation_token) .delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string()) + .set_max_keys(self.max_keys_per_list_response) .send() .await .map_err(|e| { diff --git a/libs/remote_storage/tests/pagination_tests.rs b/libs/remote_storage/tests/pagination_tests.rs new file mode 100644 index 0000000000..eb52409c44 --- /dev/null +++ b/libs/remote_storage/tests/pagination_tests.rs @@ -0,0 +1,275 @@ +use std::collections::HashSet; +use std::env; +use std::num::{NonZeroU32, NonZeroUsize}; +use std::ops::ControlFlow; +use std::path::{Path, PathBuf}; +use std::sync::Arc; +use std::time::UNIX_EPOCH; + +use anyhow::Context; +use remote_storage::{ + GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, S3Config, +}; +use test_context::{test_context, AsyncTestContext}; +use tokio::task::JoinSet; +use tracing::{debug, error, info}; + +const ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_S3_REMOTE_STORAGE"; + +/// Tests that S3 client can list all prefixes, even if the response come paginated and requires multiple S3 queries. +/// Uses real S3 and requires [`ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME`] and related S3 cred env vars specified. +/// See the client creation in [`create_s3_client`] for details on the required env vars. +/// If real S3 tests are disabled, the test passes, skipping any real test run: currently, there's no way to mark the test ignored in runtime with the +/// deafult test framework, see https://github.com/rust-lang/rust/issues/68007 for details. +/// +/// First, the test creates a set of S3 objects with keys `/${random_prefix_part}/${base_prefix_str}/sub_prefix_${i}/blob_${i}` in [`upload_s3_data`] +/// where +/// * `random_prefix_part` is set for the entire S3 client during the S3 client creation in [`create_s3_client`], to avoid multiple test runs interference +/// * `base_prefix_str` is a common prefix to use in the client requests: we would want to ensure that the client is able to list nested prefixes inside the bucket +/// +/// Then, verifies that the client does return correct prefixes when queried: +/// * with no prefix, it lists everything after its `${random_prefix_part}/` — that should be `${base_prefix_str}` value only +/// * with `${base_prefix_str}/` prefix, it lists every `sub_prefix_${i}` +/// +/// With the real S3 enabled and `#[cfg(test)]` Rust configuration used, the S3 client test adds a `max-keys` param to limit the response keys. +/// This way, we are able to test the pagination implicitly, by ensuring all results are returned from the remote storage and avoid uploading too many blobs to S3, +/// since current default AWS S3 pagination limit is 1000. +/// (see https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax) +/// +/// Lastly, the test attempts to clean up and remove all uploaded S3 files. +/// If any errors appear during the clean up, they get logged, but the test is not failed or stopped until clean up is finished. +#[test_context(MaybeEnabledS3)] +#[tokio::test] +async fn s3_pagination_should_work(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()> { + let ctx = match ctx { + MaybeEnabledS3::Enabled(ctx) => ctx, + MaybeEnabledS3::Disabled => return Ok(()), + MaybeEnabledS3::UploadsFailed(e, _) => anyhow::bail!("S3 init failed: {e:?}"), + }; + + let test_client = Arc::clone(&ctx.client_with_excessive_pagination); + let expected_remote_prefixes = ctx.remote_prefixes.clone(); + + let base_prefix = + RemotePath::new(Path::new(ctx.base_prefix_str)).context("common_prefix construction")?; + let root_remote_prefixes = test_client + .list_prefixes(None) + .await + .context("client list root prefixes failure")? + .into_iter() + .collect::>(); + assert_eq!( + root_remote_prefixes, HashSet::from([base_prefix.clone()]), + "remote storage root prefixes list mismatches with the uploads. Returned prefixes: {root_remote_prefixes:?}" + ); + + let nested_remote_prefixes = test_client + .list_prefixes(Some(&base_prefix)) + .await + .context("client list nested prefixes failure")? + .into_iter() + .collect::>(); + let remote_only_prefixes = nested_remote_prefixes + .difference(&expected_remote_prefixes) + .collect::>(); + let missing_uploaded_prefixes = expected_remote_prefixes + .difference(&nested_remote_prefixes) + .collect::>(); + assert_eq!( + remote_only_prefixes.len() + missing_uploaded_prefixes.len(), 0, + "remote storage nested prefixes list mismatches with the uploads. Remote only prefixes: {remote_only_prefixes:?}, missing uploaded prefixes: {missing_uploaded_prefixes:?}", + ); + + Ok(()) +} + +enum MaybeEnabledS3 { + Enabled(S3WithTestBlobs), + Disabled, + UploadsFailed(anyhow::Error, S3WithTestBlobs), +} + +struct S3WithTestBlobs { + client_with_excessive_pagination: Arc, + base_prefix_str: &'static str, + remote_prefixes: HashSet, + remote_blobs: HashSet, +} + +#[async_trait::async_trait] +impl AsyncTestContext for MaybeEnabledS3 { + async fn setup() -> Self { + utils::logging::init(utils::logging::LogFormat::Test).expect("logging init failed"); + if env::var(ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME).is_err() { + info!( + "`{}` env variable is not set, skipping the test", + ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME + ); + return Self::Disabled; + } + + let max_keys_in_list_response = 10; + let upload_tasks_count = 1 + (2 * usize::try_from(max_keys_in_list_response).unwrap()); + + let client_with_excessive_pagination = create_s3_client(max_keys_in_list_response) + .context("S3 client creation") + .expect("S3 client creation failed"); + + let base_prefix_str = "test/"; + match upload_s3_data( + &client_with_excessive_pagination, + base_prefix_str, + upload_tasks_count, + ) + .await + { + ControlFlow::Continue(uploads) => { + info!("Remote objects created successfully"); + Self::Enabled(S3WithTestBlobs { + client_with_excessive_pagination, + base_prefix_str, + remote_prefixes: uploads.prefixes, + remote_blobs: uploads.blobs, + }) + } + ControlFlow::Break(uploads) => Self::UploadsFailed( + anyhow::anyhow!("One or multiple blobs failed to upload to S3"), + S3WithTestBlobs { + client_with_excessive_pagination, + base_prefix_str, + remote_prefixes: uploads.prefixes, + remote_blobs: uploads.blobs, + }, + ), + } + } + + async fn teardown(self) { + match self { + Self::Disabled => {} + Self::Enabled(ctx) | Self::UploadsFailed(_, ctx) => { + cleanup(&ctx.client_with_excessive_pagination, ctx.remote_blobs).await; + } + } + } +} + +fn create_s3_client(max_keys_per_list_response: i32) -> anyhow::Result> { + let remote_storage_s3_bucket = env::var("REMOTE_STORAGE_S3_BUCKET") + .context("`REMOTE_STORAGE_S3_BUCKET` env var is not set, but real S3 tests are enabled")?; + let remote_storage_s3_region = env::var("REMOTE_STORAGE_S3_REGION") + .context("`REMOTE_STORAGE_S3_REGION` env var is not set, but real S3 tests are enabled")?; + let random_prefix_part = std::time::SystemTime::now() + .duration_since(UNIX_EPOCH) + .context("random s3 test prefix part calculation")? + .as_millis(); + let remote_storage_config = RemoteStorageConfig { + max_concurrent_syncs: NonZeroUsize::new(100).unwrap(), + max_sync_errors: NonZeroU32::new(5).unwrap(), + storage: RemoteStorageKind::AwsS3(S3Config { + bucket_name: remote_storage_s3_bucket, + bucket_region: remote_storage_s3_region, + prefix_in_bucket: Some(format!("pagination_should_work_test_{random_prefix_part}/")), + endpoint: None, + concurrency_limit: NonZeroUsize::new(100).unwrap(), + max_keys_per_list_response: Some(max_keys_per_list_response), + }), + }; + Ok(Arc::new( + GenericRemoteStorage::from_config(&remote_storage_config).context("remote storage init")?, + )) +} + +struct Uploads { + prefixes: HashSet, + blobs: HashSet, +} + +async fn upload_s3_data( + client: &Arc, + base_prefix_str: &'static str, + upload_tasks_count: usize, +) -> ControlFlow { + info!("Creating {upload_tasks_count} S3 files"); + let mut upload_tasks = JoinSet::new(); + for i in 1..upload_tasks_count + 1 { + let task_client = Arc::clone(client); + upload_tasks.spawn(async move { + let prefix = PathBuf::from(format!("{base_prefix_str}/sub_prefix_{i}/")); + let blob_prefix = RemotePath::new(&prefix) + .with_context(|| format!("{prefix:?} to RemotePath conversion"))?; + let blob_path = blob_prefix.join(Path::new(&format!("blob_{i}"))); + debug!("Creating remote item {i} at path {blob_path:?}"); + + let data = format!("remote blob data {i}").into_bytes(); + let data_len = data.len(); + task_client + .upload( + Box::new(std::io::Cursor::new(data)), + data_len, + &blob_path, + None, + ) + .await?; + + Ok::<_, anyhow::Error>((blob_prefix, blob_path)) + }); + } + + let mut upload_tasks_failed = false; + let mut uploaded_prefixes = HashSet::with_capacity(upload_tasks_count); + let mut uploaded_blobs = HashSet::with_capacity(upload_tasks_count); + while let Some(task_run_result) = upload_tasks.join_next().await { + match task_run_result + .context("task join failed") + .and_then(|task_result| task_result.context("upload task failed")) + { + Ok((upload_prefix, upload_path)) => { + uploaded_prefixes.insert(upload_prefix); + uploaded_blobs.insert(upload_path); + } + Err(e) => { + error!("Upload task failed: {e:?}"); + upload_tasks_failed = true; + } + } + } + + let uploads = Uploads { + prefixes: uploaded_prefixes, + blobs: uploaded_blobs, + }; + if upload_tasks_failed { + ControlFlow::Break(uploads) + } else { + ControlFlow::Continue(uploads) + } +} + +async fn cleanup(client: &Arc, objects_to_delete: HashSet) { + info!( + "Removing {} objects from the remote storage during cleanup", + objects_to_delete.len() + ); + let mut delete_tasks = JoinSet::new(); + for object_to_delete in objects_to_delete { + let task_client = Arc::clone(client); + delete_tasks.spawn(async move { + debug!("Deleting remote item at path {object_to_delete:?}"); + task_client + .delete(&object_to_delete) + .await + .with_context(|| format!("{object_to_delete:?} removal")) + }); + } + + while let Some(task_run_result) = delete_tasks.join_next().await { + match task_run_result { + Ok(task_result) => match task_result { + Ok(()) => {} + Err(e) => error!("Delete task failed: {e:?}"), + }, + Err(join_err) => error!("Delete task did not finish correctly: {join_err}"), + } + } +} diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 58a6056385..7293e69f69 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -1238,6 +1238,7 @@ broker_endpoint = '{broker_endpoint}' prefix_in_bucket: Some(prefix_in_bucket.clone()), endpoint: Some(endpoint.clone()), concurrency_limit: s3_concurrency_limit, + max_keys_per_list_response: None, }), }, "Remote storage config should correctly parse the S3 config" diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index b9709d9b83..f6600e8974 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -6,7 +6,7 @@ import shutil import threading import time from pathlib import Path -from typing import Dict, List, Set, Tuple +from typing import Dict, List, Tuple import pytest from fixtures.log_helper import log @@ -717,50 +717,6 @@ def test_empty_branch_remote_storage_upload_on_restart( ), f"New branch should have been reuploaded on pageserver restart to the remote storage path '{new_branch_on_remote_storage}'" -# Test creates >1000 timelines and upload them to the remote storage. -# AWS S3 does not return more than 1000 items and starts paginating, ensure that pageserver paginates correctly. -@pytest.mark.skip("Too slow to run, requires too much disk space to run") -@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.MOCK_S3]) -def test_thousands_of_branches( - neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind -): - neon_env_builder.enable_remote_storage( - remote_storage_kind=remote_storage_kind, - test_name="test_compaction_downloads_on_demand_without_image_creation", - ) - - env = neon_env_builder.init_start() - client = env.pageserver.http_client() - expected_timelines: Set[TimelineId] = set([]) - tenant_id = env.initial_tenant - pg = env.postgres.create_start("main", tenant_id=tenant_id) - - max_timelines = 1500 - for i in range(0, max_timelines): - new_timeline_id = TimelineId.generate() - log.info(f"Creating timeline {new_timeline_id}, {i + 1} out of {max_timelines}") - expected_timelines.add(new_timeline_id) - - client.timeline_create(tenant_id, new_timeline_id=new_timeline_id) - client.timeline_checkpoint(tenant_id, new_timeline_id) - wait_for_last_flush_lsn(env, pg, tenant_id, new_timeline_id) - with pg.cursor() as cur: - current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) - wait_for_upload(client, tenant_id, new_timeline_id, current_lsn) - - client.tenant_detach(tenant_id=tenant_id) - client.tenant_attach(tenant_id=tenant_id) - - timelines_after_reattach = set( - [timeline["timeline_id"] for timeline in client.timeline_list(tenant_id=tenant_id)] - ) - - assert ( - expected_timelines == timelines_after_reattach - ), f"Timelines after reattach do not match the ones created initially. \ - Missing timelines: {expected_timelines - timelines_after_reattach}, extra timelines: {timelines_after_reattach - expected_timelines}" - - def wait_upload_queue_empty( client: PageserverHttpClient, tenant_id: TenantId, timeline_id: TimelineId ): From 6c84cbbb5877e311c3d7e2959f2bb7214940750a Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Tue, 28 Mar 2023 19:02:09 +0300 Subject: [PATCH 06/25] Run new Rust IT test in CI --- .github/workflows/build_and_test.yml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index d50a42d83c..f2d436c864 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -246,6 +246,13 @@ jobs: run: | ${cov_prefix} cargo test $CARGO_FLAGS + # Run separate tests for real S3 + export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty + export REMOTE_STORAGE_S3_BUCKET=neon-github-public-dev + export REMOTE_STORAGE_S3_REGION=eu-central-1 + # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now + ${cov_prefix} cargo test $CARGO_FLAGS --package remote_storage --test pagination_tests -- s3_pagination_should_work --exact + - name: Install rust binaries run: | # Install target binaries From 9d714a8413771ee35ffe237a097411dd497fed27 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Tue, 28 Mar 2023 15:44:15 +0300 Subject: [PATCH 07/25] Split $CARGO_FLAGS and $CARGO_FEATURES to make e2e tests work --- .github/workflows/build_and_test.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index f2d436c864..52e1d94e9b 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -184,10 +184,10 @@ jobs: CARGO_FEATURES="--features testing" if [[ $BUILD_TYPE == "debug" ]]; then cov_prefix="scripts/coverage --profraw-prefix=$GITHUB_JOB --dir=/tmp/coverage run" - CARGO_FLAGS="--locked $CARGO_FEATURES" + CARGO_FLAGS="--locked" elif [[ $BUILD_TYPE == "release" ]]; then cov_prefix="" - CARGO_FLAGS="--locked --release $CARGO_FEATURES" + CARGO_FLAGS="--locked --release" fi echo "cov_prefix=${cov_prefix}" >> $GITHUB_ENV echo "CARGO_FEATURES=${CARGO_FEATURES}" >> $GITHUB_ENV @@ -240,11 +240,11 @@ jobs: - name: Run cargo build run: | - ${cov_prefix} mold -run cargo build $CARGO_FLAGS --bins --tests + ${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests - name: Run cargo test run: | - ${cov_prefix} cargo test $CARGO_FLAGS + ${cov_prefix} cargo test $CARGO_FLAGS $CARGO_FEATURES # Run separate tests for real S3 export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty @@ -275,7 +275,7 @@ jobs: mkdir -p /tmp/neon/test_bin/ test_exe_paths=$( - ${cov_prefix} cargo test $CARGO_FLAGS --message-format=json --no-run | + ${cov_prefix} cargo test $CARGO_FLAGS $CARGO_FEATURES --message-format=json --no-run | jq -r '.executable | select(. != null)' ) for bin in $test_exe_paths; do From f1b174dc6a86149d92a7d6a84704d70df27805fb Mon Sep 17 00:00:00 2001 From: Vadim Kharitonov Date: Wed, 29 Mar 2023 09:49:23 +0200 Subject: [PATCH 08/25] Update rust version to 1.68.2 --- rust-toolchain.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust-toolchain.toml b/rust-toolchain.toml index 0692340147..c39ba4f417 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,5 +1,5 @@ [toolchain] -channel = "1.66.1" +channel = "1.68.2" profile = "default" # The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy. # https://rust-lang.github.io/rustup/concepts/profiles.html From ac9c7e8c4a1a1c4f2984de2ccabcc860dd5905b7 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Wed, 29 Mar 2023 14:14:56 +0300 Subject: [PATCH 09/25] Replace pin! from tokio to the std one (#3903) With fresh rustc brought by https://github.com/neondatabase/neon/pull/3902, we can use `std::pin::pin!` macro instead of the tokio one. One place did not need the macro at all, other places were adjusted. --- pageserver/src/page_service.rs | 7 +++---- pageserver/src/tenant/timeline.rs | 9 ++++----- pageserver/src/tenant/timeline/eviction_task.rs | 1 - .../timeline/walreceiver/walreceiver_connection.rs | 6 +++--- 4 files changed, 10 insertions(+), 13 deletions(-) diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index b63ee31d5e..c0e4a2a9cf 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -27,6 +27,7 @@ use pq_proto::FeStartupPacket; use pq_proto::{BeMessage, FeMessage, RowDescriptor}; use std::io; use std::net::TcpListener; +use std::pin::pin; use std::str; use std::str::FromStr; use std::sync::Arc; @@ -466,8 +467,7 @@ impl PageServerHandler { pgb.write_message_noflush(&BeMessage::CopyInResponse)?; pgb.flush().await?; - let copyin_reader = StreamReader::new(copyin_stream(pgb)); - tokio::pin!(copyin_reader); + let mut copyin_reader = pin!(StreamReader::new(copyin_stream(pgb))); timeline .import_basebackup_from_tar(&mut copyin_reader, base_lsn, &ctx) .await?; @@ -512,8 +512,7 @@ impl PageServerHandler { info!("importing wal"); pgb.write_message_noflush(&BeMessage::CopyInResponse)?; pgb.flush().await?; - let copyin_reader = StreamReader::new(copyin_stream(pgb)); - tokio::pin!(copyin_reader); + let mut copyin_reader = pin!(StreamReader::new(copyin_stream(pgb))); import_wal_from_tar(&timeline, &mut copyin_reader, start_lsn, end_lsn, &ctx).await?; info!("wal import complete"); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 611c2c27d3..e1db34ec1b 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -24,6 +24,7 @@ use std::collections::HashMap; use std::fs; use std::ops::{Deref, Range}; use std::path::{Path, PathBuf}; +use std::pin::pin; use std::sync::atomic::{AtomicI64, Ordering as AtomicOrdering}; use std::sync::{Arc, Mutex, MutexGuard, RwLock, Weak}; use std::time::{Duration, Instant, SystemTime}; @@ -677,8 +678,7 @@ impl Timeline { let mut failed = 0; - let cancelled = task_mgr::shutdown_watcher(); - tokio::pin!(cancelled); + let mut cancelled = pin!(task_mgr::shutdown_watcher()); loop { tokio::select! { @@ -1837,13 +1837,13 @@ impl Timeline { let mut timeline_state_updates = self.subscribe_for_state_updates(); let self_calculation = Arc::clone(self); - let calculation = async { + let mut calculation = pin!(async { let cancel = cancel.child_token(); let ctx = ctx.attached_child(); self_calculation .calculate_logical_size(lsn, cancel, &ctx) .await - }; + }); let timeline_state_cancellation = async { loop { match timeline_state_updates.changed().await { @@ -1872,7 +1872,6 @@ impl Timeline { "aborted because task_mgr shutdown requested".to_string() }; - tokio::pin!(calculation); loop { tokio::select! { res = &mut calculation => { return res } diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index 3ec8c30d70..107cd89b90 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -399,7 +399,6 @@ impl Timeline { let mut throwaway_cache = HashMap::new(); let gather = crate::tenant::size::gather_inputs(tenant, limit, None, &mut throwaway_cache, ctx); - tokio::pin!(gather); tokio::select! { _ = cancel.cancelled() => {} diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index 7194a4f3ed..9398a7bee9 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -2,6 +2,7 @@ use std::{ error::Error, + pin::pin, str::FromStr, sync::Arc, time::{Duration, SystemTime}, @@ -17,7 +18,7 @@ use postgres_ffi::v14::xlog_utils::normalize_lsn; use postgres_ffi::WAL_SEGMENT_SIZE; use postgres_protocol::message::backend::ReplicationMessage; use postgres_types::PgLsn; -use tokio::{pin, select, sync::watch, time}; +use tokio::{select, sync::watch, time}; use tokio_postgres::{replication::ReplicationStream, Client}; use tokio_util::sync::CancellationToken; use tracing::{debug, error, info, trace, warn}; @@ -187,8 +188,7 @@ pub async fn handle_walreceiver_connection( let query = format!("START_REPLICATION PHYSICAL {startpoint}"); let copy_stream = replication_client.copy_both_simple(&query).await?; - let physical_stream = ReplicationStream::new(copy_stream); - pin!(physical_stream); + let mut physical_stream = pin!(ReplicationStream::new(copy_stream)); let mut waldecoder = WalStreamDecoder::new(startpoint, timeline.pg_version); From b26c837ed685641b4c324dc47e6eb8da423ad103 Mon Sep 17 00:00:00 2001 From: Gleb Novikov Date: Wed, 29 Mar 2023 19:18:44 +0400 Subject: [PATCH 10/25] Fixed pageserver openapi spec properties reference (#3904) ## Describe your changes In [this linter run](https://github.com/neondatabase/cloud/actions/runs/4553032319/jobs/8029101300?pr=4391) accidentally found out that spec is invalid. Reference other schemas in properties should be done the way I changed. Could not find documentation specifically for schemas embedding in `components.schemas`, but it seems like the approach is inherited from json schema: https://json-schema.org/understanding-json-schema/structuring.html#ref ## Issue ticket number and link - ## Checklist before requesting a review - [x] I have performed a self-review of my code. - [ ] ~If it is a core feature, I have added thorough tests.~ - [ ] ~Do we need to implement analytics? if so did you add the relevant metrics to the dashboard?~ - [ ] ~If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section.~ --- pageserver/src/http/openapi_spec.yml | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 795c0cd3c4..eda4a60e95 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -873,13 +873,9 @@ components: type: object properties: tenant_specific_overrides: - type: object - schema: - $ref: "#/components/schemas/TenantConfigInfo" + $ref: "#/components/schemas/TenantConfigInfo" effective_config: - type: object - schema: - $ref: "#/components/schemas/TenantConfigInfo" + $ref: "#/components/schemas/TenantConfigInfo" TimelineInfo: type: object required: From 1c1bb904edc5f7fbd13bc5523d53591759ce93f2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lassi=20P=C3=B6l=C3=B6nen?= Date: Thu, 30 Mar 2023 16:24:47 +0300 Subject: [PATCH 11/25] Rename zenith_* labels to neon_* (#3911) ## Describe your changes Get rid of the legacy labeling. Aslo `neon_region_slug` with the same value as `neon_region` doesn't make much sense, so just drop it. This allows us to drop the relabeling from zenith to neon in the log collector. --- .../helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml | 7 +++---- .../helm-values/dev-us-east-2-beta.neon-proxy-link.yaml | 7 +++---- .../dev-us-east-2-beta.neon-proxy-scram-legacy.yaml | 7 +++---- .../helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml | 7 +++---- .../prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml | 7 +++---- .../prod-eu-central-1-gamma.neon-proxy-scram.yaml | 7 +++---- .../helm-values/prod-us-east-2-delta.neon-proxy-link.yaml | 7 +++---- .../helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml | 7 +++---- .../prod-us-west-2-eta.neon-proxy-scram-legacy.yaml | 7 +++---- .../helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml | 7 +++---- 10 files changed, 30 insertions(+), 40 deletions(-) diff --git a/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml b/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml index ad712c4745..2307856464 100644 --- a/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml +++ b/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml @@ -30,10 +30,9 @@ settings: # -- Additional labels for neon-proxy pods podLabels: - zenith_service: proxy-scram - zenith_env: dev - zenith_region: eu-west-1 - zenith_region_slug: eu-west-1 + neon_service: proxy-scram + neon_env: dev + neon_region: eu-west-1 exposedService: annotations: diff --git a/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml b/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml index 91ddd07eae..feca05aff6 100644 --- a/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml +++ b/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml @@ -15,10 +15,9 @@ settings: # -- Additional labels for neon-proxy-link pods podLabels: - zenith_service: proxy - zenith_env: dev - zenith_region: us-east-2 - zenith_region_slug: us-east-2 + neon_service: proxy + neon_env: dev + neon_region: us-east-2 service: type: LoadBalancer diff --git a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml index 6ec18ff388..feee1b369a 100644 --- a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml +++ b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml @@ -15,10 +15,9 @@ settings: # -- Additional labels for neon-proxy pods podLabels: - zenith_service: proxy-scram-legacy - zenith_env: dev - zenith_region: us-east-2 - zenith_region_slug: us-east-2 + neon_service: proxy-scram-legacy + neon_env: dev + neon_region: us-east-2 exposedService: annotations: diff --git a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml index a091be1016..40814e55c9 100644 --- a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml +++ b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml @@ -30,10 +30,9 @@ settings: # -- Additional labels for neon-proxy pods podLabels: - zenith_service: proxy-scram - zenith_env: dev - zenith_region: us-east-2 - zenith_region_slug: us-east-2 + neon_service: proxy-scram + neon_env: dev + neon_region: us-east-2 exposedService: annotations: diff --git a/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml b/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml index 8d65e94d00..aa5be89101 100644 --- a/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml +++ b/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml @@ -31,10 +31,9 @@ settings: # -- Additional labels for neon-proxy pods podLabels: - zenith_service: proxy-scram - zenith_env: prod - zenith_region: ap-southeast-1 - zenith_region_slug: ap-southeast-1 + neon_service: proxy-scram + neon_env: prod + neon_region: ap-southeast-1 exposedService: annotations: diff --git a/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml b/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml index f806b37482..083af6aa2d 100644 --- a/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml +++ b/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml @@ -31,10 +31,9 @@ settings: # -- Additional labels for neon-proxy pods podLabels: - zenith_service: proxy-scram - zenith_env: prod - zenith_region: eu-central-1 - zenith_region_slug: eu-central-1 + neon_service: proxy-scram + neon_env: prod + neon_region: eu-central-1 exposedService: annotations: diff --git a/.github/helm-values/prod-us-east-2-delta.neon-proxy-link.yaml b/.github/helm-values/prod-us-east-2-delta.neon-proxy-link.yaml index eff24302bb..30dcefc151 100644 --- a/.github/helm-values/prod-us-east-2-delta.neon-proxy-link.yaml +++ b/.github/helm-values/prod-us-east-2-delta.neon-proxy-link.yaml @@ -13,10 +13,9 @@ settings: # -- Additional labels for zenith-proxy pods podLabels: - zenith_service: proxy - zenith_env: production - zenith_region: us-east-2 - zenith_region_slug: us-east-2 + neon_service: proxy + neon_env: production + neon_region: us-east-2 service: type: LoadBalancer diff --git a/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml b/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml index 38719f64e7..40fbc52b39 100644 --- a/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml +++ b/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml @@ -31,10 +31,9 @@ settings: # -- Additional labels for neon-proxy pods podLabels: - zenith_service: proxy-scram - zenith_env: prod - zenith_region: us-east-2 - zenith_region_slug: us-east-2 + neon_service: proxy-scram + neon_env: prod + neon_region: us-east-2 exposedService: annotations: diff --git a/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram-legacy.yaml b/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram-legacy.yaml index d23ea41bd7..a186fb833f 100644 --- a/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram-legacy.yaml +++ b/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram-legacy.yaml @@ -31,10 +31,9 @@ settings: # -- Additional labels for neon-proxy pods podLabels: - zenith_service: proxy-scram - zenith_env: prod - zenith_region: us-west-2 - zenith_region_slug: us-west-2 + neon_service: proxy-scram + neon_env: prod + neon_region: us-west-2 exposedService: annotations: diff --git a/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml b/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml index d5a7d6d575..810a6a5f78 100644 --- a/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml +++ b/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml @@ -31,10 +31,9 @@ settings: # -- Additional labels for neon-proxy pods podLabels: - zenith_service: proxy-scram - zenith_env: prod - zenith_region: us-west-2 - zenith_region_slug: us-west-2 + neon_service: proxy-scram + neon_env: prod + neon_region: us-west-2 exposedService: annotations: From fa54a57ca29f5d04af06b64f5532b38c5430675b Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 30 Mar 2023 18:38:45 +0200 Subject: [PATCH 12/25] random_init_delay: remove the minimum of 10 seconds (#3914) Before this patch, the range from which the random delay is picked is at minimum 10 seconds. With this patch, they delay is bounded to whatever the given `period` is, and zero, if period id Duration::ZERO. Motivation for this: the disk usage eviction tests that we'll add in https://github.com/neondatabase/neon/pull/3905 need to wait for the disk usage eviction background loop to do its job. They set a period of 1s. It seems wasteful to wait 10 seconds in the tests. Co-authored-by: Joonas Koivunen --- pageserver/src/tenant/tasks.rs | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 20d1d2bfb6..8aeacc12f5 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -244,14 +244,12 @@ pub(crate) async fn random_init_delay( ) -> Result<(), Cancelled> { use rand::Rng; + if period == Duration::ZERO { + return Ok(()); + } + let d = { let mut rng = rand::thread_rng(); - - // gen_range asserts that the range cannot be empty, which it could be because period can - // be set to zero to disable gc or compaction, so lets set it to be at least 10s. - let period = std::cmp::max(period, Duration::from_secs(10)); - - // semi-ok default as the source of jitter rng.gen_range(Duration::ZERO..=period) }; From 41d364a8f19aa2b7a9b529fe54c8c75d0a73d38e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lassi=20P=C3=B6l=C3=B6nen?= Date: Thu, 30 Mar 2023 22:02:39 +0300 Subject: [PATCH 13/25] Add more detailed logging to compute_ctl's shutdown (#3915) Currently we don't see from the logs, if shutting down tracing takes long time or not. We do see that shutting down computes gets delayed for some reason and hits thhe grace period limit. Moving the shutdown message to slightly later, when we don't have anything else than just exit left. ## Issue ticket number and link ## Checklist before requesting a review - [x] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. --- compute_tools/src/bin/compute_ctl.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index b96842e416..f29a576413 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -203,13 +203,14 @@ fn main() -> Result<()> { if delay_exit { info!("giving control plane 30s to collect the error before shutdown"); thread::sleep(Duration::from_secs(30)); - info!("shutting down"); } + info!("shutting down tracing"); // Shutdown trace pipeline gracefully, so that it has a chance to send any // pending traces before we exit. tracing_utils::shutdown_tracing(); + info!("shutting down"); exit(exit_code.unwrap_or(1)) } From bf46237fc22800625cf86578ca9027bdfa047d19 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 30 Mar 2023 22:07:19 +0300 Subject: [PATCH 14/25] Fix prefetch for parallel bitmap scan (#3875) ## Describe your changes Fix prefetch for parallel bitmap scan ## Issue ticket number and link ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. --- vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 9fd9794436..757df1dab8 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 9fd9794436d02fbfe68f8fca5beab218907cec41 +Subproject commit 757df1dab82f69bdf69469119420a0bbb307f992 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 257aaefb25..f8a650e49b 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 257aaefb251c5c85c44652c01bf68c43db62748a +Subproject commit f8a650e49b06d39ad131b860117504044b01f312 From a64dd3ecb58f02826d159dfe07a24b7ee52c9b82 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 31 Mar 2023 13:47:57 +0200 Subject: [PATCH 15/25] disk-usage-based layer eviction (#3809) This patch adds a pageserver-global background loop that evicts layers in response to a shortage of available bytes in the $repo/tenants directory's filesystem. The loop runs periodically at a configurable `period`. Each loop iteration uses `statvfs` to determine filesystem-level space usage. It compares the returned usage data against two different types of thresholds. The iteration tries to evict layers until app-internal accounting says we should be below the thresholds. We cross-check this internal accounting with the real world by making another `statvfs` at the end of the iteration. We're good if that second statvfs shows that we're _actually_ below the configured thresholds. If we're still above one or more thresholds, we emit a warning log message, leaving it to the operator to investigate further. There are two thresholds: - `max_usage_pct` is the relative available space, expressed in percent of the total filesystem space. If the actual usage is higher, the threshold is exceeded. - `min_avail_bytes` is the absolute available space in bytes. If the actual usage is lower, the threshold is exceeded. The iteration evicts layers in LRU fashion with a reservation of up to `tenant_min_resident_size` bytes of the most recent layers per tenant. The layers not part of the per-tenant reservation are evicted least-recently-used first until we're below all thresholds. The `tenant_min_resident_size` can be overridden per tenant as `min_resident_size_override` (bytes). In addition to the loop, there is also an HTTP endpoint to perform one loop iteration synchronous to the request. The endpoint takes an absolute number of bytes that the iteration needs to evict before pressure is relieved. The tests use this endpoint, which is a great simplification over setting up loopback-mounts in the tests, which would be required to test the statvfs part of the implementation. We will rely on manual testing in staging to test the statvfs parts. The HTTP endpoint is also handy in emergencies where an operator wants the pageserver to evict a given amount of space _now. Hence, it's arguments documented in openapi_spec.yml. The response type isn't documented though because we don't consider it stable. The endpoint should _not_ be used by Console but it could be used by on-call. Co-authored-by: Joonas Koivunen Co-authored-by: Dmitry Rodionov Co-authored-by: Heikki Linnakangas --- .github/ansible/staging.eu-west-1.hosts.yaml | 8 + .github/ansible/staging.us-east-2.hosts.yaml | 8 + Cargo.lock | 2 + control_plane/src/pageserver.rs | 10 + libs/pageserver_api/src/models.rs | 3 + libs/utils/Cargo.toml | 1 + libs/utils/src/lib.rs | 3 + libs/utils/src/serde_percent.rs | 83 +++ libs/utils/src/serde_regex.rs | 60 ++ pageserver/Cargo.toml | 1 + pageserver/src/bin/pageserver.rs | 27 +- pageserver/src/config.rs | 34 + pageserver/src/disk_usage_eviction_task.rs | 689 ++++++++++++++++++ pageserver/src/http/openapi_spec.yml | 25 + pageserver/src/http/routes.rs | 116 ++- pageserver/src/lib.rs | 2 + pageserver/src/statvfs.rs | 150 ++++ pageserver/src/task_mgr.rs | 3 + pageserver/src/tenant.rs | 10 +- pageserver/src/tenant/config.rs | 9 + pageserver/src/tenant/storage_layer.rs | 17 +- pageserver/src/tenant/timeline.rs | 92 +++ .../src/tenant/timeline/eviction_task.rs | 9 +- test_runner/fixtures/neon_fixtures.py | 34 + .../regress/test_disk_usage_eviction.py | 541 ++++++++++++++ 25 files changed, 1919 insertions(+), 18 deletions(-) create mode 100644 libs/utils/src/serde_percent.rs create mode 100644 libs/utils/src/serde_regex.rs create mode 100644 pageserver/src/disk_usage_eviction_task.rs create mode 100644 pageserver/src/statvfs.rs create mode 100644 test_runner/regress/test_disk_usage_eviction.py diff --git a/.github/ansible/staging.eu-west-1.hosts.yaml b/.github/ansible/staging.eu-west-1.hosts.yaml index b537795704..e8d0bb1dc7 100644 --- a/.github/ansible/staging.eu-west-1.hosts.yaml +++ b/.github/ansible/staging.eu-west-1.hosts.yaml @@ -8,6 +8,14 @@ storage: pg_distrib_dir: /usr/local metric_collection_endpoint: http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events metric_collection_interval: 10min + disk_usage_based_eviction: + max_usage_pct: 80 + # TODO: learn typical resident-size growth rate [GiB/minute] and configure + # min_avail_bytes such that we have X minutes of headroom. + min_avail_bytes: 0 + # We assume that the worst-case growth rate is small enough that we can + # catch above-threshold conditions by checking every 10s. + period: "10s" tenant_config: eviction_policy: kind: "LayerAccessThreshold" diff --git a/.github/ansible/staging.us-east-2.hosts.yaml b/.github/ansible/staging.us-east-2.hosts.yaml index cd8f832af0..4ef51651fc 100644 --- a/.github/ansible/staging.us-east-2.hosts.yaml +++ b/.github/ansible/staging.us-east-2.hosts.yaml @@ -8,6 +8,14 @@ storage: pg_distrib_dir: /usr/local metric_collection_endpoint: http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events metric_collection_interval: 10min + disk_usage_based_eviction: + max_usage_pct: 80 + # TODO: learn typical resident-size growth rate [GiB/minute] and configure + # min_avail_bytes such that we have X minutes of headroom. + min_avail_bytes: 0 + # We assume that the worst-case growth rate is small enough that we can + # catch above-threshold conditions by checking every 10s. + period: "10s" tenant_config: eviction_policy: kind: "LayerAccessThreshold" diff --git a/Cargo.lock b/Cargo.lock index a19a97a40d..4590e76014 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2474,6 +2474,7 @@ dependencies = [ "strum", "strum_macros", "svg_fmt", + "sync_wrapper", "tempfile", "tenant_size_model", "thiserror", @@ -4556,6 +4557,7 @@ dependencies = [ "once_cell", "pin-project-lite", "rand", + "regex", "routerify", "sentry", "serde", diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 3c66400a05..094069e4c0 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -363,6 +363,11 @@ impl PageServerNode { .map(|x| serde_json::from_str(x)) .transpose() .context("Failed to parse 'eviction_policy' json")?, + min_resident_size_override: settings + .remove("min_resident_size_override") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'min_resident_size_override' as integer")?, }; if !settings.is_empty() { bail!("Unrecognized tenant settings: {settings:?}") @@ -435,6 +440,11 @@ impl PageServerNode { .map(|x| serde_json::from_str(x)) .transpose() .context("Failed to parse 'eviction_policy' json")?, + min_resident_size_override: settings + .get("min_resident_size_override") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'min_resident_size_override' as an integer")?, }) .send()? .error_from_body()?; diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 0f860d0a6d..98a4b56858 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -120,6 +120,7 @@ pub struct TenantCreateRequest { // We might do that once the eviction feature has stabilizied. // For now, this field is not even documented in the openapi_spec.yml. pub eviction_policy: Option, + pub min_resident_size_override: Option, } #[serde_as] @@ -165,6 +166,7 @@ pub struct TenantConfigRequest { // We might do that once the eviction feature has stabilizied. // For now, this field is not even documented in the openapi_spec.yml. pub eviction_policy: Option, + pub min_resident_size_override: Option, } impl TenantConfigRequest { @@ -185,6 +187,7 @@ impl TenantConfigRequest { max_lsn_wal_lag: None, trace_read_requests: None, eviction_policy: None, + min_resident_size_override: None, } } } diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index b9f67e82f8..391bc52a80 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -19,6 +19,7 @@ jsonwebtoken.workspace = true nix.workspace = true once_cell.workspace = true pin-project-lite.workspace = true +regex.workspace = true routerify.workspace = true serde.workspace = true serde_json.workspace = true diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index 766d759ab4..d4176911ac 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -51,6 +51,9 @@ pub mod history_buffer; pub mod measured_stream; +pub mod serde_percent; +pub mod serde_regex; + /// use with fail::cfg("$name", "return(2000)") #[macro_export] macro_rules! failpoint_sleep_millis_async { diff --git a/libs/utils/src/serde_percent.rs b/libs/utils/src/serde_percent.rs new file mode 100644 index 0000000000..63b62b5f1e --- /dev/null +++ b/libs/utils/src/serde_percent.rs @@ -0,0 +1,83 @@ +//! A serde::Deserialize type for percentages. +//! +//! See [`Percent`] for details. + +use serde::{Deserialize, Serialize}; + +/// If the value is not an integer between 0 and 100, +/// deserialization fails with a descriptive error. +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] +#[serde(transparent)] +pub struct Percent(#[serde(deserialize_with = "deserialize_pct_0_to_100")] u8); + +impl Percent { + pub fn get(&self) -> u8 { + self.0 + } +} + +fn deserialize_pct_0_to_100<'de, D>(deserializer: D) -> Result +where + D: serde::de::Deserializer<'de>, +{ + let v: u8 = serde::de::Deserialize::deserialize(deserializer)?; + if v > 100 { + return Err(serde::de::Error::custom( + "must be an integer between 0 and 100", + )); + } + Ok(v) +} + +#[cfg(test)] +mod tests { + use super::Percent; + + #[derive(serde::Deserialize, serde::Serialize, Debug, PartialEq, Eq)] + struct Foo { + bar: Percent, + } + + #[test] + fn basics() { + let input = r#"{ "bar": 50 }"#; + let foo: Foo = serde_json::from_str(input).unwrap(); + assert_eq!(foo.bar.get(), 50); + } + #[test] + fn null_handling() { + let input = r#"{ "bar": null }"#; + let res: Result = serde_json::from_str(input); + assert!(res.is_err()); + } + #[test] + fn zero() { + let input = r#"{ "bar": 0 }"#; + let foo: Foo = serde_json::from_str(input).unwrap(); + assert_eq!(foo.bar.get(), 0); + } + #[test] + fn out_of_range_above() { + let input = r#"{ "bar": 101 }"#; + let res: Result = serde_json::from_str(input); + assert!(res.is_err()); + } + #[test] + fn out_of_range_below() { + let input = r#"{ "bar": -1 }"#; + let res: Result = serde_json::from_str(input); + assert!(res.is_err()); + } + #[test] + fn float() { + let input = r#"{ "bar": 50.5 }"#; + let res: Result = serde_json::from_str(input); + assert!(res.is_err()); + } + #[test] + fn string() { + let input = r#"{ "bar": "50 %" }"#; + let res: Result = serde_json::from_str(input); + assert!(res.is_err()); + } +} diff --git a/libs/utils/src/serde_regex.rs b/libs/utils/src/serde_regex.rs new file mode 100644 index 0000000000..95ea4f8e44 --- /dev/null +++ b/libs/utils/src/serde_regex.rs @@ -0,0 +1,60 @@ +//! A `serde::{Deserialize,Serialize}` type for regexes. + +use std::ops::Deref; + +#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +#[serde(transparent)] +pub struct Regex( + #[serde( + deserialize_with = "deserialize_regex", + serialize_with = "serialize_regex" + )] + regex::Regex, +); + +fn deserialize_regex<'de, D>(deserializer: D) -> Result +where + D: serde::de::Deserializer<'de>, +{ + let s: String = serde::de::Deserialize::deserialize(deserializer)?; + let re = regex::Regex::new(&s).map_err(serde::de::Error::custom)?; + Ok(re) +} + +fn serialize_regex(re: ®ex::Regex, serializer: S) -> Result +where + S: serde::ser::Serializer, +{ + serializer.collect_str(re.as_str()) +} + +impl Deref for Regex { + type Target = regex::Regex; + + fn deref(&self) -> ®ex::Regex { + &self.0 + } +} + +impl PartialEq for Regex { + fn eq(&self, other: &Regex) -> bool { + // comparing the automatons would be quite complicated + self.as_str() == other.as_str() + } +} + +impl Eq for Regex {} + +#[cfg(test)] +mod tests { + + #[test] + fn roundtrip() { + let input = r#""foo.*bar""#; + let re: super::Regex = serde_json::from_str(input).unwrap(); + assert!(re.is_match("foo123bar")); + assert!(!re.is_match("foo")); + let output = serde_json::to_string(&re).unwrap(); + assert_eq!(output, input); + } +} diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 8d6641a387..0bc7eba95e 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -48,6 +48,7 @@ serde_json = { workspace = true, features = ["raw_value"] } serde_with.workspace = true signal-hook.workspace = true svg_fmt.workspace = true +sync_wrapper.workspace = true tokio-tar.workspace = true thiserror.workspace = true tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] } diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index cbfd3e1165..ed23a18ee0 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -8,6 +8,7 @@ use anyhow::{anyhow, Context}; use clap::{Arg, ArgAction, Command}; use fail::FailScenario; use metrics::launch_timestamp::{set_launch_timestamp_metric, LaunchTimestamp}; +use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task}; use remote_storage::GenericRemoteStorage; use tracing::*; @@ -314,14 +315,34 @@ fn start_pageserver( // Scan the local 'tenants/' directory and start loading the tenants BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(conf, remote_storage.clone()))?; + // shared state between the disk-usage backed eviction background task and the http endpoint + // that allows triggering disk-usage based eviction manually. note that the http endpoint + // is still accessible even if background task is not configured as long as remote storage has + // been configured. + let disk_usage_eviction_state: Arc = Arc::default(); + + if let Some(remote_storage) = &remote_storage { + launch_disk_usage_global_eviction_task( + conf, + remote_storage.clone(), + disk_usage_eviction_state.clone(), + )?; + } + // Start up the service to handle HTTP mgmt API request. We created the // listener earlier already. { let _rt_guard = MGMT_REQUEST_RUNTIME.enter(); - let router = http::make_router(conf, launch_ts, http_auth, remote_storage)? - .build() - .map_err(|err| anyhow!(err))?; + let router = http::make_router( + conf, + launch_ts, + http_auth, + remote_storage, + disk_usage_eviction_state, + )? + .build() + .map_err(|err| anyhow!(err))?; let service = utils::http::RouterService::new(router).unwrap(); let server = hyper::Server::from_tcp(http_listener)? .serve(service) diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 7293e69f69..19f0f22815 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -27,6 +27,7 @@ use utils::{ logging::LogFormat, }; +use crate::disk_usage_eviction_task::DiskUsageEvictionTaskConfig; use crate::tenant::config::TenantConf; use crate::tenant::config::TenantConfOpt; use crate::tenant::{TENANT_ATTACHING_MARKER_FILENAME, TIMELINES_SEGMENT_NAME}; @@ -92,6 +93,8 @@ pub mod defaults { #evictions_low_residence_duration_metric_threshold = '{DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD}' +#disk_usage_based_eviction = {{ max_usage_pct = .., min_avail_bytes = .., period = "10s"}} + # [tenant_config] #checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes #checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT} @@ -104,6 +107,8 @@ pub mod defaults { #image_creation_threshold = {DEFAULT_IMAGE_CREATION_THRESHOLD} #pitr_interval = '{DEFAULT_PITR_INTERVAL}' +#min_resident_size_override = .. # in bytes + # [remote_storage] "### @@ -180,6 +185,8 @@ pub struct PageServerConf { // See the corresponding metric's help string. pub evictions_low_residence_duration_metric_threshold: Duration, + pub disk_usage_based_eviction: Option, + pub test_remote_failures: u64, pub ondemand_download_behavior_treat_error_as_warn: bool, @@ -252,6 +259,8 @@ struct PageServerConfigBuilder { evictions_low_residence_duration_metric_threshold: BuilderValue, + disk_usage_based_eviction: BuilderValue>, + test_remote_failures: BuilderValue, ondemand_download_behavior_treat_error_as_warn: BuilderValue, @@ -312,6 +321,8 @@ impl Default for PageServerConfigBuilder { ) .expect("cannot parse DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD")), + disk_usage_based_eviction: Set(None), + test_remote_failures: Set(0), ondemand_download_behavior_treat_error_as_warn: Set(false), @@ -431,6 +442,10 @@ impl PageServerConfigBuilder { self.evictions_low_residence_duration_metric_threshold = BuilderValue::Set(value); } + pub fn disk_usage_based_eviction(&mut self, value: Option) { + self.disk_usage_based_eviction = BuilderValue::Set(value); + } + pub fn ondemand_download_behavior_treat_error_as_warn( &mut self, ondemand_download_behavior_treat_error_as_warn: bool, @@ -515,6 +530,9 @@ impl PageServerConfigBuilder { .ok_or(anyhow!( "missing evictions_low_residence_duration_metric_threshold" ))?, + disk_usage_based_eviction: self + .disk_usage_based_eviction + .ok_or(anyhow!("missing disk_usage_based_eviction"))?, test_remote_failures: self .test_remote_failures .ok_or(anyhow!("missing test_remote_failuers"))?, @@ -704,6 +722,12 @@ impl PageServerConf { builder.synthetic_size_calculation_interval(parse_toml_duration(key, item)?), "test_remote_failures" => builder.test_remote_failures(parse_toml_u64(key, item)?), "evictions_low_residence_duration_metric_threshold" => builder.evictions_low_residence_duration_metric_threshold(parse_toml_duration(key, item)?), + "disk_usage_based_eviction" => { + tracing::info!("disk_usage_based_eviction: {:#?}", &item); + builder.disk_usage_based_eviction( + toml_edit::de::from_item(item.clone()) + .context("parse disk_usage_based_eviction")?) + }, "ondemand_download_behavior_treat_error_as_warn" => builder.ondemand_download_behavior_treat_error_as_warn(parse_toml_bool(key, item)?), _ => bail!("unrecognized pageserver option '{key}'"), } @@ -808,6 +832,13 @@ impl PageServerConf { ); } + if let Some(item) = item.get("min_resident_size_override") { + t_conf.min_resident_size_override = Some( + toml_edit::de::from_item(item.clone()) + .context("parse min_resident_size_override")?, + ); + } + Ok(t_conf) } @@ -850,6 +881,7 @@ impl PageServerConf { defaults::DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD, ) .unwrap(), + disk_usage_based_eviction: None, test_remote_failures: 0, ondemand_download_behavior_treat_error_as_warn: false, } @@ -1058,6 +1090,7 @@ log_format = 'json' evictions_low_residence_duration_metric_threshold: humantime::parse_duration( defaults::DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD )?, + disk_usage_based_eviction: None, test_remote_failures: 0, ondemand_download_behavior_treat_error_as_warn: false, }, @@ -1112,6 +1145,7 @@ log_format = 'json' metric_collection_endpoint: Some(Url::parse("http://localhost:80/metrics")?), synthetic_size_calculation_interval: Duration::from_secs(333), evictions_low_residence_duration_metric_threshold: Duration::from_secs(444), + disk_usage_based_eviction: None, test_remote_failures: 0, ondemand_download_behavior_treat_error_as_warn: false, }, diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs new file mode 100644 index 0000000000..eeeb6fda89 --- /dev/null +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -0,0 +1,689 @@ +//! This module implements the pageserver-global disk-usage-based layer eviction task. +//! +//! # Mechanics +//! +//! Function `launch_disk_usage_global_eviction_task` starts a pageserver-global background +//! loop that evicts layers in response to a shortage of available bytes +//! in the $repo/tenants directory's filesystem. +//! +//! The loop runs periodically at a configurable `period`. +//! +//! Each loop iteration uses `statvfs` to determine filesystem-level space usage. +//! It compares the returned usage data against two different types of thresholds. +//! The iteration tries to evict layers until app-internal accounting says we should be below the thresholds. +//! We cross-check this internal accounting with the real world by making another `statvfs` at the end of the iteration. +//! We're good if that second statvfs shows that we're _actually_ below the configured thresholds. +//! If we're still above one or more thresholds, we emit a warning log message, leaving it to the operator to investigate further. +//! +//! # Eviction Policy +//! +//! There are two thresholds: +//! `max_usage_pct` is the relative available space, expressed in percent of the total filesystem space. +//! If the actual usage is higher, the threshold is exceeded. +//! `min_avail_bytes` is the absolute available space in bytes. +//! If the actual usage is lower, the threshold is exceeded. +//! If either of these thresholds is exceeded, the system is considered to have "disk pressure", and eviction +//! is performed on the next iteration, to release disk space and bring the usage below the thresholds again. +//! The iteration evicts layers in LRU fashion, but, with a weak reservation per tenant. +//! The reservation is to keep the most recently accessed X bytes per tenant resident. +//! If we cannot relieve pressure by evicting layers outside of the reservation, we +//! start evicting layers that are part of the reservation, LRU first. +//! +//! The value for the per-tenant reservation is referred to as `tenant_min_resident_size` +//! throughout the code, but, no actual variable carries that name. +//! The per-tenant default value is the `max(tenant's layer file sizes, regardless of local or remote)`. +//! The idea is to allow at least one layer to be resident per tenant, to ensure it can make forward progress +//! during page reconstruction. +//! An alternative default for all tenants can be specified in the `tenant_config` section of the config. +//! Lastly, each tenant can have an override in their respective tenant config (`min_resident_size_override`). + +// Implementation notes: +// - The `#[allow(dead_code)]` above various structs are to suppress warnings about only the Debug impl +// reading these fields. We use the Debug impl for semi-structured logging, though. + +use std::{ + collections::HashMap, + path::Path, + sync::Arc, + time::{Duration, SystemTime}, +}; + +use anyhow::Context; +use remote_storage::GenericRemoteStorage; +use serde::{Deserialize, Serialize}; +use tokio::time::Instant; +use tokio_util::sync::CancellationToken; +use tracing::{debug, error, info, instrument, warn, Instrument}; +use utils::serde_percent::Percent; + +use crate::{ + config::PageServerConf, + task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}, + tenant::{self, storage_layer::PersistentLayer, Timeline}, +}; + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct DiskUsageEvictionTaskConfig { + pub max_usage_pct: Percent, + pub min_avail_bytes: u64, + #[serde(with = "humantime_serde")] + pub period: Duration, + #[cfg(feature = "testing")] + pub mock_statvfs: Option, +} + +#[derive(Default)] +pub struct State { + /// Exclude http requests and background task from running at the same time. + mutex: tokio::sync::Mutex<()>, +} + +pub fn launch_disk_usage_global_eviction_task( + conf: &'static PageServerConf, + storage: GenericRemoteStorage, + state: Arc, +) -> anyhow::Result<()> { + let Some(task_config) = &conf.disk_usage_based_eviction else { + info!("disk usage based eviction task not configured"); + return Ok(()); + }; + + info!("launching disk usage based eviction task"); + + task_mgr::spawn( + BACKGROUND_RUNTIME.handle(), + TaskKind::DiskUsageEviction, + None, + None, + "disk usage based eviction", + false, + async move { + disk_usage_eviction_task( + &state, + task_config, + storage, + &conf.tenants_path(), + task_mgr::shutdown_token(), + ) + .await; + info!("disk usage based eviction task finishing"); + Ok(()) + }, + ); + + Ok(()) +} + +#[instrument(skip_all)] +async fn disk_usage_eviction_task( + state: &State, + task_config: &DiskUsageEvictionTaskConfig, + storage: GenericRemoteStorage, + tenants_dir: &Path, + cancel: CancellationToken, +) { + use crate::tenant::tasks::random_init_delay; + { + if random_init_delay(task_config.period, &cancel) + .await + .is_err() + { + info!("shutting down"); + return; + } + } + + let mut iteration_no = 0; + loop { + iteration_no += 1; + let start = Instant::now(); + + async { + let res = disk_usage_eviction_task_iteration( + state, + task_config, + &storage, + tenants_dir, + &cancel, + ) + .await; + + match res { + Ok(()) => {} + Err(e) => { + // these stat failures are expected to be very rare + warn!("iteration failed, unexpected error: {e:#}"); + } + } + } + .instrument(tracing::info_span!("iteration", iteration_no)) + .await; + + let sleep_until = start + task_config.period; + tokio::select! { + _ = tokio::time::sleep_until(sleep_until) => {}, + _ = cancel.cancelled() => { + info!("shutting down"); + break + } + } + } +} + +pub trait Usage: Clone + Copy + std::fmt::Debug { + fn has_pressure(&self) -> bool; + fn add_available_bytes(&mut self, bytes: u64); +} + +async fn disk_usage_eviction_task_iteration( + state: &State, + task_config: &DiskUsageEvictionTaskConfig, + storage: &GenericRemoteStorage, + tenants_dir: &Path, + cancel: &CancellationToken, +) -> anyhow::Result<()> { + let usage_pre = filesystem_level_usage::get(tenants_dir, task_config) + .context("get filesystem-level disk usage before evictions")?; + let res = disk_usage_eviction_task_iteration_impl(state, storage, usage_pre, cancel).await; + match res { + Ok(outcome) => { + debug!(?outcome, "disk_usage_eviction_iteration finished"); + match outcome { + IterationOutcome::NoPressure | IterationOutcome::Cancelled => { + // nothing to do, select statement below will handle things + } + IterationOutcome::Finished(outcome) => { + // Verify with statvfs whether we made any real progress + let after = filesystem_level_usage::get(tenants_dir, task_config) + // It's quite unlikely to hit the error here. Keep the code simple and bail out. + .context("get filesystem-level disk usage after evictions")?; + + debug!(?after, "disk usage"); + + if after.has_pressure() { + // Don't bother doing an out-of-order iteration here now. + // In practice, the task period is set to a value in the tens-of-seconds range, + // which will cause another iteration to happen soon enough. + // TODO: deltas between the three different usages would be helpful, + // consider MiB, GiB, TiB + warn!(?outcome, ?after, "disk usage still high"); + } else { + info!(?outcome, ?after, "disk usage pressure relieved"); + } + } + } + } + Err(e) => { + error!("disk_usage_eviction_iteration failed: {:#}", e); + } + } + + Ok(()) +} + +#[derive(Debug, Serialize)] +#[allow(clippy::large_enum_variant)] +pub enum IterationOutcome { + NoPressure, + Cancelled, + Finished(IterationOutcomeFinished), +} + +#[allow(dead_code)] +#[derive(Debug, Serialize)] +pub struct IterationOutcomeFinished { + /// The actual usage observed before we started the iteration. + before: U, + /// The expected value for `after`, according to internal accounting, after phase 1. + planned: PlannedUsage, + /// The outcome of phase 2, where we actually do the evictions. + /// + /// If all layers that phase 1 planned to evict _can_ actually get evicted, this will + /// be the same as `planned`. + assumed: AssumedUsage, +} + +#[derive(Debug, Serialize)] +#[allow(dead_code)] +struct AssumedUsage { + /// The expected value for `after`, after phase 2. + projected_after: U, + /// The layers we failed to evict during phase 2. + failed: LayerCount, +} + +#[allow(dead_code)] +#[derive(Debug, Serialize)] +struct PlannedUsage { + respecting_tenant_min_resident_size: U, + fallback_to_global_lru: Option, +} + +#[allow(dead_code)] +#[derive(Debug, Default, Serialize)] +struct LayerCount { + file_sizes: u64, + count: usize, +} + +pub async fn disk_usage_eviction_task_iteration_impl( + state: &State, + storage: &GenericRemoteStorage, + usage_pre: U, + cancel: &CancellationToken, +) -> anyhow::Result> { + // use tokio's mutex to get a Sync guard (instead of std::sync::Mutex) + let _g = state + .mutex + .try_lock() + .map_err(|_| anyhow::anyhow!("iteration is already executing"))?; + + debug!(?usage_pre, "disk usage"); + + if !usage_pre.has_pressure() { + return Ok(IterationOutcome::NoPressure); + } + + warn!( + ?usage_pre, + "running disk usage based eviction due to pressure" + ); + + let candidates = match collect_eviction_candidates(cancel).await? { + EvictionCandidates::Cancelled => { + return Ok(IterationOutcome::Cancelled); + } + EvictionCandidates::Finished(partitioned) => partitioned, + }; + + // Debug-log the list of candidates + let now = SystemTime::now(); + for (i, (partition, candidate)) in candidates.iter().enumerate() { + debug!( + "cand {}/{}: size={}, no_access_for={}us, parition={:?}, tenant={} timeline={} layer={}", + i + 1, + candidates.len(), + candidate.layer.file_size(), + now.duration_since(candidate.last_activity_ts) + .unwrap() + .as_micros(), + partition, + candidate.layer.get_tenant_id(), + candidate.layer.get_timeline_id(), + candidate.layer.filename().file_name(), + ); + } + + // phase1: select victims to relieve pressure + // + // Walk through the list of candidates, until we have accumulated enough layers to get + // us back under the pressure threshold. 'usage_planned' is updated so that it tracks + // how much disk space would be used after evicting all the layers up to the current + // point in the list. The layers are collected in 'batched', grouped per timeline. + // + // If we get far enough in the list that we start to evict layers that are below + // the tenant's min-resident-size threshold, print a warning, and memorize the disk + // usage at that point, in 'usage_planned_min_resident_size_respecting'. + let mut batched: HashMap<_, Vec>> = HashMap::new(); + let mut warned = None; + let mut usage_planned = usage_pre; + for (i, (partition, candidate)) in candidates.into_iter().enumerate() { + if !usage_planned.has_pressure() { + debug!( + no_candidates_evicted = i, + "took enough candidates for pressure to be relieved" + ); + break; + } + + if partition == MinResidentSizePartition::Below && warned.is_none() { + warn!(?usage_pre, ?usage_planned, candidate_no=i, "tenant_min_resident_size-respecting LRU would not relieve pressure, evicting more following global LRU policy"); + warned = Some(usage_planned); + } + + usage_planned.add_available_bytes(candidate.layer.file_size()); + + batched + .entry(TimelineKey(candidate.timeline)) + .or_default() + .push(candidate.layer); + } + + let usage_planned = match warned { + Some(respecting_tenant_min_resident_size) => PlannedUsage { + respecting_tenant_min_resident_size, + fallback_to_global_lru: Some(usage_planned), + }, + None => PlannedUsage { + respecting_tenant_min_resident_size: usage_planned, + fallback_to_global_lru: None, + }, + }; + debug!(?usage_planned, "usage planned"); + + // phase2: evict victims batched by timeline + + // After the loop, `usage_assumed` is the post-eviction usage, + // according to internal accounting. + let mut usage_assumed = usage_pre; + let mut evictions_failed = LayerCount::default(); + for (timeline, batch) in batched { + let tenant_id = timeline.tenant_id; + let timeline_id = timeline.timeline_id; + let batch_size = batch.len(); + + debug!(%timeline_id, "evicting batch for timeline"); + + async { + let results = timeline.evict_layers(storage, &batch, cancel.clone()).await; + + match results { + Err(e) => { + warn!("failed to evict batch: {:#}", e); + } + Ok(results) => { + assert_eq!(results.len(), batch.len()); + for (result, layer) in results.into_iter().zip(batch.iter()) { + match result { + Some(Ok(true)) => { + usage_assumed.add_available_bytes(layer.file_size()); + } + Some(Ok(false)) => { + // this is: + // - Replacement::{NotFound, Unexpected} + // - it cannot be is_remote_layer, filtered already + evictions_failed.file_sizes += layer.file_size(); + evictions_failed.count += 1; + } + None => { + assert!(cancel.is_cancelled()); + return; + } + Some(Err(e)) => { + // we really shouldn't be getting this, precondition failure + error!("failed to evict layer: {:#}", e); + } + } + } + } + } + } + .instrument(tracing::info_span!("evict_batch", %tenant_id, %timeline_id, batch_size)) + .await; + + if cancel.is_cancelled() { + return Ok(IterationOutcome::Cancelled); + } + } + + Ok(IterationOutcome::Finished(IterationOutcomeFinished { + before: usage_pre, + planned: usage_planned, + assumed: AssumedUsage { + projected_after: usage_assumed, + failed: evictions_failed, + }, + })) +} + +#[derive(Clone)] +struct EvictionCandidate { + timeline: Arc, + layer: Arc, + last_activity_ts: SystemTime, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] +enum MinResidentSizePartition { + Above, + Below, +} + +enum EvictionCandidates { + Cancelled, + Finished(Vec<(MinResidentSizePartition, EvictionCandidate)>), +} + +/// Gather the eviction candidates. +/// +/// The returned `Ok(EvictionCandidates::Finished(candidates))` is sorted in eviction +/// order. A caller that evicts in that order, until pressure is relieved, implements +/// the eviction policy outlined in the module comment. +/// +/// # Example +/// +/// Imagine that there are two tenants, A and B, with five layers each, a-e. +/// Each layer has size 100, and both tenant's min_resident_size is 150. +/// The eviction order would be +/// +/// ```text +/// partition last_activity_ts tenant/layer +/// Above 18:30 A/c +/// Above 19:00 A/b +/// Above 18:29 B/c +/// Above 19:05 B/b +/// Above 20:00 B/a +/// Above 20:03 A/a +/// Below 20:30 A/d +/// Below 20:40 B/d +/// Below 20:45 B/e +/// Below 20:58 A/e +/// ``` +/// +/// Now, if we need to evict 300 bytes to relieve pressure, we'd evict `A/c, A/b, B/c`. +/// They are all in the `Above` partition, so, we respected each tenant's min_resident_size. +/// +/// But, if we need to evict 900 bytes to relieve pressure, we'd evict +/// `A/c, A/b, B/c, B/b, B/a, A/a, A/d, B/d, B/e`, reaching into the `Below` partition +/// after exhauting the `Above` partition. +/// So, we did not respect each tenant's min_resident_size. +async fn collect_eviction_candidates( + cancel: &CancellationToken, +) -> anyhow::Result { + // get a snapshot of the list of tenants + let tenants = tenant::mgr::list_tenants() + .await + .context("get list of tenants")?; + + let mut candidates = Vec::new(); + + for (tenant_id, _state) in &tenants { + if cancel.is_cancelled() { + return Ok(EvictionCandidates::Cancelled); + } + let tenant = match tenant::mgr::get_tenant(*tenant_id, true).await { + Ok(tenant) => tenant, + Err(e) => { + // this can happen if tenant has lifecycle transition after we fetched it + debug!("failed to get tenant: {e:#}"); + continue; + } + }; + + // collect layers from all timelines in this tenant + // + // If one of the timelines becomes `!is_active()` during the iteration, + // for example because we're shutting down, then `max_layer_size` can be too small. + // That's OK. This code only runs under a disk pressure situation, and being + // a little unfair to tenants during shutdown in such a situation is tolerable. + let mut tenant_candidates = Vec::new(); + let mut max_layer_size = 0; + for tl in tenant.list_timelines() { + if !tl.is_active() { + continue; + } + let info = tl.get_local_layers_for_disk_usage_eviction(); + debug!(tenant_id=%tl.tenant_id, timeline_id=%tl.timeline_id, "timeline resident layers count: {}", info.resident_layers.len()); + tenant_candidates.extend( + info.resident_layers + .into_iter() + .map(|layer_infos| (tl.clone(), layer_infos)), + ); + max_layer_size = max_layer_size.max(info.max_layer_size.unwrap_or(0)); + + if cancel.is_cancelled() { + return Ok(EvictionCandidates::Cancelled); + } + } + + // `min_resident_size` defaults to maximum layer file size of the tenant. + // This ensures that each tenant can have at least one layer resident at a given time, + // ensuring forward progress for a single Timeline::get in that tenant. + // It's a questionable heuristic since, usually, there are many Timeline::get + // requests going on for a tenant, and, at least in Neon prod, the median + // layer file size is much smaller than the compaction target size. + // We could be better here, e.g., sum of all L0 layers + most recent L1 layer. + // That's what's typically used by the various background loops. + // + // The default can be overriden with a fixed value in the tenant conf. + // A default override can be put in the default tenant conf in the pageserver.toml. + let min_resident_size = if let Some(s) = tenant.get_min_resident_size_override() { + debug!( + tenant_id=%tenant.tenant_id(), + overriden_size=s, + "using overridden min resident size for tenant" + ); + s + } else { + debug!( + tenant_id=%tenant.tenant_id(), + max_layer_size, + "using max layer size as min_resident_size for tenant", + ); + max_layer_size + }; + + // Sort layers most-recently-used first, then partition by + // cumsum above/below min_resident_size. + tenant_candidates + .sort_unstable_by_key(|(_, layer_info)| std::cmp::Reverse(layer_info.last_activity_ts)); + let mut cumsum: i128 = 0; + for (timeline, layer_info) in tenant_candidates.into_iter() { + let file_size = layer_info.file_size(); + let candidate = EvictionCandidate { + timeline, + last_activity_ts: layer_info.last_activity_ts, + layer: layer_info.layer, + }; + let partition = if cumsum > min_resident_size as i128 { + MinResidentSizePartition::Above + } else { + MinResidentSizePartition::Below + }; + candidates.push((partition, candidate)); + cumsum += i128::from(file_size); + } + } + + debug_assert!(MinResidentSizePartition::Above < MinResidentSizePartition::Below, + "as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first"); + candidates + .sort_unstable_by_key(|(partition, candidate)| (*partition, candidate.last_activity_ts)); + + Ok(EvictionCandidates::Finished(candidates)) +} + +struct TimelineKey(Arc); + +impl PartialEq for TimelineKey { + fn eq(&self, other: &Self) -> bool { + Arc::ptr_eq(&self.0, &other.0) + } +} + +impl Eq for TimelineKey {} + +impl std::hash::Hash for TimelineKey { + fn hash(&self, state: &mut H) { + Arc::as_ptr(&self.0).hash(state); + } +} + +impl std::ops::Deref for TimelineKey { + type Target = Timeline; + + fn deref(&self) -> &Self::Target { + self.0.as_ref() + } +} + +mod filesystem_level_usage { + use std::path::Path; + + use anyhow::Context; + + use crate::statvfs::Statvfs; + + use super::DiskUsageEvictionTaskConfig; + + #[derive(Debug, Clone, Copy)] + #[allow(dead_code)] + pub struct Usage<'a> { + config: &'a DiskUsageEvictionTaskConfig, + + /// Filesystem capacity + total_bytes: u64, + /// Free filesystem space + avail_bytes: u64, + } + + impl super::Usage for Usage<'_> { + fn has_pressure(&self) -> bool { + let usage_pct = + (100.0 * (1.0 - ((self.avail_bytes as f64) / (self.total_bytes as f64)))) as u64; + + let pressures = [ + ( + "min_avail_bytes", + self.avail_bytes < self.config.min_avail_bytes, + ), + ( + "max_usage_pct", + usage_pct > self.config.max_usage_pct.get() as u64, + ), + ]; + + pressures.into_iter().any(|(_, has_pressure)| has_pressure) + } + + fn add_available_bytes(&mut self, bytes: u64) { + self.avail_bytes += bytes; + } + } + + pub fn get<'a>( + tenants_dir: &Path, + config: &'a DiskUsageEvictionTaskConfig, + ) -> anyhow::Result> { + let mock_config = { + #[cfg(feature = "testing")] + { + config.mock_statvfs.as_ref() + } + #[cfg(not(feature = "testing"))] + { + None + } + }; + + let stat = Statvfs::get(tenants_dir, mock_config) + .context("statvfs failed, presumably directory got unlinked")?; + + // https://unix.stackexchange.com/a/703650 + let blocksize = if stat.fragment_size() > 0 { + stat.fragment_size() + } else { + stat.block_size() + }; + + // use blocks_available (b_avail) since, pageserver runs as unprivileged user + let avail_bytes = stat.blocks_available() * blocksize; + let total_bytes = stat.blocks() * blocksize; + + Ok(Usage { + config, + total_bytes, + avail_bytes, + }) + } +} diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index eda4a60e95..478e9d228a 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -27,6 +27,31 @@ paths: id: type: integer + /v1/disk_usage_eviction/run: + put: + description: Do an iteration of disk-usage-based eviction to evict a given amount of disk space. + security: [] + requestBody: + content: + application/json: + schema: + type: object + required: + - evict_bytes + properties: + evict_bytes: + type: integer + responses: + "200": + description: | + The run completed. + This does not necessarily mean that we actually evicted `evict_bytes`. + Examine the returned object for detail, or, just watch the actual effect of the call using `du` or `df`. + content: + application/json: + schema: + type: object + /v1/tenant/{tenant_id}: parameters: - name: tenant_id diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index b0addc82f1..2db60f557d 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -18,6 +18,7 @@ use super::models::{ TimelineCreateRequest, TimelineGcRequest, TimelineInfo, }; use crate::context::{DownloadBehavior, RequestContext}; +use crate::disk_usage_eviction_task; use crate::pgdatadir_mapping::LsnForTimestamp; use crate::task_mgr::TaskKind; use crate::tenant::config::TenantConfOpt; @@ -48,6 +49,7 @@ struct State { auth: Option>, allowlist_routes: Vec, remote_storage: Option, + disk_usage_eviction_state: Arc, } impl State { @@ -55,6 +57,7 @@ impl State { conf: &'static PageServerConf, auth: Option>, remote_storage: Option, + disk_usage_eviction_state: Arc, ) -> anyhow::Result { let allowlist_routes = ["/v1/status", "/v1/doc", "/swagger.yml"] .iter() @@ -65,6 +68,7 @@ impl State { auth, allowlist_routes, remote_storage, + disk_usage_eviction_state, }) } } @@ -775,6 +779,8 @@ async fn tenant_create_handler(mut request: Request) -> Result) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&r, "tenant_id")?; + + let tenant = crate::tenant::mgr::get_tenant(tenant_id, true) + .await + .map_err(|_| ApiError::Conflict(String::from("no active tenant found")))?; + + tenant.set_broken("broken from test"); + + json_response(StatusCode::OK, ()) +} + #[cfg(feature = "testing")] async fn failpoints_handler(mut request: Request) -> Result, ApiError> { if !fail::has_failpoints() { @@ -1063,6 +1085,89 @@ async fn always_panic_handler(req: Request) -> Result, ApiE json_response(StatusCode::NO_CONTENT, ()) } +async fn disk_usage_eviction_run(mut r: Request) -> Result, ApiError> { + check_permission(&r, None)?; + + #[derive(Debug, Clone, Copy, serde::Serialize, serde::Deserialize)] + struct Config { + /// How many bytes to evict before reporting that pressure is relieved. + evict_bytes: u64, + } + + #[derive(Debug, Clone, Copy, serde::Serialize)] + struct Usage { + // remains unchanged after instantiation of the struct + config: Config, + // updated by `add_available_bytes` + freed_bytes: u64, + } + + impl crate::disk_usage_eviction_task::Usage for Usage { + fn has_pressure(&self) -> bool { + self.config.evict_bytes > self.freed_bytes + } + + fn add_available_bytes(&mut self, bytes: u64) { + self.freed_bytes += bytes; + } + } + + let config = json_request::(&mut r) + .await + .map_err(|_| ApiError::BadRequest(anyhow::anyhow!("invalid JSON body")))?; + + let usage = Usage { + config, + freed_bytes: 0, + }; + + use crate::task_mgr::MGMT_REQUEST_RUNTIME; + + let (tx, rx) = tokio::sync::oneshot::channel(); + + let state = get_state(&r); + + let Some(storage) = state.remote_storage.clone() else { + return Err(ApiError::InternalServerError(anyhow::anyhow!( + "remote storage not configured, cannot run eviction iteration" + ))) + }; + + let state = state.disk_usage_eviction_state.clone(); + + let cancel = CancellationToken::new(); + let child_cancel = cancel.clone(); + let _g = cancel.drop_guard(); + + crate::task_mgr::spawn( + MGMT_REQUEST_RUNTIME.handle(), + TaskKind::DiskUsageEviction, + None, + None, + "ondemand disk usage eviction", + false, + async move { + let res = crate::disk_usage_eviction_task::disk_usage_eviction_task_iteration_impl( + &state, + &storage, + usage, + &child_cancel, + ) + .await; + + info!(?res, "disk_usage_eviction_task_iteration_impl finished"); + + let _ = tx.send(res); + Ok(()) + } + .in_current_span(), + ); + + let response = rx.await.unwrap().map_err(ApiError::InternalServerError)?; + + json_response(StatusCode::OK, response) +} + async fn handler_404(_: Request) -> Result, ApiError> { json_response( StatusCode::NOT_FOUND, @@ -1075,6 +1180,7 @@ pub fn make_router( launch_ts: &'static LaunchTimestamp, auth: Option>, remote_storage: Option, + disk_usage_eviction_state: Arc, ) -> anyhow::Result> { let spec = include_bytes!("openapi_spec.yml"); let mut router = attach_openapi_ui(endpoint::make_router(), spec, "/swagger.yml", "/v1/doc"); @@ -1119,7 +1225,8 @@ pub fn make_router( Ok(router .data(Arc::new( - State::new(conf, auth, remote_storage).context("Failed to initialize router state")?, + State::new(conf, auth, remote_storage, disk_usage_eviction_state) + .context("Failed to initialize router state")?, )) .get("/v1/status", |r| RequestSpan(status_handler).handle(r)) .put( @@ -1200,6 +1307,13 @@ pub fn make_router( "/v1/tenant/:tenant_id/timeline/:timeline_id/layer/:layer_file_name", |r| RequestSpan(evict_timeline_layer_handler).handle(r), ) + .put("/v1/disk_usage_eviction/run", |r| { + RequestSpan(disk_usage_eviction_run).handle(r) + }) + .put( + "/v1/tenant/:tenant_id/break", + testing_api!("set tenant state to broken", handle_tenant_break), + ) .get("/v1/panic", |r| RequestSpan(always_panic_handler).handle(r)) .any(handler_404)) } diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 09e21ae755..278658eba3 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -4,6 +4,7 @@ pub mod broker_client; pub mod config; pub mod consumption_metrics; pub mod context; +pub mod disk_usage_eviction_task; pub mod http; pub mod import_datadir; pub mod keyspace; @@ -12,6 +13,7 @@ pub mod page_cache; pub mod page_service; pub mod pgdatadir_mapping; pub mod repository; +pub(crate) mod statvfs; pub mod task_mgr; pub mod tenant; pub mod trace; diff --git a/pageserver/src/statvfs.rs b/pageserver/src/statvfs.rs new file mode 100644 index 0000000000..28d950b5e6 --- /dev/null +++ b/pageserver/src/statvfs.rs @@ -0,0 +1,150 @@ +//! Wrapper around nix::sys::statvfs::Statvfs that allows for mocking. + +use std::path::Path; + +pub enum Statvfs { + Real(nix::sys::statvfs::Statvfs), + Mock(mock::Statvfs), +} + +// NB: on macOS, the block count type of struct statvfs is u32. +// The workaround seems to be to use the non-standard statfs64 call. +// Sincce it should only be a problem on > 2TiB disks, let's ignore +// the problem for now and upcast to u64. +impl Statvfs { + pub fn get(tenants_dir: &Path, mocked: Option<&mock::Behavior>) -> nix::Result { + if let Some(mocked) = mocked { + Ok(Statvfs::Mock(mock::get(tenants_dir, mocked)?)) + } else { + Ok(Statvfs::Real(nix::sys::statvfs::statvfs(tenants_dir)?)) + } + } + + // NB: allow() because the block count type is u32 on macOS. + #[allow(clippy::useless_conversion)] + pub fn blocks(&self) -> u64 { + match self { + Statvfs::Real(stat) => u64::try_from(stat.blocks()).unwrap(), + Statvfs::Mock(stat) => stat.blocks, + } + } + + // NB: allow() because the block count type is u32 on macOS. + #[allow(clippy::useless_conversion)] + pub fn blocks_available(&self) -> u64 { + match self { + Statvfs::Real(stat) => u64::try_from(stat.blocks_available()).unwrap(), + Statvfs::Mock(stat) => stat.blocks_available, + } + } + + pub fn fragment_size(&self) -> u64 { + match self { + Statvfs::Real(stat) => stat.fragment_size(), + Statvfs::Mock(stat) => stat.fragment_size, + } + } + + pub fn block_size(&self) -> u64 { + match self { + Statvfs::Real(stat) => stat.block_size(), + Statvfs::Mock(stat) => stat.block_size, + } + } +} + +pub mod mock { + use anyhow::Context; + use regex::Regex; + use std::path::Path; + use tracing::log::info; + + #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] + #[serde(tag = "type")] + pub enum Behavior { + Success { + blocksize: u64, + total_blocks: u64, + name_filter: Option, + }, + Failure { + mocked_error: MockedError, + }, + } + + #[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)] + #[allow(clippy::upper_case_acronyms)] + pub enum MockedError { + EIO, + } + + impl From for nix::Error { + fn from(e: MockedError) -> Self { + match e { + MockedError::EIO => nix::Error::EIO, + } + } + } + + pub fn get(tenants_dir: &Path, behavior: &Behavior) -> nix::Result { + info!("running mocked statvfs"); + + match behavior { + Behavior::Success { + blocksize, + total_blocks, + ref name_filter, + } => { + let used_bytes = walk_dir_disk_usage(tenants_dir, name_filter.as_deref()).unwrap(); + + // round it up to the nearest block multiple + let used_blocks = (used_bytes + (blocksize - 1)) / blocksize; + + if used_blocks > *total_blocks { + panic!( + "mocking error: used_blocks > total_blocks: {used_blocks} > {total_blocks}" + ); + } + + let avail_blocks = total_blocks - used_blocks; + + Ok(Statvfs { + blocks: *total_blocks, + blocks_available: avail_blocks, + fragment_size: *blocksize, + block_size: *blocksize, + }) + } + Behavior::Failure { mocked_error } => Err((*mocked_error).into()), + } + } + + fn walk_dir_disk_usage(path: &Path, name_filter: Option<&Regex>) -> anyhow::Result { + let mut total = 0; + for entry in walkdir::WalkDir::new(path) { + let entry = entry?; + if !entry.file_type().is_file() { + continue; + } + if !name_filter + .as_ref() + .map(|filter| filter.is_match(entry.file_name().to_str().unwrap())) + .unwrap_or(true) + { + continue; + } + total += entry + .metadata() + .with_context(|| format!("get metadata of {:?}", entry.path()))? + .len(); + } + Ok(total) + } + + pub struct Statvfs { + pub blocks: u64, + pub blocks_available: u64, + pub fragment_size: u64, + pub block_size: u64, + } +} diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index 44b1bbb06d..82aebc6c07 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -234,6 +234,9 @@ pub enum TaskKind { // Eviction. One per timeline. Eviction, + /// See [`crate::disk_usage_eviction_task`]. + DiskUsageEviction, + // Initial logical size calculation InitialLogicalSizeCalculation, diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 2c5226e5bc..7fac7d2ac0 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -95,7 +95,7 @@ mod timeline; pub mod size; -pub use timeline::{PageReconstructError, Timeline}; +pub use timeline::{LocalLayerInfoForDiskUsageEviction, PageReconstructError, Timeline}; // re-export this function so that page_cache.rs can use it. pub use crate::tenant::ephemeral_file::writeback as writeback_ephemeral_file; @@ -1706,6 +1706,13 @@ impl Tenant { .unwrap_or(self.conf.default_tenant_conf.trace_read_requests) } + pub fn get_min_resident_size_override(&self) -> Option { + let tenant_conf = self.tenant_conf.read().unwrap(); + tenant_conf + .min_resident_size_override + .or(self.conf.default_tenant_conf.min_resident_size_override) + } + pub fn set_new_tenant_config(&self, new_tenant_conf: TenantConfOpt) { *self.tenant_conf.write().unwrap() = new_tenant_conf; } @@ -2783,6 +2790,7 @@ pub mod harness { max_lsn_wal_lag: Some(tenant_conf.max_lsn_wal_lag), trace_read_requests: Some(tenant_conf.trace_read_requests), eviction_policy: Some(tenant_conf.eviction_policy), + min_resident_size_override: tenant_conf.min_resident_size_override, } } } diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index 48cb6be121..cdabb23a7b 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -92,6 +92,7 @@ pub struct TenantConf { pub max_lsn_wal_lag: NonZeroU64, pub trace_read_requests: bool, pub eviction_policy: EvictionPolicy, + pub min_resident_size_override: Option, } /// Same as TenantConf, but this struct preserves the information about @@ -159,6 +160,10 @@ pub struct TenantConfOpt { #[serde(skip_serializing_if = "Option::is_none")] #[serde(default)] pub eviction_policy: Option, + + #[serde(skip_serializing_if = "Option::is_none")] + #[serde(default)] + pub min_resident_size_override: Option, } #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] @@ -220,6 +225,9 @@ impl TenantConfOpt { .trace_read_requests .unwrap_or(global_conf.trace_read_requests), eviction_policy: self.eviction_policy.unwrap_or(global_conf.eviction_policy), + min_resident_size_override: self + .min_resident_size_override + .or(global_conf.min_resident_size_override), } } } @@ -251,6 +259,7 @@ impl Default for TenantConf { .expect("cannot parse default max walreceiver Lsn wal lag"), trace_read_requests: false, eviction_policy: EvictionPolicy::NoEviction, + min_resident_size_override: None, } } } diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index c36b6121c0..2ee723e7c3 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -121,10 +121,10 @@ struct LayerAccessStatsInner { } #[derive(Debug, Clone, Copy)] -pub(super) struct LayerAccessStatFullDetails { - pub(super) when: SystemTime, - pub(super) task_kind: TaskKind, - pub(super) access_kind: LayerAccessKind, +pub(crate) struct LayerAccessStatFullDetails { + pub(crate) when: SystemTime, + pub(crate) task_kind: TaskKind, + pub(crate) access_kind: LayerAccessKind, } #[derive(Clone, Copy, strum_macros::EnumString)] @@ -255,7 +255,7 @@ impl LayerAccessStats { ret } - pub(super) fn most_recent_access_or_residence_event( + fn most_recent_access_or_residence_event( &self, ) -> Either { let locked = self.0.lock().unwrap(); @@ -268,6 +268,13 @@ impl LayerAccessStats { } } } + + pub(crate) fn latest_activity(&self) -> SystemTime { + match self.most_recent_access_or_residence_event() { + Either::Left(mra) => mra.when, + Either::Right(re) => re.timestamp, + } + } } /// Supertrait of the [`Layer`] trait that captures the bare minimum interface diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index e1db34ec1b..b40cb05411 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -13,6 +13,7 @@ use pageserver_api::models::{ DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, DownloadRemoteLayersTaskState, LayerMapInfo, LayerResidenceStatus, TimelineState, }; +use remote_storage::GenericRemoteStorage; use tokio::sync::{oneshot, watch, Semaphore, TryAcquireError}; use tokio_util::sync::CancellationToken; use tracing::*; @@ -957,6 +958,25 @@ impl Timeline { } } + /// Evict a batch of layers. + /// + /// GenericRemoteStorage reference is required as a witness[^witness_article] for "remote storage is configured." + /// + /// [^witness_article]: https://willcrichton.net/rust-api-type-patterns/witnesses.html + pub async fn evict_layers( + &self, + _: &GenericRemoteStorage, + layers_to_evict: &[Arc], + cancel: CancellationToken, + ) -> anyhow::Result>>> { + let remote_client = self.remote_client.clone().expect( + "GenericRemoteStorage is configured, so timeline must have RemoteTimelineClient", + ); + + self.evict_layer_batch(&remote_client, layers_to_evict, cancel) + .await + } + /// Evict multiple layers at once, continuing through errors. /// /// Try to evict the given `layers_to_evict` by @@ -994,6 +1014,15 @@ impl Timeline { // now lock out layer removal (compaction, gc, timeline deletion) let layer_removal_guard = self.layer_removal_cs.lock().await; + { + // to avoid racing with detach and delete_timeline + let state = self.current_state(); + anyhow::ensure!( + state == TimelineState::Active, + "timeline is not active but {state:?}" + ); + } + // start the batch update let mut layer_map = self.layers.write().unwrap(); let mut batch_updates = layer_map.batch_update(); @@ -1027,6 +1056,8 @@ impl Timeline { use super::layer_map::Replacement; if local_layer.is_remote_layer() { + // TODO(issue #3851): consider returning an err here instead of false, + // which is the same out the match later return Ok(false); } @@ -4012,6 +4043,67 @@ impl Timeline { } } +pub struct DiskUsageEvictionInfo { + /// Timeline's largest layer (remote or resident) + pub max_layer_size: Option, + /// Timeline's resident layers + pub resident_layers: Vec, +} + +pub struct LocalLayerInfoForDiskUsageEviction { + pub layer: Arc, + pub last_activity_ts: SystemTime, +} + +impl std::fmt::Debug for LocalLayerInfoForDiskUsageEviction { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + // format the tv_sec, tv_nsec into rfc3339 in case someone is looking at it + // having to allocate a string to this is bad, but it will rarely be formatted + let ts = chrono::DateTime::::from(self.last_activity_ts); + let ts = ts.to_rfc3339_opts(chrono::SecondsFormat::Nanos, true); + f.debug_struct("LocalLayerInfoForDiskUsageEviction") + .field("layer", &self.layer) + .field("last_activity", &ts) + .finish() + } +} + +impl LocalLayerInfoForDiskUsageEviction { + pub fn file_size(&self) -> u64 { + self.layer.file_size() + } +} + +impl Timeline { + pub(crate) fn get_local_layers_for_disk_usage_eviction(&self) -> DiskUsageEvictionInfo { + let layers = self.layers.read().unwrap(); + + let mut max_layer_size: Option = None; + let mut resident_layers = Vec::new(); + + for l in layers.iter_historic_layers() { + let file_size = l.file_size(); + max_layer_size = max_layer_size.map_or(Some(file_size), |m| Some(m.max(file_size))); + + if l.is_remote_layer() { + continue; + } + + let last_activity_ts = l.access_stats().latest_activity(); + + resident_layers.push(LocalLayerInfoForDiskUsageEviction { + layer: l, + last_activity_ts, + }); + } + + DiskUsageEvictionInfo { + max_layer_size, + resident_layers, + } + } +} + type TraversalPathItem = ( ValueReconstructResult, Lsn, diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index 107cd89b90..cf799a8808 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -20,7 +20,6 @@ use std::{ time::{Duration, SystemTime}, }; -use either::Either; use tokio::time::Instant; use tokio_util::sync::CancellationToken; use tracing::{debug, error, info, instrument, warn}; @@ -185,13 +184,7 @@ impl Timeline { if hist_layer.is_remote_layer() { continue; } - let last_activity_ts = match hist_layer - .access_stats() - .most_recent_access_or_residence_event() - { - Either::Left(mra) => mra.when, - Either::Right(re) => re.timestamp, - }; + let last_activity_ts = hist_layer.access_stats().latest_activity(); let no_activity_for = match now.duration_since(last_activity_ts) { Ok(d) => d, Err(_e) => { diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 9929d3e66b..a232bf8b6d 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1220,6 +1220,28 @@ class PageserverHttpClient(requests.Session): self.verbose_error(res) return TenantConfig.from_json(res.json()) + def set_tenant_config(self, tenant_id: TenantId, config: dict[str, Any]): + assert "tenant_id" not in config.keys() + res = self.put( + f"http://localhost:{self.port}/v1/tenant/config", + json={**config, "tenant_id": str(tenant_id)}, + ) + self.verbose_error(res) + + def patch_tenant_config_client_side( + self, + tenant_id: TenantId, + inserts: Optional[Dict[str, Any]] = None, + removes: Optional[List[str]] = None, + ): + current = self.tenant_config(tenant_id).tenant_specific_overrides + if inserts is not None: + current.update(inserts) + if removes is not None: + for key in removes: + del current[key] + self.set_tenant_config(tenant_id, current) + def tenant_size(self, tenant_id: TenantId) -> int: return self.tenant_size_and_modelinputs(tenant_id)[0] @@ -1536,6 +1558,18 @@ class PageserverHttpClient(requests.Session): for layer in info.historic_layers: self.evict_layer(tenant_id, timeline_id, layer.layer_file_name) + def disk_usage_eviction_run(self, request: dict[str, Any]): + res = self.put( + f"http://localhost:{self.port}/v1/disk_usage_eviction/run", + json=request, + ) + self.verbose_error(res) + return res.json() + + def tenant_break(self, tenant_id: TenantId): + res = self.put(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/break") + self.verbose_error(res) + @dataclass class TenantConfig: diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py new file mode 100644 index 0000000000..6ed09734fe --- /dev/null +++ b/test_runner/regress/test_disk_usage_eviction.py @@ -0,0 +1,541 @@ +import shutil +import time +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, Tuple + +import pytest +import toml +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + LocalFsStorage, + NeonEnv, + NeonEnvBuilder, + PageserverHttpClient, + PgBin, + RemoteStorageKind, + wait_for_last_flush_lsn, + wait_for_upload_queue_empty, + wait_until, +) +from fixtures.types import Lsn, TenantId, TimelineId + +GLOBAL_LRU_LOG_LINE = "tenant_min_resident_size-respecting LRU would not relieve pressure, evicting more following global LRU policy" + + +@pytest.mark.parametrize("config_level_override", [None, 400]) +def test_min_resident_size_override_handling( + neon_env_builder: NeonEnvBuilder, config_level_override: int +): + env = neon_env_builder.init_start() + ps_http = env.pageserver.http_client() + + def assert_config(tenant_id, expect_override, expect_effective): + config = ps_http.tenant_config(tenant_id) + assert config.tenant_specific_overrides.get("min_resident_size_override") == expect_override + assert config.effective_config.get("min_resident_size_override") == expect_effective + + def assert_overrides(tenant_id, default_tenant_conf_value): + ps_http.set_tenant_config(tenant_id, {"min_resident_size_override": 200}) + assert_config(tenant_id, 200, 200) + + ps_http.set_tenant_config(tenant_id, {"min_resident_size_override": 0}) + assert_config(tenant_id, 0, 0) + + ps_http.set_tenant_config(tenant_id, {}) + assert_config(tenant_id, None, default_tenant_conf_value) + + env.pageserver.stop() + if config_level_override is not None: + env.pageserver.start( + overrides=( + "--pageserver-config-override=tenant_config={ min_resident_size_override = " + + str(config_level_override) + + " }", + ) + ) + else: + env.pageserver.start() + + tenant_id, _ = env.neon_cli.create_tenant() + assert_overrides(tenant_id, config_level_override) + + # Also ensure that specifying the paramter to create_tenant works, in addition to http-level recconfig. + tenant_id, _ = env.neon_cli.create_tenant(conf={"min_resident_size_override": "100"}) + assert_config(tenant_id, 100, 100) + ps_http.set_tenant_config(tenant_id, {}) + assert_config(tenant_id, None, config_level_override) + + +@dataclass +class EvictionEnv: + timelines: list[Tuple[TenantId, TimelineId]] + neon_env: NeonEnv + pg_bin: PgBin + pageserver_http: PageserverHttpClient + layer_size: int + pgbench_init_lsns: Dict[TenantId, Lsn] + + def timelines_du(self) -> Tuple[int, int, int]: + return poor_mans_du(self.neon_env, [(tid, tlid) for tid, tlid in self.timelines]) + + def du_by_timeline(self) -> Dict[Tuple[TenantId, TimelineId], int]: + return { + (tid, tlid): poor_mans_du(self.neon_env, [(tid, tlid)])[0] + for tid, tlid in self.timelines + } + + def warm_up_tenant(self, tenant_id: TenantId): + """ + Start a read-only compute at the LSN after pgbench -i, and run pgbench -S against it. + This assumes that the tenant is still at the state after pbench -i. + """ + lsn = self.pgbench_init_lsns[tenant_id] + with self.neon_env.postgres.create_start("main", tenant_id=tenant_id, lsn=lsn) as pg: + self.pg_bin.run(["pgbench", "-S", pg.connstr()]) + + def pageserver_start_with_disk_usage_eviction( + self, period, max_usage_pct, min_avail_bytes, mock_behavior + ): + disk_usage_config = { + "period": period, + "max_usage_pct": max_usage_pct, + "min_avail_bytes": min_avail_bytes, + "mock_statvfs": mock_behavior, + } + + enc = toml.TomlEncoder() + + self.neon_env.pageserver.start( + overrides=( + "--pageserver-config-override=disk_usage_based_eviction=" + + enc.dump_inline_table(disk_usage_config).replace("\n", " "), + ), + ) + + def statvfs_called(): + assert self.neon_env.pageserver.log_contains(".*running mocked statvfs.*") + + wait_until(10, 1, statvfs_called) + + +@pytest.fixture +def eviction_env(request, neon_env_builder: NeonEnvBuilder, pg_bin: PgBin) -> EvictionEnv: + """ + Creates two tenants, one somewhat larger than the other. + """ + + log.info(f"setting up eviction_env for test {request.node.name}") + + neon_env_builder.enable_remote_storage(RemoteStorageKind.LOCAL_FS, f"{request.node.name}") + + env = neon_env_builder.init_start() + pageserver_http = env.pageserver.http_client() + + # allow because we are invoking this manually; we always warn on executing disk based eviction + env.pageserver.allowed_errors.append(r".* running disk usage based eviction due to pressure.*") + + # remove the initial tenant + ## why wait for upload queue? => https://github.com/neondatabase/neon/issues/3865 + assert env.initial_timeline + wait_for_upload_queue_empty(env.pageserver, env.initial_tenant, env.initial_timeline) + pageserver_http.tenant_detach(env.initial_tenant) + assert isinstance(env.remote_storage, LocalFsStorage) + tenant_remote_storage = env.remote_storage.root / "tenants" / str(env.initial_tenant) + assert tenant_remote_storage.is_dir() + shutil.rmtree(tenant_remote_storage) + env.initial_tenant = TenantId("0" * 32) + env.initial_timeline = None + + # Choose small layer_size so that we can use low pgbench_scales and still get a large count of layers. + # Large count of layers and small layer size is good for testing because it makes evictions predictable. + # Predictable in the sense that many layer evictions will be required to reach the eviction target, because + # each eviction only makes small progress. That means little overshoot, and thereby stable asserts. + pgbench_scales = [4, 6] + layer_size = 5 * 1024**2 + + pgbench_init_lsns = {} + + timelines = [] + for scale in pgbench_scales: + tenant_id, timeline_id = env.neon_cli.create_tenant( + conf={ + "gc_period": "0s", + "compaction_period": "0s", + "checkpoint_distance": f"{layer_size}", + "image_creation_threshold": "100", + "compaction_target_size": f"{layer_size}", + } + ) + + with env.postgres.create_start("main", tenant_id=tenant_id) as pg: + pg_bin.run(["pgbench", "-i", f"-s{scale}", pg.connstr()]) + wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id) + + timelines.append((tenant_id, timeline_id)) + + # stop the safekeepers to avoid on-demand downloads caused by + # initial logical size calculation triggered by walreceiver connection status + # when we restart the pageserver process in any of the tests + env.neon_cli.safekeeper_stop() + + # after stopping the safekeepers, we know that no new WAL will be coming in + for tenant_id, timeline_id in timelines: + pageserver_http.timeline_checkpoint(tenant_id, timeline_id) + wait_for_upload_queue_empty(env.pageserver, tenant_id, timeline_id) + tl_info = pageserver_http.timeline_detail(tenant_id, timeline_id) + assert tl_info["last_record_lsn"] == tl_info["disk_consistent_lsn"] + assert tl_info["disk_consistent_lsn"] == tl_info["remote_consistent_lsn"] + pgbench_init_lsns[tenant_id] = Lsn(tl_info["last_record_lsn"]) + + layers = pageserver_http.layer_map_info(tenant_id, timeline_id) + log.info(f"{layers}") + assert ( + len(layers.historic_layers) >= 10 + ), "evictions happen at layer granularity, but we often assert at byte-granularity" + + eviction_env = EvictionEnv( + timelines=timelines, + neon_env=env, + pageserver_http=pageserver_http, + layer_size=layer_size, + pg_bin=pg_bin, + pgbench_init_lsns=pgbench_init_lsns, + ) + + return eviction_env + + +def test_broken_tenants_are_skipped(eviction_env: EvictionEnv): + env = eviction_env + + env.neon_env.pageserver.allowed_errors.append( + r".* Changing Active tenant to Broken state, reason: broken from test" + ) + broken_tenant_id, broken_timeline_id = env.timelines[0] + env.pageserver_http.tenant_break(broken_tenant_id) + + healthy_tenant_id, healthy_timeline_id = env.timelines[1] + + broken_size_pre, _, _ = poor_mans_du(env.neon_env, [(broken_tenant_id, broken_timeline_id)]) + healthy_size_pre, _, _ = poor_mans_du(env.neon_env, [(healthy_tenant_id, healthy_timeline_id)]) + + # try to evict everything, then validate that broken tenant wasn't touched + target = broken_size_pre + healthy_size_pre + + response = env.pageserver_http.disk_usage_eviction_run({"evict_bytes": target}) + log.info(f"{response}") + + broken_size_post, _, _ = poor_mans_du(env.neon_env, [(broken_tenant_id, broken_timeline_id)]) + healthy_size_post, _, _ = poor_mans_du(env.neon_env, [(healthy_tenant_id, healthy_timeline_id)]) + + assert broken_size_pre == broken_size_post, "broken tenant should not be touched" + assert healthy_size_post < healthy_size_pre + assert healthy_size_post == 0 + env.neon_env.pageserver.allowed_errors.append(".*" + GLOBAL_LRU_LOG_LINE) + + +def test_pageserver_evicts_until_pressure_is_relieved(eviction_env: EvictionEnv): + """ + Basic test to ensure that we evict enough to relieve pressure. + """ + env = eviction_env + pageserver_http = env.pageserver_http + + (total_on_disk, _, _) = env.timelines_du() + + target = total_on_disk // 2 + + response = pageserver_http.disk_usage_eviction_run({"evict_bytes": target}) + log.info(f"{response}") + + (later_total_on_disk, _, _) = env.timelines_du() + + actual_change = total_on_disk - later_total_on_disk + + assert 0 <= actual_change, "nothing can load layers during this test" + assert actual_change >= target, "must evict more than half" + assert ( + response["Finished"]["assumed"]["projected_after"]["freed_bytes"] >= actual_change + ), "report accurately evicted bytes" + assert response["Finished"]["assumed"]["failed"]["count"] == 0, "zero failures expected" + + +def test_pageserver_respects_overridden_resident_size(eviction_env: EvictionEnv): + """ + Override tenant min resident and ensure that it will be respected by eviction. + """ + env = eviction_env + ps_http = env.pageserver_http + + (total_on_disk, _, _) = env.timelines_du() + du_by_timeline = env.du_by_timeline() + log.info("du_by_timeline: %s", du_by_timeline) + + assert len(du_by_timeline) == 2, "this test assumes two tenants" + large_tenant = max(du_by_timeline, key=du_by_timeline.__getitem__) + small_tenant = min(du_by_timeline, key=du_by_timeline.__getitem__) + assert du_by_timeline[large_tenant] > du_by_timeline[small_tenant] + assert ( + du_by_timeline[large_tenant] - du_by_timeline[small_tenant] > 5 * env.layer_size + ), "ensure this test will do more than 1 eviction" + + # Give the larger tenant a haircut while preventing the smaller tenant from getting one. + # To prevent the smaller from getting a haircut, we set min_resident_size to its current size. + # To ensure the larger tenant is getting a haircut, any non-zero `target` will do. + min_resident_size = du_by_timeline[small_tenant] + target = 1 + assert ( + du_by_timeline[large_tenant] > min_resident_size + ), "ensure the larger tenant will get a haircut" + ps_http.patch_tenant_config_client_side( + small_tenant[0], {"min_resident_size_override": min_resident_size} + ) + ps_http.patch_tenant_config_client_side( + large_tenant[0], {"min_resident_size_override": min_resident_size} + ) + + # Make the large tenant more-recently used. An incorrect implemention would try to evict + # the smaller tenant completely first, before turning to the larger tenant, + # since the smaller tenant's layers are least-recently-used. + env.warm_up_tenant(large_tenant[0]) + + # do one run + response = ps_http.disk_usage_eviction_run({"evict_bytes": target}) + log.info(f"{response}") + + time.sleep(1) # give log time to flush + assert not env.neon_env.pageserver.log_contains( + GLOBAL_LRU_LOG_LINE, + ), "this test is pointless if it fell back to global LRU" + + (later_total_on_disk, _, _) = env.timelines_du() + later_du_by_timeline = env.du_by_timeline() + log.info("later_du_by_timeline: %s", later_du_by_timeline) + + actual_change = total_on_disk - later_total_on_disk + assert 0 <= actual_change, "nothing can load layers during this test" + assert actual_change >= target, "eviction must always evict more than target" + assert ( + response["Finished"]["assumed"]["projected_after"]["freed_bytes"] >= actual_change + ), "report accurately evicted bytes" + assert response["Finished"]["assumed"]["failed"]["count"] == 0, "zero failures expected" + + assert ( + later_du_by_timeline[small_tenant] == du_by_timeline[small_tenant] + ), "small tenant sees no haircut" + assert ( + later_du_by_timeline[large_tenant] < du_by_timeline[large_tenant] + ), "large tenant gets a haircut" + assert du_by_timeline[large_tenant] - later_du_by_timeline[large_tenant] >= target + + +def test_pageserver_falls_back_to_global_lru(eviction_env: EvictionEnv): + """ + If we can't relieve pressure using tenant_min_resident_size-respecting eviction, + we should continue to evict layers following global LRU. + """ + env = eviction_env + ps_http = env.pageserver_http + + (total_on_disk, _, _) = env.timelines_du() + target = total_on_disk + + response = ps_http.disk_usage_eviction_run({"evict_bytes": target}) + log.info(f"{response}") + + (later_total_on_disk, _, _) = env.timelines_du() + actual_change = total_on_disk - later_total_on_disk + assert 0 <= actual_change, "nothing can load layers during this test" + assert actual_change >= target, "eviction must always evict more than target" + + time.sleep(1) # give log time to flush + assert env.neon_env.pageserver.log_contains(GLOBAL_LRU_LOG_LINE) + env.neon_env.pageserver.allowed_errors.append(".*" + GLOBAL_LRU_LOG_LINE) + + +def test_partial_evict_tenant(eviction_env: EvictionEnv): + """ + Warm up a tenant, then build up pressure to cause in evictions in both. + We expect + * the default min resident size to be respect (largest layer file size) + * the warmed-up tenants layers above min resident size to be evicted after the cold tenant's. + """ + env = eviction_env + ps_http = env.pageserver_http + + (total_on_disk, _, _) = env.timelines_du() + du_by_timeline = env.du_by_timeline() + + # pick any tenant + [our_tenant, other_tenant] = list(du_by_timeline.keys()) + (tenant_id, timeline_id) = our_tenant + + # make our tenant more recently used than the other one + env.warm_up_tenant(tenant_id) + + # Build up enough pressure to require evictions from both tenants, + # but not enough to fall into global LRU. + # So, set target to all occipied space, except 2*env.layer_size per tenant + target = ( + du_by_timeline[other_tenant] + (du_by_timeline[our_tenant] // 2) - 2 * 2 * env.layer_size + ) + response = ps_http.disk_usage_eviction_run({"evict_bytes": target}) + log.info(f"{response}") + + (later_total_on_disk, _, _) = env.timelines_du() + actual_change = total_on_disk - later_total_on_disk + assert 0 <= actual_change, "nothing can load layers during this test" + assert actual_change >= target, "eviction must always evict more than target" + + later_du_by_timeline = env.du_by_timeline() + for tenant, later_tenant_usage in later_du_by_timeline.items(): + assert ( + later_tenant_usage < du_by_timeline[tenant] + ), "all tenants should have lost some layers" + + assert ( + later_du_by_timeline[our_tenant] > 0.5 * du_by_timeline[our_tenant] + ), "our warmed up tenant should be at about half capacity, part 1" + assert ( + # We don't know exactly whether the cold tenant needs 2 or just 1 env.layer_size wiggle room. + # So, check for up to 3 here. + later_du_by_timeline[our_tenant] + < 0.5 * du_by_timeline[our_tenant] + 3 * env.layer_size + ), "our warmed up tenant should be at about half capacity, part 2" + assert ( + later_du_by_timeline[other_tenant] < 2 * env.layer_size + ), "the other tenant should be evicted to is min_resident_size, i.e., max layer file size" + + +def poor_mans_du( + env: NeonEnv, timelines: list[Tuple[TenantId, TimelineId]] +) -> Tuple[int, int, int]: + """ + Disk usage, largest, smallest layer for layer files over the given (tenant, timeline) tuples; + this could be done over layers endpoint just as well. + """ + total_on_disk = 0 + largest_layer = 0 + smallest_layer = None + for tenant_id, timeline_id in timelines: + dir = Path(env.repo_dir) / "tenants" / str(tenant_id) / "timelines" / str(timeline_id) + assert dir.exists(), f"timeline dir does not exist: {dir}" + sum = 0 + for file in dir.iterdir(): + if "__" not in file.name: + continue + size = file.stat().st_size + sum += size + largest_layer = max(largest_layer, size) + if smallest_layer: + smallest_layer = min(smallest_layer, size) + else: + smallest_layer = size + log.info(f"{tenant_id}/{timeline_id} => {file.name} {size}") + + log.info(f"{tenant_id}/{timeline_id}: sum {sum}") + total_on_disk += sum + + assert smallest_layer is not None or total_on_disk == 0 and largest_layer == 0 + return (total_on_disk, largest_layer, smallest_layer or 0) + + +def test_statvfs_error_handling(eviction_env: EvictionEnv): + """ + We should log an error that statvfs fails. + """ + env = eviction_env + env.neon_env.pageserver.stop() + env.pageserver_start_with_disk_usage_eviction( + period="1s", + max_usage_pct=90, + min_avail_bytes=0, + mock_behavior={ + "type": "Failure", + "mocked_error": "EIO", + }, + ) + + assert env.neon_env.pageserver.log_contains(".*statvfs failed.*EIO") + env.neon_env.pageserver.allowed_errors.append(".*statvfs failed.*EIO") + + +def test_statvfs_pressure_usage(eviction_env: EvictionEnv): + """ + If statvfs data shows 100% usage, the eviction task will drive it down to + the configured max_usage_pct. + """ + env = eviction_env + + env.neon_env.pageserver.stop() + + # make it seem like we're at 100% utilization by setting total bytes to the used bytes + total_size, _, _ = env.timelines_du() + blocksize = 512 + total_blocks = (total_size + (blocksize - 1)) // blocksize + + env.pageserver_start_with_disk_usage_eviction( + period="1s", + max_usage_pct=33, + min_avail_bytes=0, + mock_behavior={ + "type": "Success", + "blocksize": blocksize, + "total_blocks": total_blocks, + # Only count layer files towards used bytes in the mock_statvfs. + # This avoids accounting for metadata files & tenant conf in the tests. + "name_filter": ".*__.*", + }, + ) + + def relieved_log_message(): + assert env.neon_env.pageserver.log_contains(".*disk usage pressure relieved") + + wait_until(10, 1, relieved_log_message) + + post_eviction_total_size, _, _ = env.timelines_du() + + assert post_eviction_total_size <= 0.33 * total_size, "we requested max 33% usage" + + +def test_statvfs_pressure_min_avail_bytes(eviction_env: EvictionEnv): + """ + If statvfs data shows 100% usage, the eviction task will drive it down to + at least the configured min_avail_bytes. + """ + env = eviction_env + + env.neon_env.pageserver.stop() + + # make it seem like we're at 100% utilization by setting total bytes to the used bytes + total_size, _, _ = env.timelines_du() + blocksize = 512 + total_blocks = (total_size + (blocksize - 1)) // blocksize + + min_avail_bytes = total_size // 3 + + env.pageserver_start_with_disk_usage_eviction( + period="1s", + max_usage_pct=100, + min_avail_bytes=min_avail_bytes, + mock_behavior={ + "type": "Success", + "blocksize": blocksize, + "total_blocks": total_blocks, + # Only count layer files towards used bytes in the mock_statvfs. + # This avoids accounting for metadata files & tenant conf in the tests. + "name_filter": ".*__.*", + }, + ) + + def relieved_log_message(): + assert env.neon_env.pageserver.log_contains(".*disk usage pressure relieved") + + wait_until(10, 1, relieved_log_message) + + post_eviction_total_size, _, _ = env.timelines_du() + + assert ( + total_size - post_eviction_total_size >= min_avail_bytes + ), "we requested at least min_avail_bytes worth of free space" From 271f6a6e99a46667486d7a4450abf75a6b8120c2 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Tue, 28 Mar 2023 11:58:51 +0400 Subject: [PATCH 16/25] Always sync-safekeepers in neon_local on compute start. Instead of checking neon.safekeepers GUC value in existing pg node data dir, just always run sync-safekeepers when safekeepers are configured. Without this change, creation of new compute didn't run it. That's ok for new timeline/branch (it doesn't return anything useful anyway, and LSN is known by pageserver), but restart of compute for existing timeline bore the risk of getting basebackup not on the latest LSN, i.e. basically broken -- it might not have prev_lsn, and even if it had, walproposer would complain anyway. fixes https://github.com/neondatabase/neon/issues/2963 --- control_plane/src/compute.rs | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/control_plane/src/compute.rs b/control_plane/src/compute.rs index ee504bfaa6..bc81107706 100644 --- a/control_plane/src/compute.rs +++ b/control_plane/src/compute.rs @@ -90,7 +90,6 @@ impl ComputeControlPlane { timeline_id, lsn, tenant_id, - uses_wal_proposer: false, pg_version, }); @@ -115,7 +114,6 @@ pub struct PostgresNode { pub timeline_id: TimelineId, pub lsn: Option, // if it's a read-only node. None for primary pub tenant_id: TenantId, - uses_wal_proposer: bool, pg_version: u32, } @@ -149,7 +147,6 @@ impl PostgresNode { let port: u16 = conf.parse_field("port", &context)?; let timeline_id: TimelineId = conf.parse_field("neon.timeline_id", &context)?; let tenant_id: TenantId = conf.parse_field("neon.tenant_id", &context)?; - let uses_wal_proposer = conf.get("neon.safekeepers").is_some(); // Read postgres version from PG_VERSION file to determine which postgres version binary to use. // If it doesn't exist, assume broken data directory and use default pg version. @@ -172,7 +169,6 @@ impl PostgresNode { timeline_id, lsn: recovery_target_lsn, tenant_id, - uses_wal_proposer, pg_version, }) } @@ -364,7 +360,7 @@ impl PostgresNode { fn load_basebackup(&self, auth_token: &Option) -> Result<()> { let backup_lsn = if let Some(lsn) = self.lsn { Some(lsn) - } else if self.uses_wal_proposer { + } else if !self.env.safekeepers.is_empty() { // LSN 0 means that it is bootstrap and we need to download just // latest data from the pageserver. That is a bit clumsy but whole bootstrap // procedure evolves quite actively right now, so let's think about it again From d0711d08966542147596719ecec5eaf53c1f1b44 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Fri, 31 Mar 2023 16:05:15 +0300 Subject: [PATCH 17/25] build: fix git perms for deploy job (#3921) copy pasted from `build-neon` job. it is interesting that this is only needed by `build-neon` and `deploy`. Fixes: https://github.com/neondatabase/neon/actions/runs/4568077915/jobs/8070960178 which seems to have been going for a while. --- .github/workflows/build_and_test.yml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 52e1d94e9b..8482341b0c 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -898,6 +898,16 @@ jobs: needs: [ push-docker-hub, tag, regress-tests ] if: ( github.ref_name == 'main' || github.ref_name == 'release' ) && github.event_name != 'workflow_dispatch' steps: + - name: Fix git ownership + run: | + # Workaround for `fatal: detected dubious ownership in repository at ...` + # + # Use both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers + # Ref https://github.com/actions/checkout/issues/785 + # + git config --global --add safe.directory ${{ github.workspace }} + git config --global --add safe.directory ${GITHUB_WORKSPACE} + - name: Checkout uses: actions/checkout@v3 with: From 22f9ea5fe25b9a0782bc8c6ddca6ebdba89c8d73 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Fri, 31 Mar 2023 16:11:34 +0300 Subject: [PATCH 18/25] Remind people to clean up merge commit message in PR template (#3920) --- .github/pull_request_template.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index 3f32b80ca8..816c5ee711 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -3,8 +3,12 @@ ## Issue ticket number and link ## Checklist before requesting a review + - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. +## Checklist before merging + +- [ ] Do not forget to reformat commit message to not include the above checklist From d2aa31f0ce687c3b2bc82c5f3db67dccbd5083cf Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 31 Mar 2023 18:25:53 +0200 Subject: [PATCH 19/25] fix pageserver_evictions_with_low_residence_duration metric (#3925) It was doing the comparison in the wrong way. --- pageserver/src/metrics.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 6cb245aed7..1f31e5a8fb 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -257,7 +257,7 @@ impl EvictionsWithLowResidenceDuration { } pub fn observe(&self, observed_value: Duration) { - if self.threshold < observed_value { + if observed_value < self.threshold { self.counter .as_ref() .expect("nobody calls this function after `remove_from_vec`") From 75ffe34b1734c1415b6cc6c998bc8c099409ad58 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Fri, 31 Mar 2023 19:45:59 +0100 Subject: [PATCH 20/25] check-macos-build: fix cache key (#3926) We don't have `${{ matrix.build_type }}` there, so it gets resolved to an empty substring and looks like this [`v1-macOS--pg-f8a650e49b06d39ad131b860117504044b01f312-dcccd010ff851b9f72bb451f28243fa3a341f07028034bbb46ea802413b36d80`](https://github.com/neondatabase/neon/actions/runs/4575422427/jobs/8078231907#step:26:2) --- .github/workflows/neon_extra_builds.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index 2ae517e5e7..ef4c293e31 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -53,14 +53,14 @@ jobs: uses: actions/cache@v3 with: path: pg_install/v14 - key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} + key: v1-${{ runner.os }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - name: Cache postgres v15 build id: cache_pg_15 uses: actions/cache@v3 with: path: pg_install/v15 - key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} + key: v1-${{ runner.os }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - name: Set extra env for macOS run: | From 814abd9f844561e4201f2b1c781867381f388a8a Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Sun, 2 Apr 2023 11:32:27 +0300 Subject: [PATCH 21/25] Switch to safekeeper in the same AZ (#3883) Add a condition to switch walreceiver connection to safekeeper that is located in the same availability zone. Switch happens when commit_lsn of a candidate is not less than commit_lsn from the active connection. This condition is expected not to trigger instantly, because commit_lsn of a current connection is usually greater than commit_lsn of updates from the broker. That means that if WAL is written continuously, switch can take a lot of time, but it should happen eventually. Now protoc 3.15+ is required for building neon. Fixes https://github.com/neondatabase/neon/issues/3200 --- README.md | 2 + .../walreceiver/connection_manager.rs | 123 +++++++++++++++--- safekeeper/src/http/routes.rs | 1 + safekeeper/src/timeline.rs | 1 + storage_broker/benches/rps.rs | 1 + storage_broker/proto/broker.proto | 4 +- storage_broker/src/bin/storage_broker.rs | 1 + 7 files changed, 117 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index 43f3e3a02b..55df67f6c7 100644 --- a/README.md +++ b/README.md @@ -40,6 +40,8 @@ pacman -S base-devel readline zlib libseccomp openssl clang \ postgresql-libs cmake postgresql protobuf ``` +Building Neon requires 3.15+ version of `protoc` (protobuf-compiler). If your distribution provides an older version, you can install a newer version from [here](https://github.com/protocolbuffers/protobuf/releases). + 2. [Install Rust](https://www.rust-lang.org/tools/install) ``` # recommended approach from https://www.rust-lang.org/tools/install diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs index 0c770136db..de07676ffe 100644 --- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs +++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs @@ -237,11 +237,7 @@ async fn connection_manager_loop_step( if let Some(new_candidate) = walreceiver_state.next_connection_candidate() { info!("Switching to new connection candidate: {new_candidate:?}"); walreceiver_state - .change_connection( - new_candidate.safekeeper_id, - new_candidate.wal_source_connconf, - ctx, - ) + .change_connection(new_candidate, ctx) .await } } @@ -346,6 +342,8 @@ struct WalConnection { started_at: NaiveDateTime, /// Current safekeeper pageserver is connected to for WAL streaming. sk_id: NodeId, + /// Availability zone of the safekeeper. + availability_zone: Option, /// Status of the connection. status: WalConnectionStatus, /// WAL streaming task handle. @@ -405,12 +403,7 @@ impl WalreceiverState { } /// Shuts down the current connection (if any) and immediately starts another one with the given connection string. - async fn change_connection( - &mut self, - new_sk_id: NodeId, - new_wal_source_connconf: PgConnectionConfig, - ctx: &RequestContext, - ) { + async fn change_connection(&mut self, new_sk: NewWalConnectionCandidate, ctx: &RequestContext) { self.drop_old_connection(true).await; let id = self.id; @@ -424,7 +417,7 @@ impl WalreceiverState { async move { super::walreceiver_connection::handle_walreceiver_connection( timeline, - new_wal_source_connconf, + new_sk.wal_source_connconf, events_sender, cancellation, connect_timeout, @@ -433,13 +426,16 @@ impl WalreceiverState { .await .context("walreceiver connection handling failure") } - .instrument(info_span!("walreceiver_connection", id = %id, node_id = %new_sk_id)) + .instrument( + info_span!("walreceiver_connection", id = %id, node_id = %new_sk.safekeeper_id), + ) }); let now = Utc::now().naive_utc(); self.wal_connection = Some(WalConnection { started_at: now, - sk_id: new_sk_id, + sk_id: new_sk.safekeeper_id, + availability_zone: new_sk.availability_zone, status: WalConnectionStatus { is_connected: false, has_processed_wal: false, @@ -546,6 +542,7 @@ impl WalreceiverState { /// * if connected safekeeper is not present, pick the candidate /// * if we haven't received any updates for some time, pick the candidate /// * if the candidate commit_lsn is much higher than the current one, pick the candidate + /// * if the candidate commit_lsn is same, but candidate is located in the same AZ as the pageserver, pick the candidate /// * if connected safekeeper stopped sending us new WAL which is available on other safekeeper, pick the candidate /// /// This way we ensure to keep up with the most up-to-date safekeeper and don't try to jump from one safekeeper to another too frequently. @@ -559,6 +556,7 @@ impl WalreceiverState { let (new_sk_id, new_safekeeper_broker_data, new_wal_source_connconf) = self.select_connection_candidate(Some(connected_sk_node))?; + let new_availability_zone = new_safekeeper_broker_data.availability_zone.clone(); let now = Utc::now().naive_utc(); if let Ok(latest_interaciton) = @@ -569,6 +567,7 @@ impl WalreceiverState { return Some(NewWalConnectionCandidate { safekeeper_id: new_sk_id, wal_source_connconf: new_wal_source_connconf, + availability_zone: new_availability_zone, reason: ReconnectReason::NoKeepAlives { last_keep_alive: Some( existing_wal_connection.status.latest_connection_update, @@ -594,6 +593,7 @@ impl WalreceiverState { return Some(NewWalConnectionCandidate { safekeeper_id: new_sk_id, wal_source_connconf: new_wal_source_connconf, + availability_zone: new_availability_zone, reason: ReconnectReason::LaggingWal { current_commit_lsn, new_commit_lsn, @@ -601,6 +601,20 @@ impl WalreceiverState { }, }); } + // If we have a candidate with the same commit_lsn as the current one, which is in the same AZ as pageserver, + // and the current one is not, switch to the new one. + if self.availability_zone.is_some() + && existing_wal_connection.availability_zone + != self.availability_zone + && self.availability_zone == new_availability_zone + { + return Some(NewWalConnectionCandidate { + safekeeper_id: new_sk_id, + availability_zone: new_availability_zone, + wal_source_connconf: new_wal_source_connconf, + reason: ReconnectReason::SwitchAvailabilityZone, + }); + } } None => debug!( "Best SK candidate has its commit_lsn behind connected SK's commit_lsn" @@ -668,6 +682,7 @@ impl WalreceiverState { return Some(NewWalConnectionCandidate { safekeeper_id: new_sk_id, wal_source_connconf: new_wal_source_connconf, + availability_zone: new_availability_zone, reason: ReconnectReason::NoWalTimeout { current_lsn, current_commit_lsn, @@ -686,10 +701,11 @@ impl WalreceiverState { self.wal_connection.as_mut().unwrap().discovered_new_wal = discovered_new_wal; } None => { - let (new_sk_id, _, new_wal_source_connconf) = + let (new_sk_id, new_safekeeper_broker_data, new_wal_source_connconf) = self.select_connection_candidate(None)?; return Some(NewWalConnectionCandidate { safekeeper_id: new_sk_id, + availability_zone: new_safekeeper_broker_data.availability_zone.clone(), wal_source_connconf: new_wal_source_connconf, reason: ReconnectReason::NoExistingConnection, }); @@ -794,6 +810,7 @@ impl WalreceiverState { struct NewWalConnectionCandidate { safekeeper_id: NodeId, wal_source_connconf: PgConnectionConfig, + availability_zone: Option, // This field is used in `derive(Debug)` only. #[allow(dead_code)] reason: ReconnectReason, @@ -808,6 +825,7 @@ enum ReconnectReason { new_commit_lsn: Lsn, threshold: NonZeroU64, }, + SwitchAvailabilityZone, NoWalTimeout { current_lsn: Lsn, current_commit_lsn: Lsn, @@ -873,6 +891,7 @@ mod tests { peer_horizon_lsn: 0, local_start_lsn: 0, safekeeper_connstr: safekeeper_connstr.to_owned(), + availability_zone: None, }, latest_update, } @@ -933,6 +952,7 @@ mod tests { state.wal_connection = Some(WalConnection { started_at: now, sk_id: connected_sk_id, + availability_zone: None, status: connection_status, connection_task: TaskHandle::spawn(move |sender, _| async move { sender @@ -1095,6 +1115,7 @@ mod tests { state.wal_connection = Some(WalConnection { started_at: now, sk_id: connected_sk_id, + availability_zone: None, status: connection_status, connection_task: TaskHandle::spawn(move |sender, _| async move { sender @@ -1160,6 +1181,7 @@ mod tests { state.wal_connection = Some(WalConnection { started_at: now, sk_id: NodeId(1), + availability_zone: None, status: connection_status, connection_task: TaskHandle::spawn(move |sender, _| async move { sender @@ -1222,6 +1244,7 @@ mod tests { state.wal_connection = Some(WalConnection { started_at: now, sk_id: NodeId(1), + availability_zone: None, status: connection_status, connection_task: TaskHandle::spawn(move |_, _| async move { Ok(()) }), discovered_new_wal: Some(NewCommittedWAL { @@ -1289,4 +1312,74 @@ mod tests { availability_zone: None, } } + + #[tokio::test] + async fn switch_to_same_availability_zone() -> anyhow::Result<()> { + // Pageserver and one of safekeepers will be in the same availability zone + // and pageserver should prefer to connect to it. + let test_az = Some("test_az".to_owned()); + + let harness = TenantHarness::create("switch_to_same_availability_zone")?; + let mut state = dummy_state(&harness).await; + state.availability_zone = test_az.clone(); + let current_lsn = Lsn(100_000).align(); + let now = Utc::now().naive_utc(); + + let connected_sk_id = NodeId(0); + + let connection_status = WalConnectionStatus { + is_connected: true, + has_processed_wal: true, + latest_connection_update: now, + latest_wal_update: now, + commit_lsn: Some(current_lsn), + streaming_lsn: Some(current_lsn), + }; + + state.wal_connection = Some(WalConnection { + started_at: now, + sk_id: connected_sk_id, + availability_zone: None, + status: connection_status, + connection_task: TaskHandle::spawn(move |sender, _| async move { + sender + .send(TaskStateUpdate::Progress(connection_status)) + .ok(); + Ok(()) + }), + discovered_new_wal: None, + }); + + // We have another safekeeper with the same commit_lsn, and it have the same availability zone as + // the current pageserver. + let mut same_az_sk = dummy_broker_sk_timeline(current_lsn.0, "same_az", now); + same_az_sk.timeline.availability_zone = test_az.clone(); + + state.wal_stream_candidates = HashMap::from([ + ( + connected_sk_id, + dummy_broker_sk_timeline(current_lsn.0, DUMMY_SAFEKEEPER_HOST, now), + ), + (NodeId(1), same_az_sk), + ]); + + // We expect that pageserver will switch to the safekeeper in the same availability zone, + // even if it has the same commit_lsn. + let next_candidate = state.next_connection_candidate().expect( + "Expected one candidate selected out of multiple valid data options, but got none", + ); + + assert_eq!(next_candidate.safekeeper_id, NodeId(1)); + assert_eq!( + next_candidate.reason, + ReconnectReason::SwitchAvailabilityZone, + "Should switch to the safekeeper in the same availability zone, if it has the same commit_lsn" + ); + assert_eq!( + next_candidate.wal_source_connconf.host(), + &Host::Domain("same_az".to_owned()) + ); + + Ok(()) + } } diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index 14badebd95..cdec45c148 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -242,6 +242,7 @@ async fn record_safekeeper_info(mut request: Request) -> Result, n_keys: u64) { peer_horizon_lsn: 5, safekeeper_connstr: "zenith-1-sk-1.local:7676".to_owned(), local_start_lsn: 0, + availability_zone: None, }; counter += 1; yield info; diff --git a/storage_broker/proto/broker.proto b/storage_broker/proto/broker.proto index 1a46896d02..4b2de1a8e5 100644 --- a/storage_broker/proto/broker.proto +++ b/storage_broker/proto/broker.proto @@ -36,9 +36,11 @@ message SafekeeperTimelineInfo { uint64 local_start_lsn = 9; // A connection string to use for WAL receiving. string safekeeper_connstr = 10; + // Availability zone of a safekeeper. + optional string availability_zone = 11; } message TenantTimelineId { bytes tenant_id = 1; bytes timeline_id = 2; -} \ No newline at end of file +} diff --git a/storage_broker/src/bin/storage_broker.rs b/storage_broker/src/bin/storage_broker.rs index 57f975b0df..d7ace28426 100644 --- a/storage_broker/src/bin/storage_broker.rs +++ b/storage_broker/src/bin/storage_broker.rs @@ -525,6 +525,7 @@ mod tests { peer_horizon_lsn: 5, safekeeper_connstr: "neon-1-sk-1.local:7676".to_owned(), local_start_lsn: 0, + availability_zone: None, } } From d733bc54b8b2aa904cf0192359c0c7d6f986fe8d Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Wed, 29 Mar 2023 22:02:36 +0400 Subject: [PATCH 22/25] Rename ReplicationFeedback and its fields. This is the the feedback originating from pageserver, so change previous confusing names to s/ReplicationFeedback/PageserverFeedback s/ps_writelsn/last_receive_lsn s/ps_flushlsn/disk_consistent_lsn s/ps_apply_lsn/remote_consistent_lsn I haven't changed on the wire format to keep compatibility. However, understanding of new field names is added to compute, so once all computes receive this patch we can change the wire names as well. Safekeepers/pageservers are deployed roughly at the same time and it is ok to live without feedbacks during the short period, so this is not a problem there. --- libs/pq_proto/src/lib.rs | 90 ++++++++++-------- .../walreceiver/walreceiver_connection.rs | 18 ++-- pgxn/neon/walproposer.c | 94 +++++++++---------- pgxn/neon/walproposer.h | 24 ++--- safekeeper/src/metrics.rs | 25 +++-- safekeeper/src/safekeeper.rs | 8 +- safekeeper/src/send_wal.rs | 8 +- safekeeper/src/timeline.rs | 12 +-- 8 files changed, 142 insertions(+), 137 deletions(-) diff --git a/libs/pq_proto/src/lib.rs b/libs/pq_proto/src/lib.rs index 656c0ff312..a976e19029 100644 --- a/libs/pq_proto/src/lib.rs +++ b/libs/pq_proto/src/lib.rs @@ -936,35 +936,40 @@ impl<'a> BeMessage<'a> { } } -// Neon extension of postgres replication protocol -// See NEON_STATUS_UPDATE_TAG_BYTE +/// Feedback pageserver sends to safekeeper and safekeeper resends to compute. +/// Serialized in custom flexible key/value format. In replication protocol, it +/// is marked with NEON_STATUS_UPDATE_TAG_BYTE to differentiate from postgres +/// Standby status update / Hot standby feedback messages. #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] -pub struct ReplicationFeedback { - // Last known size of the timeline. Used to enforce timeline size limit. +pub struct PageserverFeedback { + /// Last known size of the timeline. Used to enforce timeline size limit. pub current_timeline_size: u64, - // Parts of StandbyStatusUpdate we resend to compute via safekeeper - pub ps_writelsn: u64, - pub ps_applylsn: u64, - pub ps_flushlsn: u64, - pub ps_replytime: SystemTime, + /// LSN last received and ingested by the pageserver. + pub last_received_lsn: u64, + /// LSN up to which data is persisted by the pageserver to its local disc. + pub disk_consistent_lsn: u64, + /// LSN up to which data is persisted by the pageserver on s3; safekeepers + /// consider WAL before it can be removed. + pub remote_consistent_lsn: u64, + pub replytime: SystemTime, } -// NOTE: Do not forget to increment this number when adding new fields to ReplicationFeedback. +// NOTE: Do not forget to increment this number when adding new fields to PageserverFeedback. // Do not remove previously available fields because this might be backwards incompatible. -pub const REPLICATION_FEEDBACK_FIELDS_NUMBER: u8 = 5; +pub const PAGESERVER_FEEDBACK_FIELDS_NUMBER: u8 = 5; -impl ReplicationFeedback { - pub fn empty() -> ReplicationFeedback { - ReplicationFeedback { +impl PageserverFeedback { + pub fn empty() -> PageserverFeedback { + PageserverFeedback { current_timeline_size: 0, - ps_writelsn: 0, - ps_applylsn: 0, - ps_flushlsn: 0, - ps_replytime: SystemTime::now(), + last_received_lsn: 0, + remote_consistent_lsn: 0, + disk_consistent_lsn: 0, + replytime: SystemTime::now(), } } - // Serialize ReplicationFeedback using custom format + // Serialize PageserverFeedback using custom format // to support protocol extensibility. // // Following layout is used: @@ -974,24 +979,26 @@ impl ReplicationFeedback { // null-terminated string - key, // uint32 - value length in bytes // value itself + // + // TODO: change serialized fields names once all computes migrate to rename. pub fn serialize(&self, buf: &mut BytesMut) { - buf.put_u8(REPLICATION_FEEDBACK_FIELDS_NUMBER); // # of keys + buf.put_u8(PAGESERVER_FEEDBACK_FIELDS_NUMBER); // # of keys buf.put_slice(b"current_timeline_size\0"); buf.put_i32(8); buf.put_u64(self.current_timeline_size); buf.put_slice(b"ps_writelsn\0"); buf.put_i32(8); - buf.put_u64(self.ps_writelsn); + buf.put_u64(self.last_received_lsn); buf.put_slice(b"ps_flushlsn\0"); buf.put_i32(8); - buf.put_u64(self.ps_flushlsn); + buf.put_u64(self.disk_consistent_lsn); buf.put_slice(b"ps_applylsn\0"); buf.put_i32(8); - buf.put_u64(self.ps_applylsn); + buf.put_u64(self.remote_consistent_lsn); let timestamp = self - .ps_replytime + .replytime .duration_since(*PG_EPOCH) .expect("failed to serialize pg_replytime earlier than PG_EPOCH") .as_micros() as i64; @@ -1001,9 +1008,10 @@ impl ReplicationFeedback { buf.put_i64(timestamp); } - // Deserialize ReplicationFeedback message - pub fn parse(mut buf: Bytes) -> ReplicationFeedback { - let mut rf = ReplicationFeedback::empty(); + // Deserialize PageserverFeedback message + // TODO: change serialized fields names once all computes migrate to rename. + pub fn parse(mut buf: Bytes) -> PageserverFeedback { + let mut rf = PageserverFeedback::empty(); let nfields = buf.get_u8(); for _ in 0..nfields { let key = read_cstr(&mut buf).unwrap(); @@ -1016,39 +1024,39 @@ impl ReplicationFeedback { b"ps_writelsn" => { let len = buf.get_i32(); assert_eq!(len, 8); - rf.ps_writelsn = buf.get_u64(); + rf.last_received_lsn = buf.get_u64(); } b"ps_flushlsn" => { let len = buf.get_i32(); assert_eq!(len, 8); - rf.ps_flushlsn = buf.get_u64(); + rf.disk_consistent_lsn = buf.get_u64(); } b"ps_applylsn" => { let len = buf.get_i32(); assert_eq!(len, 8); - rf.ps_applylsn = buf.get_u64(); + rf.remote_consistent_lsn = buf.get_u64(); } b"ps_replytime" => { let len = buf.get_i32(); assert_eq!(len, 8); let raw_time = buf.get_i64(); if raw_time > 0 { - rf.ps_replytime = *PG_EPOCH + Duration::from_micros(raw_time as u64); + rf.replytime = *PG_EPOCH + Duration::from_micros(raw_time as u64); } else { - rf.ps_replytime = *PG_EPOCH - Duration::from_micros(-raw_time as u64); + rf.replytime = *PG_EPOCH - Duration::from_micros(-raw_time as u64); } } _ => { let len = buf.get_i32(); warn!( - "ReplicationFeedback parse. unknown key {} of len {len}. Skip it.", + "PageserverFeedback parse. unknown key {} of len {len}. Skip it.", String::from_utf8_lossy(key.as_ref()) ); buf.advance(len as usize); } } } - trace!("ReplicationFeedback parsed is {:?}", rf); + trace!("PageserverFeedback parsed is {:?}", rf); rf } } @@ -1059,33 +1067,33 @@ mod tests { #[test] fn test_replication_feedback_serialization() { - let mut rf = ReplicationFeedback::empty(); + let mut rf = PageserverFeedback::empty(); // Fill rf with some values rf.current_timeline_size = 12345678; // Set rounded time to be able to compare it with deserialized value, // because it is rounded up to microseconds during serialization. - rf.ps_replytime = *PG_EPOCH + Duration::from_secs(100_000_000); + rf.replytime = *PG_EPOCH + Duration::from_secs(100_000_000); let mut data = BytesMut::new(); rf.serialize(&mut data); - let rf_parsed = ReplicationFeedback::parse(data.freeze()); + let rf_parsed = PageserverFeedback::parse(data.freeze()); assert_eq!(rf, rf_parsed); } #[test] fn test_replication_feedback_unknown_key() { - let mut rf = ReplicationFeedback::empty(); + let mut rf = PageserverFeedback::empty(); // Fill rf with some values rf.current_timeline_size = 12345678; // Set rounded time to be able to compare it with deserialized value, // because it is rounded up to microseconds during serialization. - rf.ps_replytime = *PG_EPOCH + Duration::from_secs(100_000_000); + rf.replytime = *PG_EPOCH + Duration::from_secs(100_000_000); let mut data = BytesMut::new(); rf.serialize(&mut data); // Add an extra field to the buffer and adjust number of keys if let Some(first) = data.first_mut() { - *first = REPLICATION_FEEDBACK_FIELDS_NUMBER + 1; + *first = PAGESERVER_FEEDBACK_FIELDS_NUMBER + 1; } data.put_slice(b"new_field_one\0"); @@ -1093,7 +1101,7 @@ mod tests { data.put_u64(42); // Parse serialized data and check that new field is not parsed - let rf_parsed = ReplicationFeedback::parse(data.freeze()); + let rf_parsed = PageserverFeedback::parse(data.freeze()); assert_eq!(rf, rf_parsed); } diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index 9398a7bee9..ea2f2392ea 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -37,7 +37,7 @@ use crate::{ use postgres_backend::is_expected_io_error; use postgres_connection::PgConnectionConfig; use postgres_ffi::waldecoder::WalStreamDecoder; -use pq_proto::ReplicationFeedback; +use pq_proto::PageserverFeedback; use utils::lsn::Lsn; /// Status of the connection. @@ -319,12 +319,12 @@ pub async fn handle_walreceiver_connection( timeline.get_remote_consistent_lsn().unwrap_or(Lsn(0)); // The last LSN we processed. It is not guaranteed to survive pageserver crash. - let write_lsn = u64::from(last_lsn); + let last_received_lsn = u64::from(last_lsn); // `disk_consistent_lsn` is the LSN at which page server guarantees local persistence of all received data - let flush_lsn = u64::from(timeline.get_disk_consistent_lsn()); + let disk_consistent_lsn = u64::from(timeline.get_disk_consistent_lsn()); // The last LSN that is synced to remote storage and is guaranteed to survive pageserver crash // Used by safekeepers to remove WAL preceding `remote_consistent_lsn`. - let apply_lsn = u64::from(timeline_remote_consistent_lsn); + let remote_consistent_lsn = u64::from(timeline_remote_consistent_lsn); let ts = SystemTime::now(); // Update the status about what we just received. This is shown in the mgmt API. @@ -343,12 +343,12 @@ pub async fn handle_walreceiver_connection( let (timeline_logical_size, _) = timeline .get_current_logical_size(&ctx) .context("Status update creation failed to get current logical size")?; - let status_update = ReplicationFeedback { + let status_update = PageserverFeedback { current_timeline_size: timeline_logical_size, - ps_writelsn: write_lsn, - ps_flushlsn: flush_lsn, - ps_applylsn: apply_lsn, - ps_replytime: ts, + last_received_lsn, + disk_consistent_lsn, + remote_consistent_lsn, + replytime: ts, }; debug!("neon_status_update {status_update:?}"); diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index b0b2a23e3c..45037a8c01 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -1872,9 +1872,9 @@ RecvAppendResponses(Safekeeper *sk) return sk->state == SS_ACTIVE; } -/* Parse a ReplicationFeedback message, or the ReplicationFeedback part of an AppendResponse */ +/* Parse a PageserverFeedback message, or the PageserverFeedback part of an AppendResponse */ void -ParseReplicationFeedbackMessage(StringInfo reply_message, ReplicationFeedback * rf) +ParsePageserverFeedbackMessage(StringInfo reply_message, PageserverFeedback * rf) { uint8 nkeys; int i; @@ -1892,45 +1892,45 @@ ParseReplicationFeedbackMessage(StringInfo reply_message, ReplicationFeedback * pq_getmsgint(reply_message, sizeof(int32)); /* read value length */ rf->currentClusterSize = pq_getmsgint64(reply_message); - elog(DEBUG2, "ParseReplicationFeedbackMessage: current_timeline_size %lu", + elog(DEBUG2, "ParsePageserverFeedbackMessage: current_timeline_size %lu", rf->currentClusterSize); } - else if (strcmp(key, "ps_writelsn") == 0) + else if ((strcmp(key, "ps_writelsn") == 0) || (strcmp(key, "last_received_lsn") == 0)) { pq_getmsgint(reply_message, sizeof(int32)); /* read value length */ - rf->ps_writelsn = pq_getmsgint64(reply_message); - elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_writelsn %X/%X", - LSN_FORMAT_ARGS(rf->ps_writelsn)); + rf->last_received_lsn = pq_getmsgint64(reply_message); + elog(DEBUG2, "ParsePageserverFeedbackMessage: last_received_lsn %X/%X", + LSN_FORMAT_ARGS(rf->last_received_lsn)); } - else if (strcmp(key, "ps_flushlsn") == 0) + else if ((strcmp(key, "ps_flushlsn") == 0) || (strcmp(key, "disk_consistent_lsn") == 0)) { pq_getmsgint(reply_message, sizeof(int32)); /* read value length */ - rf->ps_flushlsn = pq_getmsgint64(reply_message); - elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_flushlsn %X/%X", - LSN_FORMAT_ARGS(rf->ps_flushlsn)); + rf->disk_consistent_lsn = pq_getmsgint64(reply_message); + elog(DEBUG2, "ParsePageserverFeedbackMessage: disk_consistent_lsn %X/%X", + LSN_FORMAT_ARGS(rf->disk_consistent_lsn)); } - else if (strcmp(key, "ps_applylsn") == 0) + else if ((strcmp(key, "ps_applylsn") == 0) || (strcmp(key, "remote_consistent_lsn") == 0)) { pq_getmsgint(reply_message, sizeof(int32)); /* read value length */ - rf->ps_applylsn = pq_getmsgint64(reply_message); - elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_applylsn %X/%X", - LSN_FORMAT_ARGS(rf->ps_applylsn)); + rf->remote_consistent_lsn = pq_getmsgint64(reply_message); + elog(DEBUG2, "ParsePageserverFeedbackMessage: remote_consistent_lsn %X/%X", + LSN_FORMAT_ARGS(rf->remote_consistent_lsn)); } - else if (strcmp(key, "ps_replytime") == 0) + else if ((strcmp(key, "ps_replytime") == 0) || (strcmp(key, "replytime") == 0)) { pq_getmsgint(reply_message, sizeof(int32)); /* read value length */ - rf->ps_replytime = pq_getmsgint64(reply_message); + rf->replytime = pq_getmsgint64(reply_message); { char *replyTimeStr; /* Copy because timestamptz_to_str returns a static buffer */ - replyTimeStr = pstrdup(timestamptz_to_str(rf->ps_replytime)); - elog(DEBUG2, "ParseReplicationFeedbackMessage: ps_replytime %lu reply_time: %s", - rf->ps_replytime, replyTimeStr); + replyTimeStr = pstrdup(timestamptz_to_str(rf->replytime)); + elog(DEBUG2, "ParsePageserverFeedbackMessage: replytime %lu reply_time: %s", + rf->replytime, replyTimeStr); pfree(replyTimeStr); } @@ -1944,7 +1944,7 @@ ParseReplicationFeedbackMessage(StringInfo reply_message, ReplicationFeedback * * Skip unknown keys to support backward compatibile protocol * changes */ - elog(LOG, "ParseReplicationFeedbackMessage: unknown key: %s len %d", key, len); + elog(LOG, "ParsePageserverFeedbackMessage: unknown key: %s len %d", key, len); pq_getmsgbytes(reply_message, len); }; } @@ -2024,7 +2024,7 @@ GetAcknowledgedByQuorumWALPosition(void) } /* - * ReplicationFeedbackShmemSize --- report amount of shared memory space needed + * WalproposerShmemSize --- report amount of shared memory space needed */ Size WalproposerShmemSize(void) @@ -2054,10 +2054,10 @@ WalproposerShmemInit(void) } void -replication_feedback_set(ReplicationFeedback * rf) +replication_feedback_set(PageserverFeedback * rf) { SpinLockAcquire(&walprop_shared->mutex); - memcpy(&walprop_shared->feedback, rf, sizeof(ReplicationFeedback)); + memcpy(&walprop_shared->feedback, rf, sizeof(PageserverFeedback)); SpinLockRelease(&walprop_shared->mutex); } @@ -2065,43 +2065,43 @@ void replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn) { SpinLockAcquire(&walprop_shared->mutex); - *writeLsn = walprop_shared->feedback.ps_writelsn; - *flushLsn = walprop_shared->feedback.ps_flushlsn; - *applyLsn = walprop_shared->feedback.ps_applylsn; + *writeLsn = walprop_shared->feedback.last_received_lsn; + *flushLsn = walprop_shared->feedback.disk_consistent_lsn; + *applyLsn = walprop_shared->feedback.remote_consistent_lsn; SpinLockRelease(&walprop_shared->mutex); } /* - * Get ReplicationFeedback fields from the most advanced safekeeper + * Get PageserverFeedback fields from the most advanced safekeeper */ static void -GetLatestNeonFeedback(ReplicationFeedback * rf) +GetLatestNeonFeedback(PageserverFeedback * rf) { int latest_safekeeper = 0; - XLogRecPtr ps_writelsn = InvalidXLogRecPtr; + XLogRecPtr last_received_lsn = InvalidXLogRecPtr; for (int i = 0; i < n_safekeepers; i++) { - if (safekeeper[i].appendResponse.rf.ps_writelsn > ps_writelsn) + if (safekeeper[i].appendResponse.rf.last_received_lsn > last_received_lsn) { latest_safekeeper = i; - ps_writelsn = safekeeper[i].appendResponse.rf.ps_writelsn; + last_received_lsn = safekeeper[i].appendResponse.rf.last_received_lsn; } } rf->currentClusterSize = safekeeper[latest_safekeeper].appendResponse.rf.currentClusterSize; - rf->ps_writelsn = safekeeper[latest_safekeeper].appendResponse.rf.ps_writelsn; - rf->ps_flushlsn = safekeeper[latest_safekeeper].appendResponse.rf.ps_flushlsn; - rf->ps_applylsn = safekeeper[latest_safekeeper].appendResponse.rf.ps_applylsn; - rf->ps_replytime = safekeeper[latest_safekeeper].appendResponse.rf.ps_replytime; + rf->last_received_lsn = safekeeper[latest_safekeeper].appendResponse.rf.last_received_lsn; + rf->disk_consistent_lsn = safekeeper[latest_safekeeper].appendResponse.rf.disk_consistent_lsn; + rf->remote_consistent_lsn = safekeeper[latest_safekeeper].appendResponse.rf.remote_consistent_lsn; + rf->replytime = safekeeper[latest_safekeeper].appendResponse.rf.replytime; elog(DEBUG2, "GetLatestNeonFeedback: currentClusterSize %lu," - " ps_writelsn %X/%X, ps_flushlsn %X/%X, ps_applylsn %X/%X, ps_replytime %lu", + " last_received_lsn %X/%X, disk_consistent_lsn %X/%X, remote_consistent_lsn %X/%X, replytime %lu", rf->currentClusterSize, - LSN_FORMAT_ARGS(rf->ps_writelsn), - LSN_FORMAT_ARGS(rf->ps_flushlsn), - LSN_FORMAT_ARGS(rf->ps_applylsn), - rf->ps_replytime); + LSN_FORMAT_ARGS(rf->last_received_lsn), + LSN_FORMAT_ARGS(rf->disk_consistent_lsn), + LSN_FORMAT_ARGS(rf->remote_consistent_lsn), + rf->replytime); replication_feedback_set(rf); } @@ -2115,16 +2115,16 @@ HandleSafekeeperResponse(void) XLogRecPtr minFlushLsn; minQuorumLsn = GetAcknowledgedByQuorumWALPosition(); - diskConsistentLsn = quorumFeedback.rf.ps_flushlsn; + diskConsistentLsn = quorumFeedback.rf.disk_consistent_lsn; if (!syncSafekeepers) { - /* Get ReplicationFeedback fields from the most advanced safekeeper */ + /* Get PageserverFeedback fields from the most advanced safekeeper */ GetLatestNeonFeedback(&quorumFeedback.rf); SetZenithCurrentClusterSize(quorumFeedback.rf.currentClusterSize); } - if (minQuorumLsn > quorumFeedback.flushLsn || diskConsistentLsn != quorumFeedback.rf.ps_flushlsn) + if (minQuorumLsn > quorumFeedback.flushLsn || diskConsistentLsn != quorumFeedback.rf.disk_consistent_lsn) { if (minQuorumLsn > quorumFeedback.flushLsn) @@ -2142,7 +2142,7 @@ HandleSafekeeperResponse(void) * apply_lsn - This is what processed and durably saved at* * pageserver. */ - quorumFeedback.rf.ps_flushlsn, + quorumFeedback.rf.disk_consistent_lsn, GetCurrentTimestamp(), false); } @@ -2326,7 +2326,7 @@ AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage * anymsg) msg->hs.xmin.value = pq_getmsgint64_le(&s); msg->hs.catalog_xmin.value = pq_getmsgint64_le(&s); if (buf_size > APPENDRESPONSE_FIXEDPART_SIZE) - ParseReplicationFeedbackMessage(&s, &msg->rf); + ParsePageserverFeedbackMessage(&s, &msg->rf); pq_getmsgend(&s); return true; } @@ -2462,7 +2462,7 @@ backpressure_lag_impl(void) replication_feedback_get_lsns(&writePtr, &flushPtr, &applyPtr); #define MB ((XLogRecPtr)1024 * 1024) - elog(DEBUG2, "current flushLsn %X/%X ReplicationFeedback: write %X/%X flush %X/%X apply %X/%X", + elog(DEBUG2, "current flushLsn %X/%X PageserverFeedback: write %X/%X flush %X/%X apply %X/%X", LSN_FORMAT_ARGS(myFlushLsn), LSN_FORMAT_ARGS(writePtr), LSN_FORMAT_ARGS(flushPtr), diff --git a/pgxn/neon/walproposer.h b/pgxn/neon/walproposer.h index 537c733850..f016a229eb 100644 --- a/pgxn/neon/walproposer.h +++ b/pgxn/neon/walproposer.h @@ -280,21 +280,21 @@ typedef struct HotStandbyFeedback FullTransactionId catalog_xmin; } HotStandbyFeedback; -typedef struct ReplicationFeedback +typedef struct PageserverFeedback { /* current size of the timeline on pageserver */ uint64 currentClusterSize; /* standby_status_update fields that safekeeper received from pageserver */ - XLogRecPtr ps_writelsn; - XLogRecPtr ps_flushlsn; - XLogRecPtr ps_applylsn; - TimestampTz ps_replytime; -} ReplicationFeedback; + XLogRecPtr last_received_lsn; + XLogRecPtr disk_consistent_lsn; + XLogRecPtr remote_consistent_lsn; + TimestampTz replytime; +} PageserverFeedback; typedef struct WalproposerShmemState { slock_t mutex; - ReplicationFeedback feedback; + PageserverFeedback feedback; term_t mineLastElectedTerm; pg_atomic_uint64 backpressureThrottlingTime; } WalproposerShmemState; @@ -320,10 +320,10 @@ typedef struct AppendResponse /* Feedback recieved from pageserver includes standby_status_update fields */ /* and custom neon feedback. */ /* This part of the message is extensible. */ - ReplicationFeedback rf; + PageserverFeedback rf; } AppendResponse; -/* ReplicationFeedback is extensible part of the message that is parsed separately */ +/* PageserverFeedback is extensible part of the message that is parsed separately */ /* Other fields are fixed part */ #define APPENDRESPONSE_FIXEDPART_SIZE offsetof(AppendResponse, rf) @@ -383,13 +383,13 @@ extern void WalProposerSync(int argc, char *argv[]); extern void WalProposerMain(Datum main_arg); extern void WalProposerBroadcast(XLogRecPtr startpos, XLogRecPtr endpos); extern void WalProposerPoll(void); -extern void ParseReplicationFeedbackMessage(StringInfo reply_message, - ReplicationFeedback *rf); +extern void ParsePageserverFeedbackMessage(StringInfo reply_message, + PageserverFeedback *rf); extern void StartProposerReplication(StartReplicationCmd *cmd); extern Size WalproposerShmemSize(void); extern bool WalproposerShmemInit(void); -extern void replication_feedback_set(ReplicationFeedback *rf); +extern void replication_feedback_set(PageserverFeedback *rf); extern void replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn); /* libpqwalproposer hooks & helper type */ diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs index c3077b6dc5..2aaa17bfc5 100644 --- a/safekeeper/src/metrics.rs +++ b/safekeeper/src/metrics.rs @@ -255,7 +255,7 @@ pub struct TimelineCollector { epoch_start_lsn: GenericGaugeVec, peer_horizon_lsn: GenericGaugeVec, remote_consistent_lsn: GenericGaugeVec, - feedback_ps_write_lsn: GenericGaugeVec, + ps_last_received_lsn: GenericGaugeVec, feedback_last_time_seconds: GenericGaugeVec, timeline_active: GenericGaugeVec, wal_backup_active: GenericGaugeVec, @@ -339,15 +339,15 @@ impl TimelineCollector { .unwrap(); descs.extend(remote_consistent_lsn.desc().into_iter().cloned()); - let feedback_ps_write_lsn = GenericGaugeVec::new( + let ps_last_received_lsn = GenericGaugeVec::new( Opts::new( - "safekeeper_feedback_ps_write_lsn", + "safekeeper_ps_last_received_lsn", "Last LSN received by the pageserver, acknowledged in the feedback", ), &["tenant_id", "timeline_id"], ) .unwrap(); - descs.extend(feedback_ps_write_lsn.desc().into_iter().cloned()); + descs.extend(ps_last_received_lsn.desc().into_iter().cloned()); let feedback_last_time_seconds = GenericGaugeVec::new( Opts::new( @@ -458,7 +458,7 @@ impl TimelineCollector { epoch_start_lsn, peer_horizon_lsn, remote_consistent_lsn, - feedback_ps_write_lsn, + ps_last_received_lsn, feedback_last_time_seconds, timeline_active, wal_backup_active, @@ -489,7 +489,7 @@ impl Collector for TimelineCollector { self.epoch_start_lsn.reset(); self.peer_horizon_lsn.reset(); self.remote_consistent_lsn.reset(); - self.feedback_ps_write_lsn.reset(); + self.ps_last_received_lsn.reset(); self.feedback_last_time_seconds.reset(); self.timeline_active.reset(); self.wal_backup_active.reset(); @@ -514,11 +514,11 @@ impl Collector for TimelineCollector { let timeline_id = tli.ttid.timeline_id.to_string(); let labels = &[tenant_id.as_str(), timeline_id.as_str()]; - let mut most_advanced: Option = None; + let mut most_advanced: Option = None; for replica in tli.replicas.iter() { if let Some(replica_feedback) = replica.pageserver_feedback { if let Some(current) = most_advanced { - if current.ps_writelsn < replica_feedback.ps_writelsn { + if current.last_received_lsn < replica_feedback.last_received_lsn { most_advanced = Some(replica_feedback); } } else { @@ -568,11 +568,10 @@ impl Collector for TimelineCollector { .set(tli.wal_storage.flush_wal_seconds); if let Some(feedback) = most_advanced { - self.feedback_ps_write_lsn + self.ps_last_received_lsn .with_label_values(labels) - .set(feedback.ps_writelsn); - if let Ok(unix_time) = feedback.ps_replytime.duration_since(SystemTime::UNIX_EPOCH) - { + .set(feedback.last_received_lsn); + if let Ok(unix_time) = feedback.replytime.duration_since(SystemTime::UNIX_EPOCH) { self.feedback_last_time_seconds .with_label_values(labels) .set(unix_time.as_secs()); @@ -599,7 +598,7 @@ impl Collector for TimelineCollector { mfs.extend(self.epoch_start_lsn.collect()); mfs.extend(self.peer_horizon_lsn.collect()); mfs.extend(self.remote_consistent_lsn.collect()); - mfs.extend(self.feedback_ps_write_lsn.collect()); + mfs.extend(self.ps_last_received_lsn.collect()); mfs.extend(self.feedback_last_time_seconds.collect()); mfs.extend(self.timeline_active.collect()); mfs.extend(self.wal_backup_active.collect()); diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index d8fe36d7f8..10b4842cbd 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -18,7 +18,7 @@ use crate::control_file; use crate::send_wal::HotStandbyFeedback; use crate::wal_storage; -use pq_proto::{ReplicationFeedback, SystemId}; +use pq_proto::{PageserverFeedback, SystemId}; use utils::{ bin_ser::LeSer, id::{NodeId, TenantId, TenantTimelineId, TimelineId}, @@ -360,7 +360,7 @@ pub struct AppendResponse { // a criterion for walproposer --sync mode exit pub commit_lsn: Lsn, pub hs_feedback: HotStandbyFeedback, - pub pageserver_feedback: ReplicationFeedback, + pub pageserver_feedback: PageserverFeedback, } impl AppendResponse { @@ -370,7 +370,7 @@ impl AppendResponse { flush_lsn: Lsn(0), commit_lsn: Lsn(0), hs_feedback: HotStandbyFeedback::empty(), - pageserver_feedback: ReplicationFeedback::empty(), + pageserver_feedback: PageserverFeedback::empty(), } } } @@ -708,7 +708,7 @@ where commit_lsn: self.state.commit_lsn, // will be filled by the upper code to avoid bothering safekeeper hs_feedback: HotStandbyFeedback::empty(), - pageserver_feedback: ReplicationFeedback::empty(), + pageserver_feedback: PageserverFeedback::empty(), }; trace!("formed AppendResponse {:?}", ar); ar diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index b533e87c5b..a6ca89efa4 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -11,7 +11,7 @@ use postgres_backend::PostgresBackend; use postgres_backend::{CopyStreamHandlerEnd, PostgresBackendReader, QueryError}; use postgres_ffi::get_current_timestamp; use postgres_ffi::{TimestampTz, MAX_SEND_SIZE}; -use pq_proto::{BeMessage, ReplicationFeedback, WalSndKeepAlive, XLogDataBody}; +use pq_proto::{BeMessage, PageserverFeedback, WalSndKeepAlive, XLogDataBody}; use serde::{Deserialize, Serialize}; use tokio::io::{AsyncRead, AsyncWrite}; @@ -319,11 +319,9 @@ impl ReplyReader { // pageserver sends this. // Note: deserializing is on m[9..] because we skip the tag byte and len bytes. let buf = Bytes::copy_from_slice(&msg[9..]); - let reply = ReplicationFeedback::parse(buf); + let reply = PageserverFeedback::parse(buf); - trace!("ReplicationFeedback is {:?}", reply); - // Only pageserver sends ReplicationFeedback, so set the flag. - // This replica is the source of information to resend to compute. + trace!("PageserverFeedback is {:?}", reply); self.feedback.pageserver_feedback = Some(reply); self.tli diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 931062db1a..9dd8a63cf0 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -4,7 +4,7 @@ use anyhow::{anyhow, bail, Result}; use parking_lot::{Mutex, MutexGuard}; use postgres_ffi::XLogSegNo; -use pq_proto::ReplicationFeedback; +use pq_proto::PageserverFeedback; use serde::Serialize; use std::cmp::{max, min}; use std::path::PathBuf; @@ -91,7 +91,7 @@ pub struct ReplicaState { /// combined hot standby feedback from all replicas pub hs_feedback: HotStandbyFeedback, /// Replication specific feedback received from pageserver, if any - pub pageserver_feedback: Option, + pub pageserver_feedback: Option, } impl Default for ReplicaState { @@ -276,7 +276,7 @@ impl SharedState { // if let Some(pageserver_feedback) = state.pageserver_feedback { if let Some(acc_feedback) = acc.pageserver_feedback { - if acc_feedback.ps_writelsn < pageserver_feedback.ps_writelsn { + if acc_feedback.last_received_lsn < pageserver_feedback.last_received_lsn { warn!("More than one pageserver is streaming WAL for the timeline. Feedback resolving is not fully supported yet."); acc.pageserver_feedback = Some(pageserver_feedback); } @@ -287,12 +287,12 @@ impl SharedState { // last lsn received by pageserver // FIXME if multiple pageservers are streaming WAL, last_received_lsn must be tracked per pageserver. // See https://github.com/neondatabase/neon/issues/1171 - acc.last_received_lsn = Lsn::from(pageserver_feedback.ps_writelsn); + acc.last_received_lsn = Lsn::from(pageserver_feedback.last_received_lsn); // When at least one pageserver has preserved data up to remote_consistent_lsn, // safekeeper is free to delete it, so choose max of all pageservers. acc.remote_consistent_lsn = max( - Lsn::from(pageserver_feedback.ps_applylsn), + Lsn::from(pageserver_feedback.remote_consistent_lsn), acc.remote_consistent_lsn, ); } @@ -585,7 +585,7 @@ impl Timeline { let replica_state = shared_state.replicas[replica_id].unwrap(); let reported_remote_consistent_lsn = replica_state .pageserver_feedback - .map(|f| Lsn(f.ps_applylsn)) + .map(|f| Lsn(f.remote_consistent_lsn)) .unwrap_or(Lsn::INVALID); let stop = shared_state.sk.inmem.commit_lsn == Lsn(0) || // no data at all yet (reported_remote_consistent_lsn!= Lsn::MAX && // Lsn::MAX means that we don't know the latest LSN yet. From cf5cfe6d718fa61ccd6d2e6d97d627a58ab2cd03 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Mon, 3 Apr 2023 13:26:45 +0300 Subject: [PATCH 23/25] fix: metric used for alerting threshold on staging (#3932) This should remove the too eager alerts from staging. --- .github/ansible/staging.eu-west-1.hosts.yaml | 1 + .github/ansible/staging.us-east-2.hosts.yaml | 1 + 2 files changed, 2 insertions(+) diff --git a/.github/ansible/staging.eu-west-1.hosts.yaml b/.github/ansible/staging.eu-west-1.hosts.yaml index e8d0bb1dc7..b634345c72 100644 --- a/.github/ansible/staging.eu-west-1.hosts.yaml +++ b/.github/ansible/staging.eu-west-1.hosts.yaml @@ -8,6 +8,7 @@ storage: pg_distrib_dir: /usr/local metric_collection_endpoint: http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events metric_collection_interval: 10min + evictions_low_residence_duration_metric_threshold: "20m" disk_usage_based_eviction: max_usage_pct: 80 # TODO: learn typical resident-size growth rate [GiB/minute] and configure diff --git a/.github/ansible/staging.us-east-2.hosts.yaml b/.github/ansible/staging.us-east-2.hosts.yaml index 4ef51651fc..c1ceaa61ee 100644 --- a/.github/ansible/staging.us-east-2.hosts.yaml +++ b/.github/ansible/staging.us-east-2.hosts.yaml @@ -8,6 +8,7 @@ storage: pg_distrib_dir: /usr/local metric_collection_endpoint: http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events metric_collection_interval: 10min + evictions_low_residence_duration_metric_threshold: "20m" disk_usage_based_eviction: max_usage_pct: 80 # TODO: learn typical resident-size growth rate [GiB/minute] and configure From a415670bc34825cbee8d548bff94abe3630c57fa Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Mon, 3 Apr 2023 14:15:41 +0300 Subject: [PATCH 24/25] feat: log evictions (#3930) this will help log analysis with the counterpart of already logging all remote download needs and downloads. ended up with a easily regexable output in the final round. --- pageserver/src/tenant/timeline.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index b40cb05411..e80e32644b 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1127,6 +1127,9 @@ impl Timeline { self.metrics .evictions_with_low_residence_duration .observe(delta); + info!(layer=%local_layer.short_id(), residence_millis=delta.as_millis(), "evicted layer after known residence period"); + } else { + info!(layer=%local_layer.short_id(), "evicted layer after unknown residence period"); } true From 45bf76eb05944e1356ad0b7158e90a3d4502a2da Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 3 Apr 2023 14:57:36 +0200 Subject: [PATCH 25/25] enable layer eviction by default in prod (#3933) Leave disk_usage_based_eviction above the current max usage in prod (82%ish), so that deploying this commit won't trigger disk_usage_based_eviction. As indicated in the TODO, we'll decrease the value to 80% later. Also update the staging YAMLs to use the anchor syntax for `evictions_low_residence_duration_metric_threshold` like we do in the prod YAMLs as of this patch. --- .github/ansible/prod.ap-southeast-1.hosts.yaml | 10 ++++++++++ .github/ansible/prod.eu-central-1.hosts.yaml | 10 ++++++++++ .github/ansible/prod.us-east-2.hosts.yaml | 10 ++++++++++ .github/ansible/prod.us-west-2.hosts.yaml | 10 ++++++++++ .github/ansible/staging.eu-west-1.hosts.yaml | 8 ++------ .github/ansible/staging.us-east-2.hosts.yaml | 8 ++------ 6 files changed, 44 insertions(+), 12 deletions(-) diff --git a/.github/ansible/prod.ap-southeast-1.hosts.yaml b/.github/ansible/prod.ap-southeast-1.hosts.yaml index 8ccb67b04a..c185086eef 100644 --- a/.github/ansible/prod.ap-southeast-1.hosts.yaml +++ b/.github/ansible/prod.ap-southeast-1.hosts.yaml @@ -8,6 +8,16 @@ storage: pg_distrib_dir: /usr/local metric_collection_endpoint: http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events metric_collection_interval: 10min + disk_usage_based_eviction: + max_usage_pct: 85 # TODO: decrease to 80 after all pageservers are below 80 + min_avail_bytes: 0 + period: "10s" + tenant_config: + eviction_policy: + kind: "LayerAccessThreshold" + period: "10m" + threshold: &default_eviction_threshold "24h" + evictions_low_residence_duration_metric_threshold: *default_eviction_threshold remote_storage: bucket_name: "{{ bucket_name }}" bucket_region: "{{ bucket_region }}" diff --git a/.github/ansible/prod.eu-central-1.hosts.yaml b/.github/ansible/prod.eu-central-1.hosts.yaml index b3cd5de01c..0a0f974ea4 100644 --- a/.github/ansible/prod.eu-central-1.hosts.yaml +++ b/.github/ansible/prod.eu-central-1.hosts.yaml @@ -8,6 +8,16 @@ storage: pg_distrib_dir: /usr/local metric_collection_endpoint: http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events metric_collection_interval: 10min + disk_usage_based_eviction: + max_usage_pct: 85 # TODO: decrease to 80 after all pageservers are below 80 + min_avail_bytes: 0 + period: "10s" + tenant_config: + eviction_policy: + kind: "LayerAccessThreshold" + period: "10m" + threshold: &default_eviction_threshold "24h" + evictions_low_residence_duration_metric_threshold: *default_eviction_threshold remote_storage: bucket_name: "{{ bucket_name }}" bucket_region: "{{ bucket_region }}" diff --git a/.github/ansible/prod.us-east-2.hosts.yaml b/.github/ansible/prod.us-east-2.hosts.yaml index 22c705e1cf..4427bb344e 100644 --- a/.github/ansible/prod.us-east-2.hosts.yaml +++ b/.github/ansible/prod.us-east-2.hosts.yaml @@ -8,6 +8,16 @@ storage: pg_distrib_dir: /usr/local metric_collection_endpoint: http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events metric_collection_interval: 10min + disk_usage_based_eviction: + max_usage_pct: 85 # TODO: decrease to 80 after all pageservers are below 80 + min_avail_bytes: 0 + period: "10s" + tenant_config: + eviction_policy: + kind: "LayerAccessThreshold" + period: "10m" + threshold: &default_eviction_threshold "24h" + evictions_low_residence_duration_metric_threshold: *default_eviction_threshold remote_storage: bucket_name: "{{ bucket_name }}" bucket_region: "{{ bucket_region }}" diff --git a/.github/ansible/prod.us-west-2.hosts.yaml b/.github/ansible/prod.us-west-2.hosts.yaml index f03e2d9435..53626b4f59 100644 --- a/.github/ansible/prod.us-west-2.hosts.yaml +++ b/.github/ansible/prod.us-west-2.hosts.yaml @@ -8,6 +8,16 @@ storage: pg_distrib_dir: /usr/local metric_collection_endpoint: http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events metric_collection_interval: 10min + disk_usage_based_eviction: + max_usage_pct: 85 # TODO: decrease to 80 after all pageservers are below 80 + min_avail_bytes: 0 + period: "10s" + tenant_config: + eviction_policy: + kind: "LayerAccessThreshold" + period: "10m" + threshold: &default_eviction_threshold "24h" + evictions_low_residence_duration_metric_threshold: *default_eviction_threshold remote_storage: bucket_name: "{{ bucket_name }}" bucket_region: "{{ bucket_region }}" diff --git a/.github/ansible/staging.eu-west-1.hosts.yaml b/.github/ansible/staging.eu-west-1.hosts.yaml index b634345c72..34c8e77280 100644 --- a/.github/ansible/staging.eu-west-1.hosts.yaml +++ b/.github/ansible/staging.eu-west-1.hosts.yaml @@ -8,20 +8,16 @@ storage: pg_distrib_dir: /usr/local metric_collection_endpoint: http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events metric_collection_interval: 10min - evictions_low_residence_duration_metric_threshold: "20m" disk_usage_based_eviction: max_usage_pct: 80 - # TODO: learn typical resident-size growth rate [GiB/minute] and configure - # min_avail_bytes such that we have X minutes of headroom. min_avail_bytes: 0 - # We assume that the worst-case growth rate is small enough that we can - # catch above-threshold conditions by checking every 10s. period: "10s" tenant_config: eviction_policy: kind: "LayerAccessThreshold" period: "20m" - threshold: "20m" + threshold: &default_eviction_threshold "20m" + evictions_low_residence_duration_metric_threshold: *default_eviction_threshold remote_storage: bucket_name: "{{ bucket_name }}" bucket_region: "{{ bucket_region }}" diff --git a/.github/ansible/staging.us-east-2.hosts.yaml b/.github/ansible/staging.us-east-2.hosts.yaml index c1ceaa61ee..94f2be83a4 100644 --- a/.github/ansible/staging.us-east-2.hosts.yaml +++ b/.github/ansible/staging.us-east-2.hosts.yaml @@ -8,20 +8,16 @@ storage: pg_distrib_dir: /usr/local metric_collection_endpoint: http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events metric_collection_interval: 10min - evictions_low_residence_duration_metric_threshold: "20m" disk_usage_based_eviction: max_usage_pct: 80 - # TODO: learn typical resident-size growth rate [GiB/minute] and configure - # min_avail_bytes such that we have X minutes of headroom. min_avail_bytes: 0 - # We assume that the worst-case growth rate is small enough that we can - # catch above-threshold conditions by checking every 10s. period: "10s" tenant_config: eviction_policy: kind: "LayerAccessThreshold" period: "20m" - threshold: "20m" + threshold: &default_eviction_threshold "20m" + evictions_low_residence_duration_metric_threshold: *default_eviction_threshold remote_storage: bucket_name: "{{ bucket_name }}" bucket_region: "{{ bucket_region }}"