From f0d15cee6f2921bfbd90d41ebdd3d11e4015396e Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Mon, 11 Dec 2023 13:21:02 +0200 Subject: [PATCH 01/18] build: update azure-* to 0.17 (#6081) this is a drive-by upgrade while we refresh the access tokens at the same time. --- Cargo.lock | 51 +++++++++++++++++++++++++++++++++++++-------------- Cargo.toml | 8 ++++---- 2 files changed, 41 insertions(+), 18 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 1364c9d84f..da480b8e0c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -184,7 +184,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "81953c529336010edd6d8e358f886d9581267795c61b19475b71314bffa46d35" dependencies = [ "concurrent-queue", - "event-listener", + "event-listener 2.5.3", "futures-core", ] @@ -205,11 +205,13 @@ dependencies = [ [[package]] name = "async-lock" -version = "2.8.0" +version = "3.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "287272293e9d8c41773cec55e365490fe034813a2f172f502d6ddcf75b2f582b" +checksum = "7125e42787d53db9dd54261812ef17e937c95a51e4d291373b670342fa44310c" dependencies = [ - "event-listener", + "event-listener 4.0.0", + "event-listener-strategy", + "pin-project-lite", ] [[package]] @@ -692,9 +694,9 @@ dependencies = [ [[package]] name = "azure_core" -version = "0.16.0" +version = "0.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e29286b9edfdd6f2c7e9d970bb5b015df8621258acab9ecfcea09b2d7692467" +checksum = "4ccd63c07d1fbfb3d4543d7ea800941bf5a30db1911b9b9e4db3b2c4210a434f" dependencies = [ "async-trait", "base64 0.21.1", @@ -719,9 +721,9 @@ dependencies = [ [[package]] name = "azure_identity" -version = "0.16.2" +version = "0.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b67b337346da8739e91ea1e9400a6ebc9bc54e0b2af1d23c9bcd565950588f9" +checksum = "8bd7ea32ca7eb66ff4757f83baac702ff11d469e5de365b6bc6f79f9c25d3436" dependencies = [ "async-lock", "async-trait", @@ -740,9 +742,9 @@ dependencies = [ [[package]] name = "azure_storage" -version = "0.16.0" +version = "0.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bed0ccefde57930b2886fd4aed1f70ac469c197b8c2e94828290d71bcbdb5d97" +checksum = "83ca0a07f89fd72a006da4713e93af3d6c44a693e61a1c3c2e7985de39c182e8" dependencies = [ "RustyXML", "async-trait", @@ -762,9 +764,9 @@ dependencies = [ [[package]] name = "azure_storage_blobs" -version = "0.16.0" +version = "0.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f91a52da2d192cfe43759f61e8bb31a5969f1722d5b85ac89627f356ad674ab4" +checksum = "8096c04d370118323c42b2752aa1883e4880a56ef65239f317b359f263b6e194" dependencies = [ "RustyXML", "azure_core", @@ -1686,6 +1688,27 @@ version = "2.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0206175f82b8d6bf6652ff7d71a1e27fd2e4efde587fd368662814d6ec1d9ce0" +[[package]] +name = "event-listener" +version = "4.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "770d968249b5d99410d61f5bf89057f3199a077a04d087092f58e7d10692baae" +dependencies = [ + "concurrent-queue", + "parking", + "pin-project-lite", +] + +[[package]] +name = "event-listener-strategy" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "958e4d70b6d5e81971bebec42271ec641e7ff4e170a6fa605f2b8a8b65cb97d3" +dependencies = [ + "event-listener 4.0.0", + "pin-project-lite", +] + [[package]] name = "fail" version = "0.5.1" @@ -3678,9 +3701,9 @@ dependencies = [ [[package]] name = "quick-xml" -version = "0.30.0" +version = "0.31.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eff6510e86862b57b210fd8cbe8ed3f0d7d600b9c2863cd4549a2e033c66e956" +checksum = "1004a344b30a54e2ee58d66a71b32d2db2feb0a31f9a2d302bf0536f15de2a33" dependencies = [ "memchr", "serde", diff --git a/Cargo.toml b/Cargo.toml index 33f56e084f..b5eece5e35 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -38,10 +38,10 @@ license = "Apache-2.0" anyhow = { version = "1.0", features = ["backtrace"] } arc-swap = "1.6" async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] } -azure_core = "0.16" -azure_identity = "0.16" -azure_storage = "0.16" -azure_storage_blobs = "0.16" +azure_core = "0.17" +azure_identity = "0.17" +azure_storage = "0.17" +azure_storage_blobs = "0.17" flate2 = "1.0.26" async-stream = "0.3" async-trait = "0.1" From 66a7a226f8db9a96a152bf98cc23d599a80eb554 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Mon, 11 Dec 2023 15:13:27 +0000 Subject: [PATCH 02/18] test_runner: use toml instead of formatted strings (#6088) ## Problem A bunch of refactorings extracted from https://github.com/neondatabase/neon/pull/6087 (not required for it); the most significant one is using toml instead of formatted strings. ## Summary of changes - Use toml instead of formatted strings for config - Skip pageserver log check if `pageserver.log` doesn't exist - `chmod -x test_runner/regress/test_config.py` --- test_runner/fixtures/neon_fixtures.py | 132 ++++++++++------------ test_runner/fixtures/remote_storage.py | 22 ++-- test_runner/regress/test_compatibility.py | 4 +- test_runner/regress/test_config.py | 0 4 files changed, 74 insertions(+), 84 deletions(-) mode change 100755 => 100644 test_runner/regress/test_config.py diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 9e0beeb4d1..c569b63d4e 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -28,6 +28,7 @@ import jwt import psycopg2 import pytest import requests +import toml from _pytest.config import Config from _pytest.config.argparsing import Parser from _pytest.fixtures import FixtureRequest @@ -436,7 +437,7 @@ class NeonEnvBuilder: # Pageserver remote storage self.pageserver_remote_storage = pageserver_remote_storage # Safekeepers remote storage - self.sk_remote_storage: Optional[RemoteStorage] = None + self.safekeepers_remote_storage: Optional[RemoteStorage] = None self.broker = broker self.run_id = run_id @@ -534,9 +535,11 @@ class NeonEnvBuilder: self.pageserver_remote_storage = ret def enable_safekeeper_remote_storage(self, kind: RemoteStorageKind): - assert self.sk_remote_storage is None, "sk_remote_storage already configured" + assert ( + self.safekeepers_remote_storage is None + ), "safekeepers_remote_storage already configured" - self.sk_remote_storage = self._configure_and_create_remote_storage( + self.safekeepers_remote_storage = self._configure_and_create_remote_storage( kind, RemoteStorageUser.SAFEKEEPER ) @@ -589,7 +592,7 @@ class NeonEnvBuilder: directory_to_clean.rmdir() def cleanup_remote_storage(self): - for x in [self.pageserver_remote_storage, self.sk_remote_storage]: + for x in [self.pageserver_remote_storage, self.safekeepers_remote_storage]: if isinstance(x, S3Storage): x.do_cleanup() @@ -693,7 +696,7 @@ class NeonEnv: self.pageservers: List[NeonPageserver] = [] self.broker = config.broker self.pageserver_remote_storage = config.pageserver_remote_storage - self.safekeepers_remote_storage = config.sk_remote_storage + self.safekeepers_remote_storage = config.safekeepers_remote_storage self.pg_version = config.pg_version # Binary path for pageserver, safekeeper, etc self.neon_binpath = config.neon_binpath @@ -718,25 +721,17 @@ class NeonEnv: self.attachment_service = None # Create a config file corresponding to the options - toml = textwrap.dedent( - f""" - default_tenant_id = '{config.initial_tenant}' - """ - ) + cfg: Dict[str, Any] = { + "default_tenant_id": str(self.initial_tenant), + "broker": { + "listen_addr": self.broker.listen_addr(), + }, + "pageservers": [], + "safekeepers": [], + } if self.control_plane_api is not None: - toml += textwrap.dedent( - f""" - control_plane_api = '{self.control_plane_api}' - """ - ) - - toml += textwrap.dedent( - f""" - [broker] - listen_addr = '{self.broker.listen_addr()}' - """ - ) + cfg["control_plane_api"] = self.control_plane_api # Create config for pageserver http_auth_type = "NeonJWT" if config.auth_enabled else "Trust" @@ -749,26 +744,24 @@ class NeonEnv: http=self.port_distributor.get_port(), ) - toml += textwrap.dedent( - f""" - [[pageservers]] - id={ps_id} - listen_pg_addr = 'localhost:{pageserver_port.pg}' - listen_http_addr = 'localhost:{pageserver_port.http}' - pg_auth_type = '{pg_auth_type}' - http_auth_type = '{http_auth_type}' - """ - ) - + ps_cfg: Dict[str, Any] = { + "id": ps_id, + "listen_pg_addr": f"localhost:{pageserver_port.pg}", + "listen_http_addr": f"localhost:{pageserver_port.http}", + "pg_auth_type": pg_auth_type, + "http_auth_type": http_auth_type, + } # Create a corresponding NeonPageserver object self.pageservers.append( NeonPageserver( self, ps_id, port=pageserver_port, - config_override=config.pageserver_config_override, + config_override=self.pageserver_config_override, ) ) + cfg["pageservers"].append(ps_cfg) + # Create config and a Safekeeper object for each safekeeper for i in range(1, config.num_safekeepers + 1): port = SafekeeperPort( @@ -777,32 +770,22 @@ class NeonEnv: http=self.port_distributor.get_port(), ) id = config.safekeepers_id_start + i # assign ids sequentially - toml += textwrap.dedent( - f""" - [[safekeepers]] - id = {id} - pg_port = {port.pg} - pg_tenant_only_port = {port.pg_tenant_only} - http_port = {port.http} - sync = {'true' if config.safekeepers_enable_fsync else 'false'}""" - ) + sk_cfg: Dict[str, Any] = { + "id": id, + "pg_port": port.pg, + "pg_tenant_only_port": port.pg_tenant_only, + "http_port": port.http, + "sync": config.safekeepers_enable_fsync, + } if config.auth_enabled: - toml += textwrap.dedent( - """ - auth_enabled = true - """ - ) - if config.sk_remote_storage is not None: - toml += textwrap.dedent( - f""" - remote_storage = "{remote_storage_to_toml_inline_table(config.sk_remote_storage)}" - """ - ) - safekeeper = Safekeeper(env=self, id=id, port=port) - self.safekeepers.append(safekeeper) + sk_cfg["auth_enabled"] = True + if self.safekeepers_remote_storage is not None: + sk_cfg["remote_storage"] = self.safekeepers_remote_storage.to_toml_inline_table() + self.safekeepers.append(Safekeeper(env=self, id=id, port=port)) + cfg["safekeepers"].append(sk_cfg) - log.info(f"Config: {toml}") - self.neon_cli.init(toml) + log.info(f"Config: {cfg}") + self.neon_cli.init(cfg) def start(self): # Start up broker, pageserver and all safekeepers @@ -1288,10 +1271,10 @@ class NeonCli(AbstractNeonCli): def init( self, - config_toml: str, + config: Dict[str, Any], ) -> "subprocess.CompletedProcess[str]": with tempfile.NamedTemporaryFile(mode="w+") as tmp: - tmp.write(config_toml) + tmp.write(toml.dumps(config)) tmp.flush() cmd = ["init", f"--config={tmp.name}", "--pg-version", self.env.pg_version] @@ -1732,8 +1715,13 @@ class NeonPageserver(PgProtocol): return Path(os.path.join(self.env.repo_dir, f"pageserver_{self.id}")) def assert_no_errors(self): - logfile = open(os.path.join(self.workdir, "pageserver.log"), "r") - errors = scan_pageserver_log_for_errors(logfile, self.allowed_errors) + logfile = self.workdir / "pageserver.log" + if not logfile.exists(): + log.warning(f"Skipping log check: {logfile} does not exist") + return + + with logfile.open("r") as f: + errors = scan_pageserver_log_for_errors(f, self.allowed_errors) for _lineno, error in errors: log.info(f"not allowed error: {error.strip()}") @@ -1757,7 +1745,10 @@ class NeonPageserver(PgProtocol): def log_contains(self, pattern: str) -> Optional[str]: """Check that the pageserver log contains a line that matches the given regex""" - logfile = open(os.path.join(self.workdir, "pageserver.log"), "r") + logfile = self.workdir / "pageserver.log" + if not logfile.exists(): + log.warning(f"Skipping log check: {logfile} does not exist") + return None contains_re = re.compile(pattern) @@ -1766,14 +1757,11 @@ class NeonPageserver(PgProtocol): # no guarantee it is already present in the log file. This hasn't # been a problem in practice, our python tests are not fast enough # to hit that race condition. - while True: - line = logfile.readline() - if not line: - break - - if contains_re.search(line): - # found it! - return line + with logfile.open("r") as f: + for line in f: + if contains_re.search(line): + # found it! + return line return None @@ -3355,8 +3343,6 @@ def parse_project_git_version_output(s: str) -> str: The information is generated by utils::project_git_version! """ - import re - res = re.search(r"git(-env)?:([0-9a-fA-F]{8,40})(-\S+)?", s) if res and (commit := res.group(2)): return commit diff --git a/test_runner/fixtures/remote_storage.py b/test_runner/fixtures/remote_storage.py index 565e5fa7f8..824531bea4 100644 --- a/test_runner/fixtures/remote_storage.py +++ b/test_runner/fixtures/remote_storage.py @@ -9,6 +9,7 @@ from pathlib import Path from typing import Any, Dict, List, Optional, Union import boto3 +import toml from mypy_boto3_s3 import S3Client from fixtures.log_helper import log @@ -133,7 +134,10 @@ class LocalFsStorage: return json.load(f) def to_toml_inline_table(self) -> str: - return f"local_path='{self.root}'" + rv = { + "local_path": str(self.root), + } + return toml.TomlEncoder().dump_inline_table(rv) def cleanup(self): # no cleanup is done here, because there's NeonEnvBuilder.cleanup_local_storage which will remove everything, including localfs files @@ -174,18 +178,18 @@ class S3Storage: ) def to_toml_inline_table(self) -> str: - s = [ - f"bucket_name='{self.bucket_name}'", - f"bucket_region='{self.bucket_region}'", - ] + rv = { + "bucket_name": self.bucket_name, + "bucket_region": self.bucket_region, + } if self.prefix_in_bucket is not None: - s.append(f"prefix_in_bucket='{self.prefix_in_bucket}'") + rv["prefix_in_bucket"] = self.prefix_in_bucket if self.endpoint is not None: - s.append(f"endpoint='{self.endpoint}'") + rv["endpoint"] = self.endpoint - return ",".join(s) + return toml.TomlEncoder().dump_inline_table(rv) def do_cleanup(self): if not self.cleanup: @@ -384,4 +388,4 @@ def remote_storage_to_toml_inline_table(remote_storage: RemoteStorage) -> str: if not isinstance(remote_storage, (LocalFsStorage, S3Storage)): raise Exception("invalid remote storage type") - return f"{{{remote_storage.to_toml_inline_table()}}}" + return remote_storage.to_toml_inline_table() diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index f3c6af4427..35963c0d41 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -7,7 +7,7 @@ from pathlib import Path from typing import Any, List, Optional import pytest -import toml # TODO: replace with tomllib for Python >= 3.11 +import toml from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonCli, @@ -411,7 +411,7 @@ def check_neon_works( config.initial_tenant = snapshot_config["default_tenant_id"] config.pg_distrib_dir = pg_distrib_dir config.remote_storage = None - config.sk_remote_storage = None + config.safekeepers_remote_storage = None # Use the "target" binaries to launch the storage nodes config_target = config diff --git a/test_runner/regress/test_config.py b/test_runner/regress/test_config.py old mode 100755 new mode 100644 From f1fc1fd639a2b061c6f1b5c2cb116eb3f4741d5b Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 11 Dec 2023 15:52:33 +0000 Subject: [PATCH 03/18] pageserver: further refactoring from TenantId to TenantShardId (#6059) ## Problem In https://github.com/neondatabase/neon/pull/5957, the most essential types were updated to use TenantShardId rather than TenantId. That unblocked other work, but didn't fully enable running multiple shards from the same tenant on the same pageserver. ## Summary of changes - Use TenantShardId in page cache key for materialized pages - Update mgr.rs get_tenant() and list_tenants() functions to use a shard id, and update all callers. - Eliminate the exactly_one_or_none helper in mgr.rs and all code that used it - Convert timeline HTTP routes to use tenant_shard_id Note on page cache: ``` struct MaterializedPageHashKey { /// Why is this TenantShardId rather than TenantId? /// /// Usually, the materialized value of a page@lsn is identical on any shard in the same tenant. However, this /// this not the case for certain internally-generated pages (e.g. relation sizes). In future, we may make this /// key smaller by omitting the shard, if we ensure that reads to such pages always skip the cache, or are /// special-cased in some other way. tenant_shard_id: TenantShardId, timeline_id: TimelineId, key: Key, } ``` --- control_plane/src/bin/neon_local.rs | 2 +- control_plane/src/tenant_migration.rs | 2 +- libs/pageserver_api/src/models.rs | 8 +- libs/pageserver_api/src/shard.rs | 5 + pageserver/src/consumption_metrics.rs | 14 +- pageserver/src/consumption_metrics/metrics.rs | 10 +- pageserver/src/http/routes.rs | 222 ++++++++++-------- pageserver/src/metrics.rs | 38 ++- pageserver/src/page_cache.rs | 24 +- pageserver/src/task_mgr.rs | 41 ++-- pageserver/src/tenant.rs | 4 +- pageserver/src/tenant/delete.rs | 4 +- pageserver/src/tenant/mgr.rs | 104 +++----- .../src/tenant/remote_timeline_client.rs | 2 +- pageserver/src/tenant/storage_layer/layer.rs | 2 +- pageserver/src/tenant/tasks.rs | 14 +- pageserver/src/tenant/timeline.rs | 49 ++-- pageserver/src/tenant/timeline/delete.rs | 6 +- .../src/tenant/timeline/eviction_task.rs | 4 +- pageserver/src/tenant/timeline/walreceiver.rs | 21 +- .../walreceiver/walreceiver_connection.rs | 2 +- 21 files changed, 299 insertions(+), 279 deletions(-) diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 8d53a6a658..6f0b929ac6 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -168,7 +168,7 @@ fn print_timelines_tree( info: t.clone(), children: BTreeSet::new(), name: timeline_name_mappings - .remove(&TenantTimelineId::new(t.tenant_id, t.timeline_id)), + .remove(&TenantTimelineId::new(t.tenant_id.tenant_id, t.timeline_id)), }, ) }) diff --git a/control_plane/src/tenant_migration.rs b/control_plane/src/tenant_migration.rs index c0c44e279f..fbb0358158 100644 --- a/control_plane/src/tenant_migration.rs +++ b/control_plane/src/tenant_migration.rs @@ -165,7 +165,7 @@ pub fn migrate_tenant( let found = other_ps_tenants .into_iter() .map(|t| t.id) - .any(|i| i == tenant_id); + .any(|i| i.tenant_id == tenant_id); if !found { continue; } diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 495a58e865..2572bcf74f 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -357,7 +357,7 @@ pub enum TenantAttachmentStatus { #[derive(Serialize, Deserialize, Clone)] pub struct TenantInfo { - pub id: TenantId, + pub id: TenantShardId, // NB: intentionally not part of OpenAPI, we don't want to commit to a specific set of TenantState's pub state: TenantState, /// Sum of the size of all layer files. @@ -369,7 +369,7 @@ pub struct TenantInfo { /// This represents the output of the "timeline_detail" and "timeline_list" API calls. #[derive(Debug, Serialize, Deserialize, Clone)] pub struct TimelineInfo { - pub tenant_id: TenantId, + pub tenant_id: TenantShardId, pub timeline_id: TimelineId, pub ancestor_timeline_id: Option, @@ -823,7 +823,7 @@ mod tests { fn test_tenantinfo_serde() { // Test serialization/deserialization of TenantInfo let original_active = TenantInfo { - id: TenantId::generate(), + id: TenantShardId::unsharded(TenantId::generate()), state: TenantState::Active, current_physical_size: Some(42), attachment_status: TenantAttachmentStatus::Attached, @@ -840,7 +840,7 @@ mod tests { }); let original_broken = TenantInfo { - id: TenantId::generate(), + id: TenantShardId::unsharded(TenantId::generate()), state: TenantState::Broken { reason: "reason".into(), backtrace: "backtrace info".into(), diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs index 9e83e0eee2..052fbd1402 100644 --- a/libs/pageserver_api/src/shard.rs +++ b/libs/pageserver_api/src/shard.rs @@ -76,6 +76,11 @@ impl TenantShardId { pub fn shard_slug(&self) -> impl std::fmt::Display + '_ { ShardSlug(self) } + + /// Convenience for code that has special behavior on the 0th shard. + pub fn is_zero(&self) -> bool { + self.shard_number == ShardNumber(0) + } } /// Formatting helper diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs index 7ad6a0f890..bb13bdd5e5 100644 --- a/pageserver/src/consumption_metrics.rs +++ b/pageserver/src/consumption_metrics.rs @@ -269,12 +269,18 @@ async fn calculate_synthetic_size_worker( } }; - for (tenant_id, tenant_state) in tenants { + for (tenant_shard_id, tenant_state) in tenants { if tenant_state != TenantState::Active { continue; } - if let Ok(tenant) = mgr::get_tenant(tenant_id, true) { + if !tenant_shard_id.is_zero() { + // We only send consumption metrics from shard 0, so don't waste time calculating + // synthetic size on other shards. + continue; + } + + if let Ok(tenant) = mgr::get_tenant(tenant_shard_id, true) { // TODO should we use concurrent_background_tasks_rate_limit() here, like the other background tasks? // We can put in some prioritization for consumption metrics. // Same for the loop that fetches computed metrics. @@ -286,7 +292,9 @@ async fn calculate_synthetic_size_worker( { return Ok(()); } - error!("failed to calculate synthetic size for tenant {tenant_id}: {e:#}"); + error!( + "failed to calculate synthetic size for tenant {tenant_shard_id}: {e:#}" + ); } } } diff --git a/pageserver/src/consumption_metrics/metrics.rs b/pageserver/src/consumption_metrics/metrics.rs index 918e45ea9e..0b827816bc 100644 --- a/pageserver/src/consumption_metrics/metrics.rs +++ b/pageserver/src/consumption_metrics/metrics.rs @@ -2,7 +2,6 @@ use crate::{context::RequestContext, tenant::timeline::logical_size::CurrentLogi use chrono::{DateTime, Utc}; use consumption_metrics::EventType; use futures::stream::StreamExt; -use pageserver_api::shard::ShardNumber; use std::{sync::Arc, time::SystemTime}; use utils::{ id::{TenantId, TimelineId}, @@ -198,12 +197,12 @@ pub(super) async fn collect_all_metrics( }; let tenants = futures::stream::iter(tenants).filter_map(|(id, state)| async move { - if state != TenantState::Active { + if state != TenantState::Active || !id.is_zero() { None } else { crate::tenant::mgr::get_tenant(id, true) .ok() - .map(|tenant| (id, tenant)) + .map(|tenant| (id.tenant_id, tenant)) } }); @@ -229,11 +228,6 @@ where while let Some((tenant_id, tenant)) = tenants.next().await { let mut tenant_resident_size = 0; - // Sharded tenants report all consumption metrics from shard zero - if tenant.tenant_shard_id().shard_number != ShardNumber(0) { - continue; - } - for timeline in tenant.list_timelines() { let timeline_id = timeline.timeline_id; diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 9e41d912c2..b9b0250671 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -352,8 +352,7 @@ async fn build_timeline_info_common( let walreceiver_status = timeline.walreceiver_status(); let info = TimelineInfo { - // TODO(sharding): add a shard_id field, or make tenant_id into a tenant_shard_id - tenant_id: timeline.tenant_shard_id.tenant_id, + tenant_id: timeline.tenant_shard_id, timeline_id: timeline.timeline_id, ancestor_timeline_id, ancestor_lsn, @@ -480,15 +479,15 @@ async fn timeline_list_handler( request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { - let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; let include_non_incremental_logical_size: Option = parse_query_param(&request, "include-non-incremental-logical-size")?; - check_permission(&request, Some(tenant_id))?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); let response_data = async { - let tenant = mgr::get_tenant(tenant_id, true)?; + let tenant = mgr::get_tenant(tenant_shard_id, true)?; let timelines = tenant.list_timelines(); let mut response_data = Vec::with_capacity(timelines.len()); @@ -507,7 +506,9 @@ async fn timeline_list_handler( } Ok::, ApiError>(response_data) } - .instrument(info_span!("timeline_list", %tenant_id)) + .instrument(info_span!("timeline_list", + tenant_id = %tenant_shard_id.tenant_id, + shard_id = %tenant_shard_id.shard_slug())) .await?; json_response(StatusCode::OK, response_data) @@ -517,17 +518,17 @@ async fn timeline_detail_handler( request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { - let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; let include_non_incremental_logical_size: Option = parse_query_param(&request, "include-non-incremental-logical-size")?; - check_permission(&request, Some(tenant_id))?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; // Logical size calculation needs downloading. let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); let timeline_info = async { - let tenant = mgr::get_tenant(tenant_id, true)?; + let tenant = mgr::get_tenant(tenant_shard_id, true)?; let timeline = tenant .get_timeline(timeline_id, false) @@ -544,7 +545,10 @@ async fn timeline_detail_handler( Ok::<_, ApiError>(timeline_info) } - .instrument(info_span!("timeline_detail", %tenant_id, %timeline_id)) + .instrument(info_span!("timeline_detail", + tenant_id = %tenant_shard_id.tenant_id, + shard_id = %tenant_shard_id.shard_slug(), + %timeline_id)) .await?; json_response(StatusCode::OK, timeline_info) @@ -554,8 +558,15 @@ async fn get_lsn_by_timestamp_handler( request: Request, cancel: CancellationToken, ) -> Result, ApiError> { - let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; - check_permission(&request, Some(tenant_id))?; + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; + + if !tenant_shard_id.is_zero() { + // Requires SLRU contents, which are only stored on shard zero + return Err(ApiError::BadRequest(anyhow!( + "Size calculations are only available on shard zero" + ))); + } let version: Option = parse_query_param(&request, "version")?; @@ -567,7 +578,7 @@ async fn get_lsn_by_timestamp_handler( let timestamp_pg = postgres_ffi::to_pg_timestamp(timestamp); let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); - let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?; + let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?; let result = timeline .find_lsn_for_timestamp(timestamp_pg, &cancel, &ctx) .await?; @@ -602,8 +613,15 @@ async fn get_timestamp_of_lsn_handler( request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { - let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; - check_permission(&request, Some(tenant_id))?; + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; + + if !tenant_shard_id.is_zero() { + // Requires SLRU contents, which are only stored on shard zero + return Err(ApiError::BadRequest(anyhow!( + "Size calculations are only available on shard zero" + ))); + } let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; @@ -613,7 +631,7 @@ async fn get_timestamp_of_lsn_handler( .map_err(ApiError::BadRequest)?; let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); - let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?; + let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?; let result = timeline.get_timestamp_for_lsn(lsn, &ctx).await?; match result { @@ -805,11 +823,11 @@ async fn tenant_status( request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { - let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; - check_permission(&request, Some(tenant_id))?; + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; let tenant_info = async { - let tenant = mgr::get_tenant(tenant_id, false)?; + let tenant = mgr::get_tenant(tenant_shard_id, false)?; // Calculate total physical size of all timelines let mut current_physical_size = 0; @@ -819,13 +837,15 @@ async fn tenant_status( let state = tenant.current_state(); Result::<_, ApiError>::Ok(TenantInfo { - id: tenant_id, + id: tenant_shard_id, state: state.clone(), current_physical_size: Some(current_physical_size), attachment_status: state.attachment_status(), }) } - .instrument(info_span!("tenant_status_handler", %tenant_id)) + .instrument(info_span!("tenant_status_handler", + tenant_id = %tenant_shard_id.tenant_id, + shard_id = %tenant_shard_id.shard_slug())) .await?; json_response(StatusCode::OK, tenant_info) @@ -868,14 +888,20 @@ async fn tenant_size_handler( request: Request, cancel: CancellationToken, ) -> Result, ApiError> { - let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; - check_permission(&request, Some(tenant_id))?; + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; let inputs_only: Option = parse_query_param(&request, "inputs_only")?; let retention_period: Option = parse_query_param(&request, "retention_period")?; let headers = request.headers(); let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); - let tenant = mgr::get_tenant(tenant_id, true)?; + let tenant = mgr::get_tenant(tenant_shard_id, true)?; + + if !tenant_shard_id.is_zero() { + return Err(ApiError::BadRequest(anyhow!( + "Size calculations are only available on shard zero" + ))); + } // this can be long operation let inputs = tenant @@ -927,7 +953,7 @@ async fn tenant_size_handler( json_response( StatusCode::OK, TenantHistorySize { - id: tenant_id, + id: tenant_shard_id.tenant_id, size: sizes.as_ref().map(|x| x.total_size), segment_sizes: sizes.map(|x| x.segments), inputs, @@ -939,14 +965,14 @@ async fn layer_map_info_handler( request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { - let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; let reset: LayerAccessStatsReset = parse_query_param(&request, "reset")?.unwrap_or(LayerAccessStatsReset::NoReset); - check_permission(&request, Some(tenant_id))?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; - let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?; + let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?; let layer_map_info = timeline.layer_map_info(reset).await; json_response(StatusCode::OK, layer_map_info) @@ -956,13 +982,12 @@ async fn layer_download_handler( request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { - let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; - check_permission(&request, Some(tenant_id))?; + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; let layer_file_name = get_request_param(&request, "layer_file_name")?; - check_permission(&request, Some(tenant_id))?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; - let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?; + let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?; let downloaded = timeline .download_layer(layer_file_name) .await @@ -973,7 +998,7 @@ async fn layer_download_handler( Some(false) => json_response(StatusCode::NOT_MODIFIED, ()), None => json_response( StatusCode::BAD_REQUEST, - format!("Layer {tenant_id}/{timeline_id}/{layer_file_name} not found"), + format!("Layer {tenant_shard_id}/{timeline_id}/{layer_file_name} not found"), ), } } @@ -982,12 +1007,12 @@ async fn evict_timeline_layer_handler( request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { - let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; - check_permission(&request, Some(tenant_id))?; + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; let layer_file_name = get_request_param(&request, "layer_file_name")?; - let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?; + let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?; let evicted = timeline .evict_layer(layer_file_name) .await @@ -998,7 +1023,7 @@ async fn evict_timeline_layer_handler( Some(false) => json_response(StatusCode::NOT_MODIFIED, ()), None => json_response( StatusCode::BAD_REQUEST, - format!("Layer {tenant_id}/{timeline_id}/{layer_file_name} not found"), + format!("Layer {tenant_shard_id}/{timeline_id}/{layer_file_name} not found"), ), } } @@ -1130,10 +1155,10 @@ async fn get_tenant_config_handler( request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { - let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; - check_permission(&request, Some(tenant_id))?; + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; - let tenant = mgr::get_tenant(tenant_id, false)?; + let tenant = mgr::get_tenant(tenant_shard_id, false)?; let response = HashMap::from([ ( @@ -1227,9 +1252,9 @@ async fn handle_tenant_break( r: Request, _cancel: CancellationToken, ) -> Result, ApiError> { - let tenant_id: TenantId = parse_request_param(&r, "tenant_id")?; + let tenant_shard_id: TenantShardId = parse_request_param(&r, "tenant_shard_id")?; - let tenant = crate::tenant::mgr::get_tenant(tenant_id, true) + let tenant = crate::tenant::mgr::get_tenant(tenant_shard_id, true) .map_err(|_| ApiError::Conflict(String::from("no active tenant found")))?; tenant.set_broken("broken from test".to_owned()).await; @@ -1270,14 +1295,15 @@ async fn timeline_gc_handler( mut request: Request, cancel: CancellationToken, ) -> Result, ApiError> { - let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; - check_permission(&request, Some(tenant_id))?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; let gc_req: TimelineGcRequest = json_request(&mut request).await?; let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); - let wait_task_done = mgr::immediate_gc(tenant_id, timeline_id, gc_req, cancel, &ctx).await?; + let wait_task_done = + mgr::immediate_gc(tenant_shard_id, timeline_id, gc_req, cancel, &ctx).await?; let gc_result = wait_task_done .await .context("wait for gc task") @@ -1292,9 +1318,9 @@ async fn timeline_compact_handler( request: Request, cancel: CancellationToken, ) -> Result, ApiError> { - let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; - check_permission(&request, Some(tenant_id))?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; let mut flags = EnumSet::empty(); if Some(true) == parse_query_param::<_, bool>(&request, "force_repartition")? { @@ -1302,14 +1328,14 @@ async fn timeline_compact_handler( } async { let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); - let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?; + let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?; timeline .compact(&cancel, flags, &ctx) .await .map_err(|e| ApiError::InternalServerError(e.into()))?; json_response(StatusCode::OK, ()) } - .instrument(info_span!("manual_compaction", %tenant_id, %timeline_id)) + .instrument(info_span!("manual_compaction", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id)) .await } @@ -1318,9 +1344,9 @@ async fn timeline_checkpoint_handler( request: Request, cancel: CancellationToken, ) -> Result, ApiError> { - let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; - check_permission(&request, Some(tenant_id))?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; let mut flags = EnumSet::empty(); if Some(true) == parse_query_param::<_, bool>(&request, "force_repartition")? { @@ -1328,7 +1354,7 @@ async fn timeline_checkpoint_handler( } async { let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); - let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?; + let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?; timeline .freeze_and_flush() .await @@ -1340,7 +1366,7 @@ async fn timeline_checkpoint_handler( json_response(StatusCode::OK, ()) } - .instrument(info_span!("manual_checkpoint", %tenant_id, %timeline_id)) + .instrument(info_span!("manual_checkpoint", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id)) .await } @@ -1348,12 +1374,12 @@ async fn timeline_download_remote_layers_handler_post( mut request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { - let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; let body: DownloadRemoteLayersTaskSpawnRequest = json_request(&mut request).await?; - check_permission(&request, Some(tenant_id))?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; - let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?; + let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?; match timeline.spawn_download_all_remote_layers(body).await { Ok(st) => json_response(StatusCode::ACCEPTED, st), Err(st) => json_response(StatusCode::CONFLICT, st), @@ -1364,11 +1390,11 @@ async fn timeline_download_remote_layers_handler_get( request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { - let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; - check_permission(&request, Some(tenant_id))?; + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; - let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?; + let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?; let info = timeline .get_download_all_remote_layers_task_info() .context("task never started since last pageserver process start") @@ -1414,9 +1440,9 @@ async fn getpage_at_lsn_handler( request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { - let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; - check_permission(&request, Some(tenant_id))?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; struct Key(crate::repository::Key); @@ -1435,7 +1461,7 @@ async fn getpage_at_lsn_handler( async { let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); - let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?; + let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?; let page = timeline.get(key.0, lsn, &ctx).await?; @@ -1447,7 +1473,7 @@ async fn getpage_at_lsn_handler( .unwrap(), ) } - .instrument(info_span!("timeline_get", %tenant_id, %timeline_id)) + .instrument(info_span!("timeline_get", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id)) .await } @@ -1455,9 +1481,9 @@ async fn timeline_collect_keyspace( request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { - let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; - check_permission(&request, Some(tenant_id))?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; struct Partitioning { keys: crate::keyspace::KeySpace, @@ -1526,7 +1552,7 @@ async fn timeline_collect_keyspace( async { let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); - let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?; + let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?; let at_lsn = at_lsn.unwrap_or_else(|| timeline.get_last_record_lsn()); let keys = timeline .collect_keyspace(at_lsn, &ctx) @@ -1535,15 +1561,15 @@ async fn timeline_collect_keyspace( json_response(StatusCode::OK, Partitioning { keys, at_lsn }) } - .instrument(info_span!("timeline_collect_keyspace", %tenant_id, %timeline_id)) + .instrument(info_span!("timeline_collect_keyspace", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id)) .await } async fn active_timeline_of_active_tenant( - tenant_id: TenantId, + tenant_shard_id: TenantShardId, timeline_id: TimelineId, ) -> Result, ApiError> { - let tenant = mgr::get_tenant(tenant_id, true)?; + let tenant = mgr::get_tenant(tenant_shard_id, true)?; tenant .get_timeline(timeline_id, true) .map_err(|e| ApiError::NotFound(e.into())) @@ -1820,23 +1846,25 @@ pub fn make_router( }) .get("/v1/tenant", |r| api_handler(r, tenant_list_handler)) .post("/v1/tenant", |r| api_handler(r, tenant_create_handler)) - .get("/v1/tenant/:tenant_id", |r| api_handler(r, tenant_status)) + .get("/v1/tenant/:tenant_shard_id", |r| { + api_handler(r, tenant_status) + }) .delete("/v1/tenant/:tenant_shard_id", |r| { api_handler(r, tenant_delete_handler) }) - .get("/v1/tenant/:tenant_id/synthetic_size", |r| { + .get("/v1/tenant/:tenant_shard_id/synthetic_size", |r| { api_handler(r, tenant_size_handler) }) .put("/v1/tenant/config", |r| { api_handler(r, update_tenant_config_handler) }) - .get("/v1/tenant/:tenant_id/config", |r| { + .get("/v1/tenant/:tenant_shard_id/config", |r| { api_handler(r, get_tenant_config_handler) }) .put("/v1/tenant/:tenant_shard_id/location_config", |r| { api_handler(r, put_tenant_location_config_handler) }) - .get("/v1/tenant/:tenant_id/timeline", |r| { + .get("/v1/tenant/:tenant_shard_id/timeline", |r| { api_handler(r, timeline_list_handler) }) .post("/v1/tenant/:tenant_shard_id/timeline", |r| { @@ -1857,47 +1885,50 @@ pub fn make_router( .post("/v1/tenant/:tenant_id/ignore", |r| { api_handler(r, tenant_ignore_handler) }) - .get("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| { + .get("/v1/tenant/:tenant_shard_id/timeline/:timeline_id", |r| { api_handler(r, timeline_detail_handler) }) .get( - "/v1/tenant/:tenant_id/timeline/:timeline_id/get_lsn_by_timestamp", + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/get_lsn_by_timestamp", |r| api_handler(r, get_lsn_by_timestamp_handler), ) .get( - "/v1/tenant/:tenant_id/timeline/:timeline_id/get_timestamp_of_lsn", + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/get_timestamp_of_lsn", |r| api_handler(r, get_timestamp_of_lsn_handler), ) - .put("/v1/tenant/:tenant_id/timeline/:timeline_id/do_gc", |r| { - api_handler(r, timeline_gc_handler) - }) - .put("/v1/tenant/:tenant_id/timeline/:timeline_id/compact", |r| { - testing_api_handler("run timeline compaction", r, timeline_compact_handler) - }) .put( - "/v1/tenant/:tenant_id/timeline/:timeline_id/checkpoint", + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/do_gc", + |r| api_handler(r, timeline_gc_handler), + ) + .put( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/compact", + |r| testing_api_handler("run timeline compaction", r, timeline_compact_handler), + ) + .put( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/checkpoint", |r| testing_api_handler("run timeline checkpoint", r, timeline_checkpoint_handler), ) .post( - "/v1/tenant/:tenant_id/timeline/:timeline_id/download_remote_layers", + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/download_remote_layers", |r| api_handler(r, timeline_download_remote_layers_handler_post), ) .get( - "/v1/tenant/:tenant_id/timeline/:timeline_id/download_remote_layers", + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/download_remote_layers", |r| api_handler(r, timeline_download_remote_layers_handler_get), ) .delete("/v1/tenant/:tenant_shard_id/timeline/:timeline_id", |r| { api_handler(r, timeline_delete_handler) }) - .get("/v1/tenant/:tenant_id/timeline/:timeline_id/layer", |r| { - api_handler(r, layer_map_info_handler) - }) .get( - "/v1/tenant/:tenant_id/timeline/:timeline_id/layer/:layer_file_name", + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/layer", + |r| api_handler(r, layer_map_info_handler), + ) + .get( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/layer/:layer_file_name", |r| api_handler(r, layer_download_handler), ) .delete( - "/v1/tenant/:tenant_id/timeline/:timeline_id/layer/:layer_file_name", + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/layer/:layer_file_name", |r| api_handler(r, evict_timeline_layer_handler), ) .put("/v1/disk_usage_eviction/run", |r| { @@ -1906,18 +1937,19 @@ pub fn make_router( .put("/v1/deletion_queue/flush", |r| { api_handler(r, deletion_queue_flush) }) - .put("/v1/tenant/:tenant_id/break", |r| { + .put("/v1/tenant/:tenant_shard_id/break", |r| { testing_api_handler("set tenant state to broken", r, handle_tenant_break) }) .get("/v1/panic", |r| api_handler(r, always_panic_handler)) .post("/v1/tracing/event", |r| { testing_api_handler("emit a tracing event", r, post_tracing_event_handler) }) - .get("/v1/tenant/:tenant_id/timeline/:timeline_id/getpage", |r| { - testing_api_handler("getpage@lsn", r, getpage_at_lsn_handler) - }) .get( - "/v1/tenant/:tenant_id/timeline/:timeline_id/keyspace", + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/getpage", + |r| testing_api_handler("getpage@lsn", r, getpage_at_lsn_handler), + ) + .get( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/keyspace", |r| testing_api_handler("read out the keyspace", r, timeline_collect_keyspace), ) .any(handler_404)) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 67d798c1d4..7cc0333ee5 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -650,7 +650,7 @@ static EVICTIONS_WITH_LOW_RESIDENCE_DURATION: Lazy = Lazy::new(|| "pageserver_evictions_with_low_residence_duration", "If a layer is evicted that was resident for less than `low_threshold`, it is counted to this counter. \ Residence duration is determined using the `residence_duration_data_source`.", - &["tenant_id", "timeline_id", "residence_duration_data_source", "low_threshold_secs"] + &["tenant_id", "shard_id", "timeline_id", "residence_duration_data_source", "low_threshold_secs"] ) .expect("failed to define a metric") }); @@ -714,10 +714,16 @@ impl EvictionsWithLowResidenceDurationBuilder { } } - fn build(&self, tenant_id: &str, timeline_id: &str) -> EvictionsWithLowResidenceDuration { + fn build( + &self, + tenant_id: &str, + shard_id: &str, + timeline_id: &str, + ) -> EvictionsWithLowResidenceDuration { let counter = EVICTIONS_WITH_LOW_RESIDENCE_DURATION .get_metric_with_label_values(&[ tenant_id, + shard_id, timeline_id, self.data_source, &EvictionsWithLowResidenceDuration::threshold_label_value(self.threshold), @@ -748,21 +754,24 @@ impl EvictionsWithLowResidenceDuration { pub fn change_threshold( &mut self, tenant_id: &str, + shard_id: &str, timeline_id: &str, new_threshold: Duration, ) { if new_threshold == self.threshold { return; } - let mut with_new = - EvictionsWithLowResidenceDurationBuilder::new(self.data_source, new_threshold) - .build(tenant_id, timeline_id); + let mut with_new = EvictionsWithLowResidenceDurationBuilder::new( + self.data_source, + new_threshold, + ) + .build(tenant_id, shard_id, timeline_id); std::mem::swap(self, &mut with_new); - with_new.remove(tenant_id, timeline_id); + with_new.remove(tenant_id, shard_id, timeline_id); } // This could be a `Drop` impl, but, we need the `tenant_id` and `timeline_id`. - fn remove(&mut self, tenant_id: &str, timeline_id: &str) { + fn remove(&mut self, tenant_id: &str, shard_id: &str, timeline_id: &str) { let Some(_counter) = self.counter.take() else { return; }; @@ -771,6 +780,7 @@ impl EvictionsWithLowResidenceDuration { let removed = EVICTIONS_WITH_LOW_RESIDENCE_DURATION.remove_label_values(&[ tenant_id, + shard_id, timeline_id, self.data_source, &threshold, @@ -1603,6 +1613,7 @@ impl StorageTimeMetrics { #[derive(Debug)] pub struct TimelineMetrics { tenant_id: String, + shard_id: String, timeline_id: String, pub flush_time_histo: StorageTimeMetrics, pub compact_time_histo: StorageTimeMetrics, @@ -1623,11 +1634,12 @@ pub struct TimelineMetrics { impl TimelineMetrics { pub fn new( - tenant_id: &TenantId, + tenant_shard_id: &TenantShardId, timeline_id: &TimelineId, evictions_with_low_residence_duration_builder: EvictionsWithLowResidenceDurationBuilder, ) -> Self { - let tenant_id = tenant_id.to_string(); + let tenant_id = tenant_shard_id.tenant_id.to_string(); + let shard_id = format!("{}", tenant_shard_id.shard_slug()); let timeline_id = timeline_id.to_string(); let flush_time_histo = StorageTimeMetrics::new(StorageTimeOperation::LayerFlush, &tenant_id, &timeline_id); @@ -1664,11 +1676,12 @@ impl TimelineMetrics { let evictions = EVICTIONS .get_metric_with_label_values(&[&tenant_id, &timeline_id]) .unwrap(); - let evictions_with_low_residence_duration = - evictions_with_low_residence_duration_builder.build(&tenant_id, &timeline_id); + let evictions_with_low_residence_duration = evictions_with_low_residence_duration_builder + .build(&tenant_id, &shard_id, &timeline_id); TimelineMetrics { tenant_id, + shard_id, timeline_id, flush_time_histo, compact_time_histo, @@ -1714,6 +1727,7 @@ impl Drop for TimelineMetrics { fn drop(&mut self) { let tenant_id = &self.tenant_id; let timeline_id = &self.timeline_id; + let shard_id = &self.shard_id; let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, timeline_id]); { RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get()); @@ -1727,7 +1741,7 @@ impl Drop for TimelineMetrics { self.evictions_with_low_residence_duration .write() .unwrap() - .remove(tenant_id, timeline_id); + .remove(tenant_id, shard_id, timeline_id); // The following metrics are born outside of the TimelineMetrics lifecycle but still // removed at the end of it. The idea is to have the metrics outlive the diff --git a/pageserver/src/page_cache.rs b/pageserver/src/page_cache.rs index dbd85d2dcf..c3c98af406 100644 --- a/pageserver/src/page_cache.rs +++ b/pageserver/src/page_cache.rs @@ -28,7 +28,7 @@ //! Page cache maps from a cache key to a buffer slot. //! The cache key uniquely identifies the piece of data that is being cached. //! -//! The cache key for **materialized pages** is [`TenantId`], [`TimelineId`], [`Key`], and [`Lsn`]. +//! The cache key for **materialized pages** is [`TenantShardId`], [`TimelineId`], [`Key`], and [`Lsn`]. //! Use [`PageCache::memorize_materialized_page`] and [`PageCache::lookup_materialized_page`] for fill & access. //! //! The cache key for **immutable file** pages is [`FileId`] and a block number. @@ -83,10 +83,8 @@ use std::{ use anyhow::Context; use once_cell::sync::OnceCell; -use utils::{ - id::{TenantId, TimelineId}, - lsn::Lsn, -}; +use pageserver_api::shard::TenantShardId; +use utils::{id::TimelineId, lsn::Lsn}; use crate::{ context::RequestContext, @@ -154,7 +152,13 @@ enum CacheKey { #[derive(Debug, PartialEq, Eq, Hash, Clone)] struct MaterializedPageHashKey { - tenant_id: TenantId, + /// Why is this TenantShardId rather than TenantId? + /// + /// Usually, the materialized value of a page@lsn is identical on any shard in the same tenant. However, this + /// this not the case for certain internally-generated pages (e.g. relation sizes). In future, we may make this + /// key smaller by omitting the shard, if we ensure that reads to such pages always skip the cache, or are + /// special-cased in some other way. + tenant_shard_id: TenantShardId, timeline_id: TimelineId, key: Key, } @@ -378,7 +382,7 @@ impl PageCache { /// returned page. pub async fn lookup_materialized_page( &self, - tenant_id: TenantId, + tenant_shard_id: TenantShardId, timeline_id: TimelineId, key: &Key, lsn: Lsn, @@ -395,7 +399,7 @@ impl PageCache { let mut cache_key = CacheKey::MaterializedPage { hash_key: MaterializedPageHashKey { - tenant_id, + tenant_shard_id, timeline_id, key: *key, }, @@ -436,7 +440,7 @@ impl PageCache { /// pub async fn memorize_materialized_page( &self, - tenant_id: TenantId, + tenant_shard_id: TenantShardId, timeline_id: TimelineId, key: Key, lsn: Lsn, @@ -444,7 +448,7 @@ impl PageCache { ) -> anyhow::Result<()> { let cache_key = CacheKey::MaterializedPage { hash_key: MaterializedPageHashKey { - tenant_id, + tenant_shard_id, timeline_id, key, }, diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index 4270b6edb0..5786356720 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -42,6 +42,7 @@ use std::sync::atomic::{AtomicU64, Ordering}; use std::sync::{Arc, Mutex}; use futures::FutureExt; +use pageserver_api::shard::TenantShardId; use tokio::runtime::Runtime; use tokio::task::JoinHandle; use tokio::task_local; @@ -51,7 +52,7 @@ use tracing::{debug, error, info, warn}; use once_cell::sync::Lazy; -use utils::id::{TenantId, TimelineId}; +use utils::id::TimelineId; use crate::shutdown_pageserver; @@ -317,7 +318,7 @@ struct PageServerTask { /// Tasks may optionally be launched for a particular tenant/timeline, enabling /// later cancelling tasks for that tenant/timeline in [`shutdown_tasks`] - tenant_id: Option, + tenant_shard_id: Option, timeline_id: Option, mutable: Mutex, @@ -329,7 +330,7 @@ struct PageServerTask { pub fn spawn( runtime: &tokio::runtime::Handle, kind: TaskKind, - tenant_id: Option, + tenant_shard_id: Option, timeline_id: Option, name: &str, shutdown_process_on_error: bool, @@ -345,7 +346,7 @@ where kind, name: name.to_string(), cancel: cancel.clone(), - tenant_id, + tenant_shard_id, timeline_id, mutable: Mutex::new(MutableTaskState { join_handle: None }), }); @@ -424,28 +425,28 @@ async fn task_finish( Ok(Err(err)) => { if shutdown_process_on_error { error!( - "Shutting down: task '{}' tenant_id: {:?}, timeline_id: {:?} exited with error: {:?}", - task_name, task.tenant_id, task.timeline_id, err + "Shutting down: task '{}' tenant_shard_id: {:?}, timeline_id: {:?} exited with error: {:?}", + task_name, task.tenant_shard_id, task.timeline_id, err ); shutdown_process = true; } else { error!( - "Task '{}' tenant_id: {:?}, timeline_id: {:?} exited with error: {:?}", - task_name, task.tenant_id, task.timeline_id, err + "Task '{}' tenant_shard_id: {:?}, timeline_id: {:?} exited with error: {:?}", + task_name, task.tenant_shard_id, task.timeline_id, err ); } } Err(err) => { if shutdown_process_on_error { error!( - "Shutting down: task '{}' tenant_id: {:?}, timeline_id: {:?} panicked: {:?}", - task_name, task.tenant_id, task.timeline_id, err + "Shutting down: task '{}' tenant_shard_id: {:?}, timeline_id: {:?} panicked: {:?}", + task_name, task.tenant_shard_id, task.timeline_id, err ); shutdown_process = true; } else { error!( - "Task '{}' tenant_id: {:?}, timeline_id: {:?} panicked: {:?}", - task_name, task.tenant_id, task.timeline_id, err + "Task '{}' tenant_shard_id: {:?}, timeline_id: {:?} panicked: {:?}", + task_name, task.tenant_shard_id, task.timeline_id, err ); } } @@ -467,11 +468,11 @@ async fn task_finish( /// /// Or to shut down all tasks for given timeline: /// -/// shutdown_tasks(None, Some(tenant_id), Some(timeline_id)) +/// shutdown_tasks(None, Some(tenant_shard_id), Some(timeline_id)) /// pub async fn shutdown_tasks( kind: Option, - tenant_id: Option, + tenant_shard_id: Option, timeline_id: Option, ) { let mut victim_tasks = Vec::new(); @@ -480,35 +481,35 @@ pub async fn shutdown_tasks( let tasks = TASKS.lock().unwrap(); for task in tasks.values() { if (kind.is_none() || Some(task.kind) == kind) - && (tenant_id.is_none() || task.tenant_id == tenant_id) + && (tenant_shard_id.is_none() || task.tenant_shard_id == tenant_shard_id) && (timeline_id.is_none() || task.timeline_id == timeline_id) { task.cancel.cancel(); victim_tasks.push(( Arc::clone(task), task.kind, - task.tenant_id, + task.tenant_shard_id, task.timeline_id, )); } } } - let log_all = kind.is_none() && tenant_id.is_none() && timeline_id.is_none(); + let log_all = kind.is_none() && tenant_shard_id.is_none() && timeline_id.is_none(); - for (task, task_kind, tenant_id, timeline_id) in victim_tasks { + for (task, task_kind, tenant_shard_id, timeline_id) in victim_tasks { let join_handle = { let mut task_mut = task.mutable.lock().unwrap(); task_mut.join_handle.take() }; if let Some(mut join_handle) = join_handle { if log_all { - if tenant_id.is_none() { + if tenant_shard_id.is_none() { // there are quite few of these info!(name = task.name, kind = ?task_kind, "stopping global task"); } else { // warn to catch these in tests; there shouldn't be any - warn!(name = task.name, tenant_id = ?tenant_id, timeline_id = ?timeline_id, kind = ?task_kind, "stopping left-over"); + warn!(name = task.name, tenant_shard_id = ?tenant_shard_id, timeline_id = ?timeline_id, kind = ?task_kind, "stopping left-over"); } } if tokio::time::timeout(std::time::Duration::from_secs(1), &mut join_handle) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 48f71d7747..a8e8b4cbfa 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -608,7 +608,7 @@ impl Tenant { task_mgr::spawn( &tokio::runtime::Handle::current(), TaskKind::Attach, - Some(tenant_shard_id.tenant_id), + Some(tenant_shard_id), None, "attach tenant", false, @@ -1917,7 +1917,7 @@ impl Tenant { // // this will additionally shutdown and await all timeline tasks. tracing::debug!("Waiting for tasks..."); - task_mgr::shutdown_tasks(None, Some(self.tenant_shard_id.tenant_id), None).await; + task_mgr::shutdown_tasks(None, Some(self.tenant_shard_id), None).await; // Wait for any in-flight operations to complete self.gate.close().await; diff --git a/pageserver/src/tenant/delete.rs b/pageserver/src/tenant/delete.rs index b8d6d0a321..acd311ace6 100644 --- a/pageserver/src/tenant/delete.rs +++ b/pageserver/src/tenant/delete.rs @@ -463,7 +463,7 @@ impl DeleteTenantFlow { task_mgr::spawn( task_mgr::BACKGROUND_RUNTIME.handle(), TaskKind::TimelineDeletionWorker, - Some(tenant_shard_id.tenant_id), + Some(tenant_shard_id), None, "tenant_delete", false, @@ -550,7 +550,7 @@ impl DeleteTenantFlow { // we encounter an InProgress marker, yield the barrier it contains and wait on it. let barrier = { let mut locked = tenants.write().unwrap(); - let removed = locked.remove(&tenant.tenant_shard_id.tenant_id); + let removed = locked.remove(tenant.tenant_shard_id); // FIXME: we should not be modifying this from outside of mgr.rs. // This will go away when we simplify deletion (https://github.com/neondatabase/neon/issues/5080) diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 8466fe7fca..4d7bd4259f 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -98,33 +98,6 @@ pub(crate) enum TenantsMap { ShuttingDown(BTreeMap), } -/// Helper for mapping shard-unaware functions to a sharding-aware map -/// TODO(sharding): all users of this must be made shard-aware. -fn exactly_one_or_none<'a>( - map: &'a BTreeMap, - tenant_id: &TenantId, -) -> Option<(&'a TenantShardId, &'a TenantSlot)> { - let mut slots = map.range(TenantShardId::tenant_range(*tenant_id)); - - // Retrieve the first two slots in the range: if both are populated, we must panic because the caller - // needs a shard-naive view of the world in which only one slot can exist for a TenantId at a time. - let slot_a = slots.next(); - let slot_b = slots.next(); - match (slot_a, slot_b) { - (None, None) => None, - (Some(slot), None) => { - // Exactly one matching slot - Some(slot) - } - (Some(_slot_a), Some(_slot_b)) => { - // Multiple shards for this tenant: cannot handle this yet. - // TODO(sharding): callers of get() should be shard-aware. - todo!("Attaching multiple shards in teh same tenant to the same pageserver") - } - (None, Some(_)) => unreachable!(), - } -} - pub(crate) enum TenantsMapRemoveResult { Occupied(TenantSlot), Vacant, @@ -147,12 +120,11 @@ impl TenantsMap { /// Convenience function for typical usage, where we want to get a `Tenant` object, for /// working with attached tenants. If the TenantId is in the map but in Secondary state, /// None is returned. - pub(crate) fn get(&self, tenant_id: &TenantId) -> Option<&Arc> { + pub(crate) fn get(&self, tenant_shard_id: &TenantShardId) -> Option<&Arc> { match self { TenantsMap::Initializing => None, TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => { - // TODO(sharding): callers of get() should be shard-aware. - exactly_one_or_none(m, tenant_id).and_then(|(_, slot)| slot.get_attached()) + m.get(tenant_shard_id).and_then(|slot| slot.get_attached()) } } } @@ -204,25 +176,19 @@ impl TenantsMap { /// /// The normal way to remove a tenant is using a SlotGuard, which will gracefully remove the guarded /// slot if the enclosed tenant is shutdown. - pub(crate) fn remove(&mut self, tenant_id: &TenantId) -> TenantsMapRemoveResult { + pub(crate) fn remove(&mut self, tenant_shard_id: TenantShardId) -> TenantsMapRemoveResult { use std::collections::btree_map::Entry; match self { TenantsMap::Initializing => TenantsMapRemoveResult::Vacant, - TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => { - let key = exactly_one_or_none(m, tenant_id).map(|(k, _)| *k); - match key { - Some(key) => match m.entry(key) { - Entry::Occupied(entry) => match entry.get() { - TenantSlot::InProgress(barrier) => { - TenantsMapRemoveResult::InProgress(barrier.clone()) - } - _ => TenantsMapRemoveResult::Occupied(entry.remove()), - }, - Entry::Vacant(_entry) => TenantsMapRemoveResult::Vacant, - }, - None => TenantsMapRemoveResult::Vacant, - } - } + TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => match m.entry(tenant_shard_id) { + Entry::Occupied(entry) => match entry.get() { + TenantSlot::InProgress(barrier) => { + TenantsMapRemoveResult::InProgress(barrier.clone()) + } + _ => TenantsMapRemoveResult::Occupied(entry.remove()), + }, + Entry::Vacant(_entry) => TenantsMapRemoveResult::Vacant, + }, } } @@ -822,14 +788,16 @@ pub(crate) async fn set_new_tenant_config( new_tenant_conf: TenantConfOpt, tenant_id: TenantId, ) -> Result<(), SetNewTenantConfigError> { + // Legacy API: does not support sharding + let tenant_shard_id = TenantShardId::unsharded(tenant_id); + info!("configuring tenant {tenant_id}"); - let tenant = get_tenant(tenant_id, true)?; + let tenant = get_tenant(tenant_shard_id, true)?; // This is a legacy API that only operates on attached tenants: the preferred // API to use is the location_config/ endpoint, which lets the caller provide // the full LocationConf. let location_conf = LocationConf::attached_single(new_tenant_conf, tenant.generation); - let tenant_shard_id = TenantShardId::unsharded(tenant_id); Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf) .await @@ -1143,14 +1111,11 @@ pub(crate) enum GetTenantError { /// /// This method is cancel-safe. pub(crate) fn get_tenant( - tenant_id: TenantId, + tenant_shard_id: TenantShardId, active_only: bool, ) -> Result, GetTenantError> { let locked = TENANTS.read().unwrap(); - // TODO(sharding): make all callers of get_tenant shard-aware - let tenant_shard_id = TenantShardId::unsharded(tenant_id); - let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read)?; match peek_slot { @@ -1162,14 +1127,18 @@ pub(crate) fn get_tenant( TenantState::Active => Ok(Arc::clone(tenant)), _ => { if active_only { - Err(GetTenantError::NotActive(tenant_id)) + Err(GetTenantError::NotActive(tenant_shard_id.tenant_id)) } else { Ok(Arc::clone(tenant)) } } }, - Some(TenantSlot::InProgress(_)) => Err(GetTenantError::NotActive(tenant_id)), - None | Some(TenantSlot::Secondary) => Err(GetTenantError::NotFound(tenant_id)), + Some(TenantSlot::InProgress(_)) => { + Err(GetTenantError::NotActive(tenant_shard_id.tenant_id)) + } + None | Some(TenantSlot::Secondary) => { + Err(GetTenantError::NotFound(tenant_shard_id.tenant_id)) + } } } @@ -1542,7 +1511,8 @@ pub(crate) enum TenantMapListError { /// /// Get list of tenants, for the mgmt API /// -pub(crate) async fn list_tenants() -> Result, TenantMapListError> { +pub(crate) async fn list_tenants() -> Result, TenantMapListError> +{ let tenants = TENANTS.read().unwrap(); let m = match &*tenants { TenantsMap::Initializing => return Err(TenantMapListError::Initializing), @@ -1550,12 +1520,10 @@ pub(crate) async fn list_tenants() -> Result, Tenan }; Ok(m.iter() .filter_map(|(id, tenant)| match tenant { - TenantSlot::Attached(tenant) => Some((id, tenant.current_state())), + TenantSlot::Attached(tenant) => Some((*id, tenant.current_state())), TenantSlot::Secondary => None, TenantSlot::InProgress(_) => None, }) - // TODO(sharding): make callers of this function shard-aware - .map(|(k, v)| (k.tenant_id, v)) .collect()) } @@ -2089,21 +2057,19 @@ use { }; pub(crate) async fn immediate_gc( - tenant_id: TenantId, + tenant_shard_id: TenantShardId, timeline_id: TimelineId, gc_req: TimelineGcRequest, cancel: CancellationToken, ctx: &RequestContext, ) -> Result>, ApiError> { let guard = TENANTS.read().unwrap(); - let tenant = guard - .get(&tenant_id) - .map(Arc::clone) - .with_context(|| format!("tenant {tenant_id}")) - .map_err(|e| ApiError::NotFound(e.into()))?; - // TODO(sharding): make callers of this function shard-aware - let tenant_shard_id = TenantShardId::unsharded(tenant_id); + let tenant = guard + .get(&tenant_shard_id) + .map(Arc::clone) + .with_context(|| format!("tenant {tenant_shard_id}")) + .map_err(|e| ApiError::NotFound(e.into()))?; let gc_horizon = gc_req.gc_horizon.unwrap_or_else(|| tenant.get_gc_horizon()); // Use tenant's pitr setting @@ -2116,9 +2082,9 @@ pub(crate) async fn immediate_gc( task_mgr::spawn( &tokio::runtime::Handle::current(), TaskKind::GarbageCollector, - Some(tenant_id), + Some(tenant_shard_id), Some(timeline_id), - &format!("timeline_gc_handler garbage collection run for tenant {tenant_id} timeline {timeline_id}"), + &format!("timeline_gc_handler garbage collection run for tenant {tenant_shard_id} timeline {timeline_id}"), false, async move { fail::fail_point!("immediate_gc_task_pre"); diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 03600cf5ae..3765ff6e7a 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -1223,7 +1223,7 @@ impl RemoteTimelineClient { task_mgr::spawn( &self.runtime, TaskKind::RemoteUploadTask, - Some(self.tenant_shard_id.tenant_id), + Some(self.tenant_shard_id), Some(self.timeline_id), "remote upload", false, diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 126d4d5563..112128ead8 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -837,7 +837,7 @@ impl LayerInner { crate::task_mgr::spawn( &tokio::runtime::Handle::current(), crate::task_mgr::TaskKind::RemoteDownloadTask, - Some(self.desc.tenant_shard_id.tenant_id), + Some(self.desc.tenant_shard_id), Some(self.desc.timeline_id), &task_name, false, diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index bc404c41a0..dc23030218 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -87,13 +87,13 @@ pub fn start_background_loops( tenant: &Arc, background_jobs_can_start: Option<&completion::Barrier>, ) { - let tenant_id = tenant.tenant_shard_id.tenant_id; + let tenant_shard_id = tenant.tenant_shard_id; task_mgr::spawn( BACKGROUND_RUNTIME.handle(), TaskKind::Compaction, - Some(tenant_id), + Some(tenant_shard_id), None, - &format!("compactor for tenant {tenant_id}"), + &format!("compactor for tenant {tenant_shard_id}"), false, { let tenant = Arc::clone(tenant); @@ -105,7 +105,7 @@ pub fn start_background_loops( _ = completion::Barrier::maybe_wait(background_jobs_can_start) => {} }; compaction_loop(tenant, cancel) - .instrument(info_span!("compaction_loop", tenant_id = %tenant_id)) + .instrument(info_span!("compaction_loop", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug())) .await; Ok(()) } @@ -114,9 +114,9 @@ pub fn start_background_loops( task_mgr::spawn( BACKGROUND_RUNTIME.handle(), TaskKind::GarbageCollector, - Some(tenant_id), + Some(tenant_shard_id), None, - &format!("garbage collector for tenant {tenant_id}"), + &format!("garbage collector for tenant {tenant_shard_id}"), false, { let tenant = Arc::clone(tenant); @@ -128,7 +128,7 @@ pub fn start_background_loops( _ = completion::Barrier::maybe_wait(background_jobs_can_start) => {} }; gc_loop(tenant, cancel) - .instrument(info_span!("gc_loop", tenant_id = %tenant_id)) + .instrument(info_span!("gc_loop", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug())) .await; Ok(()) } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 551b66b77d..f3907a6d2b 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -77,7 +77,7 @@ use postgres_ffi::to_pg_timestamp; use utils::{ completion, generation::Generation, - id::{TenantId, TimelineId}, + id::TimelineId, lsn::{AtomicLsn, Lsn, RecordLsn}, seqwait::SeqWait, simple_rcu::{Rcu, RcuReadGuard}, @@ -926,7 +926,7 @@ impl Timeline { tracing::debug!("Waiting for WalReceiverManager..."); task_mgr::shutdown_tasks( Some(TaskKind::WalReceiverManager), - Some(self.tenant_shard_id.tenant_id), + Some(self.tenant_shard_id), Some(self.timeline_id), ) .await; @@ -977,7 +977,7 @@ impl Timeline { // Shut down the layer flush task before the remote client, as one depends on the other task_mgr::shutdown_tasks( Some(TaskKind::LayerFlushTask), - Some(self.tenant_shard_id.tenant_id), + Some(self.tenant_shard_id), Some(self.timeline_id), ) .await; @@ -995,12 +995,7 @@ impl Timeline { tracing::debug!("Waiting for tasks..."); - task_mgr::shutdown_tasks( - None, - Some(self.tenant_shard_id.tenant_id), - Some(self.timeline_id), - ) - .await; + task_mgr::shutdown_tasks(None, Some(self.tenant_shard_id), Some(self.timeline_id)).await; // Finally wait until any gate-holders are complete self.gate.close().await; @@ -1314,16 +1309,20 @@ impl Timeline { &self.conf.default_tenant_conf, ); - // TODO(sharding): make evictions state shard aware - // (https://github.com/neondatabase/neon/issues/5953) let tenant_id_str = self.tenant_shard_id.tenant_id.to_string(); + let shard_id_str = format!("{}", self.tenant_shard_id.shard_slug()); let timeline_id_str = self.timeline_id.to_string(); self.metrics .evictions_with_low_residence_duration .write() .unwrap() - .change_threshold(&tenant_id_str, &timeline_id_str, new_threshold); + .change_threshold( + &tenant_id_str, + &shard_id_str, + &timeline_id_str, + new_threshold, + ); } } @@ -1395,7 +1394,7 @@ impl Timeline { ancestor_lsn: metadata.ancestor_lsn(), metrics: TimelineMetrics::new( - &tenant_shard_id.tenant_id, + &tenant_shard_id, &timeline_id, crate::metrics::EvictionsWithLowResidenceDurationBuilder::new( "mtime", @@ -1496,7 +1495,7 @@ impl Timeline { task_mgr::spawn( task_mgr::BACKGROUND_RUNTIME.handle(), task_mgr::TaskKind::LayerFlushTask, - Some(self.tenant_shard_id.tenant_id), + Some(self.tenant_shard_id), Some(self.timeline_id), "layer flush task", false, @@ -1847,7 +1846,7 @@ impl Timeline { task_mgr::spawn( task_mgr::BACKGROUND_RUNTIME.handle(), task_mgr::TaskKind::InitialLogicalSizeCalculation, - Some(self.tenant_shard_id.tenant_id), + Some(self.tenant_shard_id), Some(self.timeline_id), "initial size calculation", false, @@ -2020,7 +2019,7 @@ impl Timeline { task_mgr::spawn( task_mgr::BACKGROUND_RUNTIME.handle(), task_mgr::TaskKind::OndemandLogicalSizeCalculation, - Some(self.tenant_shard_id.tenant_id), + Some(self.tenant_shard_id), Some(self.timeline_id), "ondemand logical size calculation", false, @@ -2461,13 +2460,7 @@ impl Timeline { // FIXME: It's pointless to check the cache for things that are not 8kB pages. // We should look at the key to determine if it's a cacheable object let (lsn, read_guard) = cache - .lookup_materialized_page( - self.tenant_shard_id.tenant_id, - self.timeline_id, - key, - lsn, - ctx, - ) + .lookup_materialized_page(self.tenant_shard_id, self.timeline_id, key, lsn, ctx) .await?; let img = Bytes::from(read_guard.to_vec()); Some((lsn, img)) @@ -3209,7 +3202,7 @@ impl DurationRecorder { #[derive(Default)] struct CompactLevel0Phase1StatsBuilder { version: Option, - tenant_id: Option, + tenant_id: Option, timeline_id: Option, read_lock_acquisition_micros: DurationRecorder, read_lock_held_spawn_blocking_startup_micros: DurationRecorder, @@ -3226,7 +3219,7 @@ struct CompactLevel0Phase1StatsBuilder { #[derive(serde::Serialize)] struct CompactLevel0Phase1Stats { version: u64, - tenant_id: TenantId, + tenant_id: TenantShardId, timeline_id: TimelineId, read_lock_acquisition_micros: RecordedDuration, read_lock_held_spawn_blocking_startup_micros: RecordedDuration, @@ -3745,7 +3738,7 @@ impl Timeline { let ctx = ctx.attached_child(); let mut stats = CompactLevel0Phase1StatsBuilder { version: Some(2), - tenant_id: Some(self.tenant_shard_id.tenant_id), + tenant_id: Some(self.tenant_shard_id), timeline_id: Some(self.timeline_id), ..Default::default() }; @@ -4207,7 +4200,7 @@ impl Timeline { let cache = page_cache::get(); if let Err(e) = cache .memorize_materialized_page( - self.tenant_shard_id.tenant_id, + self.tenant_shard_id, self.timeline_id, key, last_rec_lsn, @@ -4251,7 +4244,7 @@ impl Timeline { let task_id = task_mgr::spawn( task_mgr::BACKGROUND_RUNTIME.handle(), task_mgr::TaskKind::DownloadAllRemoteLayers, - Some(self.tenant_shard_id.tenant_id), + Some(self.tenant_shard_id), Some(self.timeline_id), "download all remote layers task", false, diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index 2a103a7ff4..be873181d9 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -43,7 +43,7 @@ async fn stop_tasks(timeline: &Timeline) -> Result<(), DeleteTimelineError> { // Shut down the layer flush task before the remote client, as one depends on the other task_mgr::shutdown_tasks( Some(TaskKind::LayerFlushTask), - Some(timeline.tenant_shard_id.tenant_id), + Some(timeline.tenant_shard_id), Some(timeline.timeline_id), ) .await; @@ -71,7 +71,7 @@ async fn stop_tasks(timeline: &Timeline) -> Result<(), DeleteTimelineError> { info!("waiting for timeline tasks to shutdown"); task_mgr::shutdown_tasks( None, - Some(timeline.tenant_shard_id.tenant_id), + Some(timeline.tenant_shard_id), Some(timeline.timeline_id), ) .await; @@ -528,7 +528,7 @@ impl DeleteTimelineFlow { task_mgr::spawn( task_mgr::BACKGROUND_RUNTIME.handle(), TaskKind::TimelineDeletionWorker, - Some(tenant_shard_id.tenant_id), + Some(tenant_shard_id), Some(timeline_id), "timeline_delete", false, diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index 3fe4bc0f83..020c5a9e9f 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -60,7 +60,7 @@ impl Timeline { task_mgr::spawn( BACKGROUND_RUNTIME.handle(), TaskKind::Eviction, - Some(self.tenant_shard_id.tenant_id), + Some(self.tenant_shard_id), Some(self.timeline_id), &format!( "layer eviction for {}/{}", @@ -343,7 +343,7 @@ impl Timeline { // Make one of the tenant's timelines draw the short straw and run the calculation. // The others wait until the calculation is done so that they take into account the // imitated accesses that the winner made. - let tenant = match crate::tenant::mgr::get_tenant(self.tenant_shard_id.tenant_id, true) { + let tenant = match crate::tenant::mgr::get_tenant(self.tenant_shard_id, true) { Ok(t) => t, Err(_) => { return ControlFlow::Break(()); diff --git a/pageserver/src/tenant/timeline/walreceiver.rs b/pageserver/src/tenant/timeline/walreceiver.rs index 04ff8602d6..e32265afb5 100644 --- a/pageserver/src/tenant/timeline/walreceiver.rs +++ b/pageserver/src/tenant/timeline/walreceiver.rs @@ -30,6 +30,7 @@ use crate::tenant::timeline::walreceiver::connection_manager::{ connection_manager_loop_step, ConnectionManagerState, }; +use pageserver_api::shard::TenantShardId; use std::future::Future; use std::num::NonZeroU64; use std::ops::ControlFlow; @@ -41,7 +42,7 @@ use tokio::sync::watch; use tokio_util::sync::CancellationToken; use tracing::*; -use utils::id::TenantTimelineId; +use utils::id::TimelineId; use self::connection_manager::ConnectionManagerStatus; @@ -60,7 +61,8 @@ pub struct WalReceiverConf { } pub struct WalReceiver { - timeline: TenantTimelineId, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, manager_status: Arc>>, } @@ -71,7 +73,7 @@ impl WalReceiver { mut broker_client: BrokerClientChannel, ctx: &RequestContext, ) -> Self { - let tenant_id = timeline.tenant_shard_id.tenant_id; + let tenant_shard_id = timeline.tenant_shard_id; let timeline_id = timeline.timeline_id; let walreceiver_ctx = ctx.detached_child(TaskKind::WalReceiverManager, DownloadBehavior::Error); @@ -81,9 +83,9 @@ impl WalReceiver { task_mgr::spawn( WALRECEIVER_RUNTIME.handle(), TaskKind::WalReceiverManager, - Some(tenant_id), + Some(timeline.tenant_shard_id), Some(timeline_id), - &format!("walreceiver for timeline {tenant_id}/{timeline_id}"), + &format!("walreceiver for timeline {tenant_shard_id}/{timeline_id}"), false, async move { debug_assert_current_span_has_tenant_and_timeline_id(); @@ -117,11 +119,12 @@ impl WalReceiver { *loop_status.write().unwrap() = None; Ok(()) } - .instrument(info_span!(parent: None, "wal_connection_manager", tenant_id = %tenant_id, timeline_id = %timeline_id)) + .instrument(info_span!(parent: None, "wal_connection_manager", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), timeline_id = %timeline_id)) ); Self { - timeline: TenantTimelineId::new(tenant_id, timeline_id), + tenant_shard_id, + timeline_id, manager_status, } } @@ -129,8 +132,8 @@ impl WalReceiver { pub async fn stop(self) { task_mgr::shutdown_tasks( Some(TaskKind::WalReceiverManager), - Some(self.timeline.tenant_id), - Some(self.timeline.timeline_id), + Some(self.tenant_shard_id), + Some(self.timeline_id), ) .await; } diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index 3bcb7ff891..61ab236322 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -163,7 +163,7 @@ pub(super) async fn handle_walreceiver_connection( task_mgr::spawn( WALRECEIVER_RUNTIME.handle(), TaskKind::WalReceiverConnectionPoller, - Some(timeline.tenant_shard_id.tenant_id), + Some(timeline.tenant_shard_id), Some(timeline.timeline_id), "walreceiver connection", false, From 6a922b1a7543c41ef38ce8daaa7b4c9da271c158 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 11 Dec 2023 16:55:43 +0000 Subject: [PATCH 04/18] tests: start adding tests for secondary mode, live migration (#5842) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These tests have been loitering on a branch of mine for a while: they already provide value even without all the secondary mode bits landed yet, and the Workload helper is handy for other tests too. - `Workload` is a re-usable test workload that replaces some of the arbitrary "write a few rows" SQL that I've found my self repeating, and adds a systematic way to append data and check that reads properly reflect the changes. This append+validate stuff is important when doing migrations, as we want to detect situations where we might be reading from a pageserver that has not properly seen latest changes. - test_multi_attach is a validation of how the pageserver handles attaching the same tenant to multiple pageservers, from a safety point of view. This is intentionally separate from the larger testing of migration, to provide an isolated environment for multi-attachment. - test_location_conf_churn is a pseudo-random walk through the various states that TenantSlot can be put into, with validation that attached tenants remain externally readable when they should, and as a side effect validating that the compute endpoint's online configuration changes work as expected. - test_live_migration is the reference implementation of how to drive a pair of pageservers through a zero-downtime migration of a tenant. --------- Co-authored-by: Arpad Müller --- test_runner/fixtures/neon_fixtures.py | 27 +- test_runner/fixtures/pageserver/http.py | 19 +- test_runner/fixtures/workload.py | 148 ++++++++ .../regress/test_pageserver_generations.py | 103 +++++- .../regress/test_pageserver_secondary.py | 332 ++++++++++++++++++ 5 files changed, 623 insertions(+), 6 deletions(-) create mode 100644 test_runner/fixtures/workload.py create mode 100644 test_runner/regress/test_pageserver_secondary.py diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index c569b63d4e..fb6cea5713 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1712,7 +1712,7 @@ class NeonPageserver(PgProtocol): @property def workdir(self) -> Path: - return Path(os.path.join(self.env.repo_dir, f"pageserver_{self.id}")) + return self.env.repo_dir / f"pageserver_{self.id}" def assert_no_errors(self): logfile = self.workdir / "pageserver.log" @@ -1784,6 +1784,27 @@ class NeonPageserver(PgProtocol): client = self.http_client() return client.tenant_detach(tenant_id) + def tenant_location_configure(self, tenant_id: TenantId, config: dict[str, Any], **kwargs): + # This API is only for use when generations are enabled + assert self.env.attachment_service is not None + + if config["mode"].startswith("Attached") and "generation" not in config: + config["generation"] = self.env.attachment_service.attach_hook_issue(tenant_id, self.id) + + client = self.http_client() + return client.tenant_location_conf(tenant_id, config, **kwargs) + + def read_tenant_location_conf(self, tenant_id: TenantId) -> dict[str, Any]: + path = self.tenant_dir(tenant_id) / "config-v1" + log.info(f"Reading location conf from {path}") + bytes = open(path, "r").read() + try: + decoded: dict[str, Any] = toml.loads(bytes) + return decoded + except: + log.error(f"Failed to decode LocationConf, raw content ({len(bytes)} bytes): {bytes}") + raise + def tenant_create( self, tenant_id: TenantId, @@ -2717,6 +2738,7 @@ class EndpointFactory: lsn: Optional[Lsn] = None, hot_standby: bool = False, config_lines: Optional[List[str]] = None, + pageserver_id: Optional[int] = None, ) -> Endpoint: ep = Endpoint( self.env, @@ -2736,6 +2758,7 @@ class EndpointFactory: lsn=lsn, hot_standby=hot_standby, config_lines=config_lines, + pageserver_id=pageserver_id, ) def stop_all(self) -> "EndpointFactory": @@ -3082,7 +3105,7 @@ def pytest_addoption(parser: Parser): SMALL_DB_FILE_NAME_REGEX: re.Pattern = re.compile( # type: ignore[type-arg] - r"config|metadata|.+\.(?:toml|pid|json|sql)" + r"config|config-v1|heatmap-v1|metadata|.+\.(?:toml|pid|json|sql)" ) diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 3e75bac424..b46ddf5527 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -150,7 +150,7 @@ class PageserverHttpClient(requests.Session): # (this may change in future if we do fault injection of a kind that causes # requests TCP flows to stick) read=False, - backoff_factor=0, + backoff_factor=0.2, status_forcelist=[503], allowed_methods=None, remove_headers_on_redirect=[], @@ -277,6 +277,23 @@ class PageserverHttpClient(requests.Session): res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/reset", params=params) self.verbose_error(res) + def tenant_location_conf( + self, tenant_id: TenantId, location_conf=dict[str, Any], flush_ms=None + ): + body = location_conf.copy() + body["tenant_id"] = str(tenant_id) + + params = {} + if flush_ms is not None: + params["flush_ms"] = str(flush_ms) + + res = self.put( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/location_config", + json=body, + params=params, + ) + self.verbose_error(res) + def tenant_delete(self, tenant_id: TenantId): res = self.delete(f"http://localhost:{self.port}/v1/tenant/{tenant_id}") self.verbose_error(res) diff --git a/test_runner/fixtures/workload.py b/test_runner/fixtures/workload.py new file mode 100644 index 0000000000..241531437c --- /dev/null +++ b/test_runner/fixtures/workload.py @@ -0,0 +1,148 @@ +from typing import Optional + +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + Endpoint, + NeonEnv, + last_flush_lsn_upload, + wait_for_last_flush_lsn, +) +from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload +from fixtures.types import TenantId, TimelineId + + +class Workload: + """ + This is not a general purpose load generator: it exists for storage tests that need to inject some + high level types of storage work via the postgres interface: + - layer writes (`write_rows`) + - work for compaction (`churn_rows`) + - reads, checking we get the right data (`validate`) + """ + + def __init__(self, env: NeonEnv, tenant_id: TenantId, timeline_id: TimelineId): + self.env = env + self.tenant_id = tenant_id + self.timeline_id = timeline_id + self.table = "foo" + + self.expect_rows = 0 + self.churn_cursor = 0 + + self._endpoint: Optional[Endpoint] = None + + def endpoint(self, pageserver_id: int) -> Endpoint: + if self._endpoint is None: + self._endpoint = self.env.endpoints.create( + "main", + tenant_id=self.tenant_id, + pageserver_id=pageserver_id, + endpoint_id="ep-workload", + ) + self._endpoint.start(pageserver_id=pageserver_id) + else: + self._endpoint.reconfigure(pageserver_id=pageserver_id) + + connstring = self._endpoint.safe_psql( + "SELECT setting FROM pg_settings WHERE name='neon.pageserver_connstring'" + ) + log.info(f"Workload.endpoint: connstr={connstring}") + + return self._endpoint + + def __del__(self): + if self._endpoint is not None: + self._endpoint.stop() + + def init(self, pageserver_id: int): + endpoint = self.endpoint(pageserver_id) + + endpoint.safe_psql(f"CREATE TABLE {self.table} (id INTEGER PRIMARY KEY, val text);") + endpoint.safe_psql("CREATE EXTENSION IF NOT EXISTS neon_test_utils;") + last_flush_lsn_upload( + self.env, endpoint, self.tenant_id, self.timeline_id, pageserver_id=pageserver_id + ) + + def write_rows(self, n, pageserver_id): + endpoint = self.endpoint(pageserver_id) + start = self.expect_rows + end = start + n - 1 + self.expect_rows += n + dummy_value = "blah" + endpoint.safe_psql( + f""" + INSERT INTO {self.table} (id, val) + SELECT g, '{dummy_value}' + FROM generate_series({start}, {end}) g + """ + ) + + return last_flush_lsn_upload( + self.env, endpoint, self.tenant_id, self.timeline_id, pageserver_id=pageserver_id + ) + + def churn_rows(self, n, pageserver_id, upload=True): + assert self.expect_rows >= n + + max_iters = 10 + endpoint = self.endpoint(pageserver_id) + todo = n + i = 0 + while todo > 0: + i += 1 + if i > max_iters: + raise RuntimeError("oops") + start = self.churn_cursor % self.expect_rows + n_iter = min((self.expect_rows - start), todo) + todo -= n_iter + + end = start + n_iter - 1 + + log.info( + f"start,end = {start},{end}, cursor={self.churn_cursor}, expect_rows={self.expect_rows}" + ) + + assert end < self.expect_rows + + self.churn_cursor += n_iter + dummy_value = "blah" + endpoint.safe_psql_many( + [ + f""" + INSERT INTO {self.table} (id, val) + SELECT g, '{dummy_value}' + FROM generate_series({start}, {end}) g + ON CONFLICT (id) DO UPDATE + SET val = EXCLUDED.val + """, + f"VACUUM {self.table}", + ] + ) + + last_flush_lsn = wait_for_last_flush_lsn( + self.env, endpoint, self.tenant_id, self.timeline_id, pageserver_id=pageserver_id + ) + ps_http = self.env.get_pageserver(pageserver_id).http_client() + wait_for_last_record_lsn(ps_http, self.tenant_id, self.timeline_id, last_flush_lsn) + + if upload: + # force a checkpoint to trigger upload + ps_http.timeline_checkpoint(self.tenant_id, self.timeline_id) + wait_for_upload(ps_http, self.tenant_id, self.timeline_id, last_flush_lsn) + log.info(f"Churn: waiting for remote LSN {last_flush_lsn}") + else: + log.info(f"Churn: not waiting for upload, disk LSN {last_flush_lsn}") + + def validate(self, pageserver_id): + endpoint = self.endpoint(pageserver_id) + result = endpoint.safe_psql_many( + [ + "select clear_buffer_cache()", + f""" + SELECT COUNT(*) FROM {self.table} + """, + ] + ) + + log.info(f"validate({self.expect_rows}): {result}") + assert result == [[("",)], [(self.expect_rows,)]] diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index 66cc286aba..4488be31c5 100644 --- a/test_runner/regress/test_pageserver_generations.py +++ b/test_runner/regress/test_pageserver_generations.py @@ -23,14 +23,20 @@ from fixtures.neon_fixtures import ( PgBin, S3Scrubber, last_flush_lsn_upload, - wait_for_last_flush_lsn, ) -from fixtures.pageserver.utils import list_prefix +from fixtures.pageserver.http import PageserverApiException +from fixtures.pageserver.utils import ( + assert_tenant_state, + list_prefix, + wait_for_last_record_lsn, + wait_for_upload, +) from fixtures.remote_storage import ( RemoteStorageKind, ) from fixtures.types import TenantId, TimelineId from fixtures.utils import print_gc_result, wait_until +from fixtures.workload import Workload # A tenant configuration that is convenient for generating uploads and deletions # without a large amount of postgres traffic. @@ -93,7 +99,10 @@ def generate_uploads_and_deletions( ) assert tenant_id is not None assert timeline_id is not None - wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) + # We are waiting for uploads as well as local flush, in order to avoid leaving the system + # in a state where there are "future layers" in remote storage that will generate deletions + # after a restart. + last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id) ps_http.timeline_checkpoint(tenant_id, timeline_id) # Compaction should generate some GC-elegible layers @@ -560,3 +569,91 @@ def test_eviction_across_generations(neon_env_builder: NeonEnvBuilder): read_all(env, tenant_id, timeline_id) evict_all_layers(env, tenant_id, timeline_id) read_all(env, tenant_id, timeline_id) + + +def test_multi_attach( + neon_env_builder: NeonEnvBuilder, + pg_bin: PgBin, +): + neon_env_builder.enable_generations = True + neon_env_builder.num_pageservers = 3 + neon_env_builder.enable_pageserver_remote_storage( + remote_storage_kind=RemoteStorageKind.MOCK_S3, + ) + env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF) + + pageservers = env.pageservers + http_clients = list([p.http_client() for p in pageservers]) + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + # We will intentionally create situations where stale deletions happen from non-latest-generation + # nodes when the tenant is multiply-attached + for ps in env.pageservers: + ps.allowed_errors.extend( + [".*Dropped remote consistent LSN updates.*", ".*Dropping stale deletions.*"] + ) + + # Initially, the tenant will be attached to the first pageserver (first is default in our test harness) + wait_until(10, 0.2, lambda: assert_tenant_state(http_clients[0], tenant_id, "Active")) + _detail = http_clients[0].timeline_detail(tenant_id, timeline_id) + with pytest.raises(PageserverApiException): + http_clients[1].timeline_detail(tenant_id, timeline_id) + with pytest.raises(PageserverApiException): + http_clients[2].timeline_detail(tenant_id, timeline_id) + + workload = Workload(env, tenant_id, timeline_id) + workload.init(pageservers[0].id) + workload.write_rows(1000, pageservers[0].id) + + # Attach the tenant to the other two pageservers + pageservers[1].tenant_attach(env.initial_tenant) + pageservers[2].tenant_attach(env.initial_tenant) + + wait_until(10, 0.2, lambda: assert_tenant_state(http_clients[1], tenant_id, "Active")) + wait_until(10, 0.2, lambda: assert_tenant_state(http_clients[2], tenant_id, "Active")) + + # Now they all have it attached + _details = list([c.timeline_detail(tenant_id, timeline_id) for c in http_clients]) + _detail = http_clients[1].timeline_detail(tenant_id, timeline_id) + _detail = http_clients[2].timeline_detail(tenant_id, timeline_id) + + # The endpoint can use any pageserver to service its reads + for pageserver in pageservers: + workload.validate(pageserver.id) + + # If we write some more data, all the nodes can see it, including stale ones + wrote_lsn = workload.write_rows(1000, pageservers[0].id) + for ps_http in http_clients: + wait_for_last_record_lsn(ps_http, tenant_id, timeline_id, wrote_lsn) + + # ...and indeed endpoints can see it via any of the pageservers + for pageserver in pageservers: + workload.validate(pageserver.id) + + # Prompt all the pageservers, including stale ones, to upload ingested layers to remote storage + for ps_http in http_clients: + ps_http.timeline_checkpoint(tenant_id, timeline_id) + wait_for_upload(ps_http, tenant_id, timeline_id, wrote_lsn) + + # Now, the contents of remote storage will be a set of layers from each pageserver, but with unique + # generation numbers + # TODO: validate remote storage contents + + # Stop all pageservers + for ps in pageservers: + ps.stop() + + # Returning to a normal healthy state: all pageservers will start, but only the one most + # recently attached via the control plane will re-attach on startup + for ps in pageservers: + ps.start() + + with pytest.raises(PageserverApiException): + _detail = http_clients[0].timeline_detail(tenant_id, timeline_id) + with pytest.raises(PageserverApiException): + _detail = http_clients[1].timeline_detail(tenant_id, timeline_id) + _detail = http_clients[2].timeline_detail(tenant_id, timeline_id) + + # All data we wrote while multi-attached remains readable + workload.validate(pageservers[2].id) diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py new file mode 100644 index 0000000000..b14b7f1328 --- /dev/null +++ b/test_runner/regress/test_pageserver_secondary.py @@ -0,0 +1,332 @@ +import random +from typing import Any, Dict, Optional + +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserver +from fixtures.remote_storage import RemoteStorageKind +from fixtures.types import TenantId, TimelineId +from fixtures.utils import wait_until +from fixtures.workload import Workload + +# A tenant configuration that is convenient for generating uploads and deletions +# without a large amount of postgres traffic. +TENANT_CONF = { + # small checkpointing and compaction targets to ensure we generate many upload operations + "checkpoint_distance": f"{128 * 1024}", + "compaction_target_size": f"{128 * 1024}", + "compaction_threshold": "1", + # no PITR horizon, we specify the horizon when we request on-demand GC + "pitr_interval": "0s", + # disable background compaction and GC. We invoke it manually when we want it to happen. + "gc_period": "0s", + "compaction_period": "0s", + # create image layers eagerly, so that GC can remove some layers + "image_creation_threshold": "1", +} + + +def evict_random_layers( + rng: random.Random, pageserver: NeonPageserver, tenant_id: TenantId, timeline_id: TimelineId +): + """ + Evict 50% of the layers on a pageserver + """ + timeline_path = pageserver.timeline_dir(tenant_id, timeline_id) + initial_local_layers = sorted( + list(filter(lambda path: path.name != "metadata", timeline_path.glob("*"))) + ) + client = pageserver.http_client() + for layer in initial_local_layers: + if "ephemeral" in layer.name or "temp_download" in layer.name: + continue + + if rng.choice([True, False]): + log.info(f"Evicting layer {tenant_id}/{timeline_id} {layer.name}") + client.evict_layer(tenant_id=tenant_id, timeline_id=timeline_id, layer_name=layer.name) + + +@pytest.mark.parametrize("seed", [1, 2, 3]) +def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, seed: int): + """ + Issue many location configuration changes, ensure that tenants + remain readable & we don't get any unexpected errors. We should + have no ERROR in the log, and no 500s in the API. + + The location_config API is intentionally designed so that all destination + states are valid, so that we may test it in this way: the API should always + work as long as the tenant exists. + """ + neon_env_builder.enable_generations = True + neon_env_builder.num_pageservers = 3 + neon_env_builder.enable_pageserver_remote_storage( + remote_storage_kind=RemoteStorageKind.MOCK_S3, + ) + env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF) + assert env.attachment_service is not None + + pageservers = env.pageservers + list([p.http_client() for p in pageservers]) + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + # We will make no effort to avoid stale attachments + for ps in env.pageservers: + ps.allowed_errors.extend( + [ + ".*Dropped remote consistent LSN updates.*", + ".*Dropping stale deletions.*", + # page_service_conn_main{peer_addr=[::1]:41176}: query handler for 'pagestream 3b19aec5038c796f64b430b30a555121 d07776761d44050b8aab511df1657d83' failed: Tenant 3b19aec5038c796f64b430b30a555121 not found + ".*query handler.*Tenant.*not found.*", + # page_service_conn_main{peer_addr=[::1]:45552}: query handler for 'pagestream 414ede7ad50f775a8e7d9ba0e43b9efc a43884be16f44b3626482b6981b2c745' failed: Tenant 414ede7ad50f775a8e7d9ba0e43b9efc is not active + ".*query handler.*Tenant.*not active.*", + ] + ) + + # these can happen, if we shutdown at a good time. to be fixed as part of #5172. + message = ".*duplicated L1 layer layer=.*" + ps.allowed_errors.append(message) + + workload = Workload(env, tenant_id, timeline_id) + workload.init(env.pageservers[0].id) + workload.write_rows(256, env.pageservers[0].id) + + # We use a fixed seed to make the test reproducible: we want a randomly + # chosen order, but not to change the order every time we run the test. + rng = random.Random(seed) + + initial_generation = 1 + last_state = { + env.pageservers[0].id: ("AttachedSingle", initial_generation), + env.pageservers[1].id: ("Detached", None), + env.pageservers[2].id: ("Detached", None), + } + + latest_attached = env.pageservers[0].id + + for _i in range(0, 64): + # Pick a pageserver + pageserver = rng.choice(env.pageservers) + + # Pick a pseudorandom state + modes = [ + "AttachedSingle", + "AttachedMulti", + "AttachedStale", + "Secondary", + "Detached", + "_Evictions", + "_Restart", + ] + + mode = rng.choice(modes) + + last_state_ps = last_state[pageserver.id] + if mode == "_Evictions": + if last_state_ps[0].startswith("Attached"): + log.info(f"Action: evictions on pageserver {pageserver.id}") + evict_random_layers(rng, pageserver, tenant_id, timeline_id) + else: + log.info( + f"Action: skipping evictions on pageserver {pageserver.id}, is not attached" + ) + elif mode == "_Restart": + log.info(f"Action: restarting pageserver {pageserver.id}") + pageserver.stop() + pageserver.start() + if last_state_ps[0].startswith("Attached") and latest_attached == pageserver.id: + log.info("Entering postgres...") + workload.churn_rows(rng.randint(128, 256), pageserver.id) + workload.validate(pageserver.id) + elif last_state_ps[0].startswith("Attached"): + # The `attachment_service` will only re-attach on startup when a pageserver was the + # holder of the latest generation: otherwise the pageserver will revert to detached + # state if it was running attached with a stale generation + last_state[pageserver.id] = ("Detached", None) + else: + secondary_conf: Optional[Dict[str, Any]] = None + if mode == "Secondary": + secondary_conf = {"warm": rng.choice([True, False])} + + location_conf: Dict[str, Any] = { + "mode": mode, + "secondary_conf": secondary_conf, + "tenant_conf": {}, + } + + log.info(f"Action: Configuring pageserver {pageserver.id} to {location_conf}") + + # Select a generation number + if mode.startswith("Attached"): + if last_state_ps[1] is not None: + if rng.choice([True, False]): + # Move between attached states, staying in the same generation + generation = last_state_ps[1] + else: + # Switch generations, while also jumping between attached states + generation = env.attachment_service.attach_hook_issue( + tenant_id, pageserver.id + ) + latest_attached = pageserver.id + else: + generation = env.attachment_service.attach_hook_issue(tenant_id, pageserver.id) + latest_attached = pageserver.id + else: + generation = None + + location_conf["generation"] = generation + + pageserver.tenant_location_configure(tenant_id, location_conf) + last_state[pageserver.id] = (mode, generation) + + if mode.startswith("Attached"): + # This is a basic test: we are validating that he endpoint works properly _between_ + # configuration changes. A stronger test would be to validate that clients see + # no errors while we are making the changes. + workload.churn_rows( + rng.randint(128, 256), pageserver.id, upload=mode != "AttachedStale" + ) + workload.validate(pageserver.id) + + # Attach all pageservers + for ps in env.pageservers: + location_conf = {"mode": "AttachedMulti", "secondary_conf": None, "tenant_conf": {}} + ps.tenant_location_configure(tenant_id, location_conf) + + # Confirm that all are readable + for ps in env.pageservers: + workload.validate(ps.id) + + # Detach all pageservers + for ps in env.pageservers: + location_conf = {"mode": "Detached", "secondary_conf": None, "tenant_conf": {}} + ps.tenant_location_configure(tenant_id, location_conf) + + # Confirm that all local disk state was removed on detach + # TODO + + +def test_live_migration(neon_env_builder: NeonEnvBuilder): + """ + Test the sequence of location states that are used in a live migration. + """ + neon_env_builder.enable_generations = True + neon_env_builder.num_pageservers = 2 + neon_env_builder.enable_pageserver_remote_storage( + remote_storage_kind=RemoteStorageKind.MOCK_S3, + ) + env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF) + assert env.attachment_service is not None + + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + pageserver_a = env.pageservers[0] + pageserver_b = env.pageservers[1] + + initial_generation = 1 + + workload = Workload(env, tenant_id, timeline_id) + workload.init(env.pageservers[0].id) + workload.write_rows(256, env.pageservers[0].id) + + # Make the destination a secondary location + pageserver_b.tenant_location_configure( + tenant_id, + { + "mode": "Secondary", + "secondary_conf": {"warm": True}, + "tenant_conf": {}, + }, + ) + + workload.churn_rows(64, pageserver_a.id, upload=False) + + # Set origin attachment to stale + log.info("Setting origin to AttachedStale") + pageserver_a.tenant_location_configure( + tenant_id, + { + "mode": "AttachedStale", + "secondary_conf": None, + "tenant_conf": {}, + "generation": initial_generation, + }, + flush_ms=5000, + ) + + migrated_generation = env.attachment_service.attach_hook_issue(tenant_id, pageserver_b.id) + log.info(f"Acquired generation {migrated_generation} for destination pageserver") + assert migrated_generation == initial_generation + 1 + + # Writes and reads still work in AttachedStale. + workload.validate(pageserver_a.id) + + # TODO: call into secondary mode API hooks to do an upload/download sync + + # Generate some more dirty writes: we expect the origin to ingest WAL in + # in AttachedStale + workload.churn_rows(64, pageserver_a.id, upload=False) + workload.validate(pageserver_a.id) + + # Attach the destination + log.info("Setting destination to AttachedMulti") + pageserver_b.tenant_location_configure( + tenant_id, + { + "mode": "AttachedMulti", + "secondary_conf": None, + "tenant_conf": {}, + "generation": migrated_generation, + }, + ) + + # Wait for destination LSN to catch up with origin + origin_lsn = pageserver_a.http_client().timeline_detail(tenant_id, timeline_id)[ + "last_record_lsn" + ] + + def caught_up(): + destination_lsn = pageserver_b.http_client().timeline_detail(tenant_id, timeline_id)[ + "last_record_lsn" + ] + log.info( + f"Waiting for LSN to catch up: origin {origin_lsn} vs destination {destination_lsn}" + ) + assert destination_lsn >= origin_lsn + + wait_until(100, 0.1, caught_up) + + # The destination should accept writes + workload.churn_rows(64, pageserver_b.id) + + # Dual attached: both are readable. + workload.validate(pageserver_a.id) + workload.validate(pageserver_b.id) + + # Revert the origin to secondary + log.info("Setting origin to Secondary") + pageserver_a.tenant_location_configure( + tenant_id, + { + "mode": "Secondary", + "secondary_conf": {"warm": True}, + "tenant_conf": {}, + }, + ) + + workload.churn_rows(64, pageserver_b.id) + + # Put the destination into final state + pageserver_b.tenant_location_configure( + tenant_id, + { + "mode": "AttachedSingle", + "secondary_conf": None, + "tenant_conf": {}, + "generation": migrated_generation, + }, + ) + + workload.churn_rows(64, pageserver_b.id) + workload.validate(pageserver_b.id) From 036558c956da0833776a68a953e9cff443442a83 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Mon, 11 Dec 2023 10:25:43 -0600 Subject: [PATCH 05/18] Fix git ownership issue in check-codestyle-rust-arm We have this workaround for other jobs. Looks like this one was forgotten about. --- .github/workflows/neon_extra_builds.yml | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index 0d7db8dfbc..09a106fb52 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -238,6 +238,20 @@ jobs: options: --init steps: + - name: Fix git ownership + run: | + # Workaround for `fatal: detected dubious ownership in repository at ...` + # + # Use both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers + # Ref https://github.com/actions/checkout/issues/785 + # + git config --global --add safe.directory ${{ github.workspace }} + git config --global --add safe.directory ${GITHUB_WORKSPACE} + for r in 14 15 16; do + git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r" + git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r" + done + - name: Checkout uses: actions/checkout@v4 with: From 5ab9592a2dd2203f8f17e97ad6b69b28e63fe383 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Mon, 11 Dec 2023 10:46:41 -0600 Subject: [PATCH 06/18] Add submodule paths as safe directories as a precaution The check-codestyle-rust-arm job requires this for some reason, so let's just add them everywhere we do this workaround. --- .github/workflows/build_and_test.yml | 8 ++++++++ .github/workflows/neon_extra_builds.yml | 4 ++++ 2 files changed, 12 insertions(+) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 820848b4fb..693ed1a66f 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -199,6 +199,10 @@ jobs: # git config --global --add safe.directory ${{ github.workspace }} git config --global --add safe.directory ${GITHUB_WORKSPACE} + for r in 14 15 16; do + git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r" + git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r" + done - name: Checkout uses: actions/checkout@v3 @@ -1097,6 +1101,10 @@ jobs: # git config --global --add safe.directory ${{ github.workspace }} git config --global --add safe.directory ${GITHUB_WORKSPACE} + for r in 14 15 16; do + git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r" + git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r" + done - name: Checkout uses: actions/checkout@v3 diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index 09a106fb52..b1ea5e4f74 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -142,6 +142,10 @@ jobs: # git config --global --add safe.directory ${{ github.workspace }} git config --global --add safe.directory ${GITHUB_WORKSPACE} + for r in 14 15 16; do + git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r" + git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r" + done - name: Checkout uses: actions/checkout@v4 From c49fd69bd6f7f109f3c9100ee8544d132ccee020 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Mon, 11 Dec 2023 22:08:14 +0100 Subject: [PATCH 07/18] Add initdb_lsn to TimelineInfo (#6104) This way, we can query it. Background: I want to do statistics for how reproducible `initdb_lsn` really is, see https://github.com/neondatabase/cloud/issues/8284 and https://neondb.slack.com/archives/C036U0GRMRB/p1701895218280269 --- libs/pageserver_api/src/models.rs | 3 +++ pageserver/src/http/routes.rs | 8 +++++--- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 2572bcf74f..a3029e67a5 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -385,6 +385,9 @@ pub struct TimelineInfo { /// The LSN that we are advertizing to safekeepers pub remote_consistent_lsn_visible: Lsn, + /// The LSN from the start of the root timeline (never changes) + pub initdb_lsn: Lsn, + pub current_logical_size: u64, pub current_logical_size_is_accurate: bool, diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index b9b0250671..fee50460a5 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -319,6 +319,7 @@ async fn build_timeline_info_common( ctx: &RequestContext, ) -> anyhow::Result { crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id(); + let initdb_lsn = timeline.initdb_lsn; let last_record_lsn = timeline.get_last_record_lsn(); let (wal_source_connstr, last_received_msg_lsn, last_received_msg_ts) = { let guard = timeline.last_received_wal.lock().unwrap(); @@ -359,6 +360,7 @@ async fn build_timeline_info_common( disk_consistent_lsn: timeline.get_disk_consistent_lsn(), remote_consistent_lsn: remote_consistent_lsn_projected, remote_consistent_lsn_visible, + initdb_lsn, last_record_lsn, prev_record_lsn: Some(timeline.get_prev_record_lsn()), latest_gc_cutoff_lsn: *timeline.get_latest_gc_cutoff_lsn(), @@ -506,7 +508,7 @@ async fn timeline_list_handler( } Ok::, ApiError>(response_data) } - .instrument(info_span!("timeline_list", + .instrument(info_span!("timeline_list", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug())) .await?; @@ -545,7 +547,7 @@ async fn timeline_detail_handler( Ok::<_, ApiError>(timeline_info) } - .instrument(info_span!("timeline_detail", + .instrument(info_span!("timeline_detail", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id)) @@ -843,7 +845,7 @@ async fn tenant_status( attachment_status: state.attachment_status(), }) } - .instrument(info_span!("tenant_status_handler", + .instrument(info_span!("tenant_status_handler", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug())) .await?; From 3b04f3a7490562999397ed4263fb7cd0a817addb Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Mon, 11 Dec 2023 23:27:53 +0200 Subject: [PATCH 08/18] fix: accidential return Ok (#6106) Error indicating request cancellation OR timeline shutdown was deemed as a reason to exit the background worker that calculated synthetic size. Fix it to only be considered for avoiding logging such of such errors. --- pageserver/src/consumption_metrics.rs | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs index bb13bdd5e5..8f2b88d191 100644 --- a/pageserver/src/consumption_metrics.rs +++ b/pageserver/src/consumption_metrics.rs @@ -287,14 +287,20 @@ async fn calculate_synthetic_size_worker( // By using the same limiter, we centralize metrics collection for "start" and "finished" counters, // which turns out is really handy to understand the system. if let Err(e) = tenant.calculate_synthetic_size(cause, cancel, ctx).await { - if let Some(PageReconstructError::Cancelled) = - e.downcast_ref::() - { - return Ok(()); - } - error!( - "failed to calculate synthetic size for tenant {tenant_shard_id}: {e:#}" + // this error can be returned if timeline is shutting down, but it does not + // mean the synthetic size worker should terminate. we do not need any checks + // in this function because `mgr::get_tenant` will error out after shutdown has + // progressed to shutting down tenants. + let is_cancelled = matches!( + e.downcast_ref::(), + Some(PageReconstructError::Cancelled) ); + + if !is_cancelled { + error!( + "failed to calculate synthetic size for tenant {tenant_shard_id}: {e:#}" + ); + } } } } @@ -307,7 +313,7 @@ async fn calculate_synthetic_size_worker( let res = tokio::time::timeout_at( started_at + synthetic_size_calculation_interval, - task_mgr::shutdown_token().cancelled(), + cancel.cancelled(), ) .await; if res.is_ok() { From 20e9cf7d3131357b2ae9fa702795a6981e1347a9 Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 12 Dec 2023 07:19:59 +0000 Subject: [PATCH 09/18] pageserver: tweaks to slow/hung task logging (#6098) ## Problem - `shutdown_tasks` would log when a particular task was taking a long time to shut down, but not when it eventually completed. That left one uncertain as to whether the slow task was the source of a hang, or just a precursor. ## Summary of changes - Add a log line after a slow task shutdown - Add an equivalent in Gate's `warn_if_stuck`, in case we ever need it. This isn't related to the original issue but was noticed when checking through these logging paths. --- libs/utils/src/sync/gate.rs | 18 ++++++++++++++++-- pageserver/src/task_mgr.rs | 3 ++- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/libs/utils/src/sync/gate.rs b/libs/utils/src/sync/gate.rs index 9aad0af22d..31c76d2f74 100644 --- a/libs/utils/src/sync/gate.rs +++ b/libs/utils/src/sync/gate.rs @@ -30,18 +30,32 @@ async fn warn_if_stuck( let mut fut = std::pin::pin!(fut); - loop { + let mut warned = false; + let ret = loop { match tokio::time::timeout(warn_period, &mut fut).await { - Ok(ret) => return ret, + Ok(ret) => break ret, Err(_) => { tracing::warn!( gate = name, elapsed_ms = started.elapsed().as_millis(), "still waiting, taking longer than expected..." ); + warned = true; } } + }; + + // If we emitted a warning for slowness, also emit a message when we complete, so that + // someone debugging a shutdown can know for sure whether we have moved past this operation. + if warned { + tracing::info!( + gate = name, + elapsed_ms = started.elapsed().as_millis(), + "completed, after taking longer than expected" + ) } + + ret } #[derive(Debug)] diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index 5786356720..8747d9ad50 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -518,12 +518,13 @@ pub async fn shutdown_tasks( { // allow some time to elapse before logging to cut down the number of log // lines. - info!("waiting for {} to shut down", task.name); + info!("waiting for task {} to shut down", task.name); // we never handled this return value, but: // - we don't deschedule which would lead to is_cancelled // - panics are already logged (is_panicked) // - task errors are already logged in the wrapper let _ = join_handle.await; + info!("task {} completed", task.name); } } else { // Possibly one of: From fead836f26aa8dc93a9876812438e5d81ca23b42 Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 12 Dec 2023 10:39:34 +0000 Subject: [PATCH 10/18] swagger: remove 'format: hex' from tenant IDs (#6099) ## Problem TenantId is changing to TenantShardId in many APIs. The swagger had `format: hex` attributes on some of these IDs. That isn't formally defined anywhere, but a reasonable person might think it means "hex digits only", which will no longer be the case once we start using shard-aware IDs (they're like `-0001`). ## Summary of changes - Remove these `format` attributes from all `tenant_id` fields in the swagger definition --- pageserver/src/http/openapi_spec.yml | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 237109abfe..9422ccb2fd 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -84,7 +84,6 @@ paths: required: true schema: type: string - format: hex get: description: Get tenant status responses: @@ -181,7 +180,6 @@ paths: required: true schema: type: string - format: hex get: description: Get timelines for tenant responses: @@ -232,7 +230,6 @@ paths: required: true schema: type: string - format: hex - name: timeline_id in: path required: true @@ -338,7 +335,6 @@ paths: required: true schema: type: string - format: hex - name: timeline_id in: path required: true @@ -401,7 +397,6 @@ paths: required: true schema: type: string - format: hex - name: timeline_id in: path required: true @@ -469,7 +464,6 @@ paths: required: true schema: type: string - format: hex - name: timeline_id in: path required: true @@ -523,7 +517,6 @@ paths: required: true schema: type: string - format: hex post: description: | Schedules attach operation to happen in the background for the given tenant. @@ -631,7 +624,6 @@ paths: required: true schema: type: string - format: hex - name: flush_ms in: query required: false @@ -724,7 +716,6 @@ paths: required: true schema: type: string - format: hex - name: detach_ignored in: query required: false @@ -784,7 +775,6 @@ paths: required: true schema: type: string - format: hex post: description: | Remove tenant data (including all corresponding timelines) from pageserver's memory. @@ -833,7 +823,6 @@ paths: required: true schema: type: string - format: hex post: description: | Schedules an operation that attempts to load a tenant from the local disk and @@ -890,7 +879,6 @@ paths: required: true schema: type: string - format: hex get: description: | Calculate tenant's synthetic size @@ -933,7 +921,6 @@ paths: required: true schema: type: string - format: hex - name: inputs_only in: query required: false @@ -1003,7 +990,6 @@ paths: required: true schema: type: string - format: hex post: description: | Create a timeline. Returns new timeline id on success.\ @@ -1137,7 +1123,6 @@ paths: application/json: schema: type: string - format: hex "400": description: Malformed tenant create request content: @@ -1234,7 +1219,6 @@ paths: required: true schema: type: string - format: hex get: description: | Returns tenant's config description: specific config overrides a tenant has @@ -1340,7 +1324,6 @@ components: properties: new_tenant_id: type: string - format: hex generation: type: integer description: Attachment generation number. @@ -1369,7 +1352,6 @@ components: properties: tenant_id: type: string - format: hex TenantLocationConfigRequest: type: object required: @@ -1377,7 +1359,6 @@ components: properties: tenant_id: type: string - format: hex mode: type: string enum: ["AttachedSingle", "AttachedMulti", "AttachedStale", "Secondary", "Detached"] @@ -1446,7 +1427,6 @@ components: format: hex tenant_id: type: string - format: hex last_record_lsn: type: string format: hex From 9e071e445814dd57af30eb589f61beec4a746c54 Mon Sep 17 00:00:00 2001 From: Anna Khanova <32508607+khanova@users.noreply.github.com> Date: Tue, 12 Dec 2023 12:42:51 +0100 Subject: [PATCH 11/18] Propagate information about the protocol to console (#6102) ## Problem In snowflake logs currently there is no information about the protocol, that the client uses. ## Summary of changes Propagate the information about the protocol together with the app_name. In format: `{app_name}/{sql_over_http/tcp/ws}`. This will give to @stepashka more observability on what our clients are using. --- proxy/src/auth/backend.rs | 10 +++++----- proxy/src/console/provider.rs | 12 ++++++------ proxy/src/console/provider/mock.rs | 6 +++--- proxy/src/console/provider/neon.rs | 20 ++++++++++---------- proxy/src/proxy.rs | 13 ++++++++----- proxy/src/proxy/tests.rs | 4 ++-- proxy/src/serverless/conn_pool.rs | 4 ++-- 7 files changed, 36 insertions(+), 33 deletions(-) diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index 649b3f40f2..ba054b53eb 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -166,7 +166,7 @@ impl TryFrom for ComputeUserInfo { /// All authentication flows will emit an AuthenticationOk message if successful. async fn auth_quirks( api: &impl console::Api, - extra: &ConsoleReqExtra<'_>, + extra: &ConsoleReqExtra, creds: ClientCredentials, client: &mut stream::PqStream>, allow_cleartext: bool, @@ -235,7 +235,7 @@ async fn auth_quirks( /// only if authentication was successfuly. async fn auth_and_wake_compute( api: &impl console::Api, - extra: &ConsoleReqExtra<'_>, + extra: &ConsoleReqExtra, creds: ClientCredentials, client: &mut stream::PqStream>, allow_cleartext: bool, @@ -314,7 +314,7 @@ impl<'a> BackendType<'a, ClientCredentials> { #[tracing::instrument(fields(allow_cleartext = allow_cleartext), skip_all)] pub async fn authenticate( self, - extra: &ConsoleReqExtra<'_>, + extra: &ConsoleReqExtra, client: &mut stream::PqStream>, allow_cleartext: bool, config: &'static AuthenticationConfig, @@ -387,7 +387,7 @@ impl<'a> BackendType<'a, ClientCredentials> { impl BackendType<'_, ComputeUserInfo> { pub async fn get_allowed_ips( &self, - extra: &ConsoleReqExtra<'_>, + extra: &ConsoleReqExtra, ) -> Result>, GetAuthInfoError> { use BackendType::*; match self { @@ -404,7 +404,7 @@ impl BackendType<'_, ComputeUserInfo> { /// The link auth flow doesn't support this, so we return [`None`] in that case. pub async fn wake_compute( &self, - extra: &ConsoleReqExtra<'_>, + extra: &ConsoleReqExtra, ) -> Result, console::errors::WakeComputeError> { use BackendType::*; diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs index b0a73fd03d..deab966d9e 100644 --- a/proxy/src/console/provider.rs +++ b/proxy/src/console/provider.rs @@ -196,15 +196,15 @@ pub mod errors { } /// Extra query params we'd like to pass to the console. -pub struct ConsoleReqExtra<'a> { +pub struct ConsoleReqExtra { /// A unique identifier for a connection. pub session_id: uuid::Uuid, /// Name of client application, if set. - pub application_name: Option<&'a str>, + pub application_name: String, pub options: Vec<(String, String)>, } -impl<'a> ConsoleReqExtra<'a> { +impl ConsoleReqExtra { // https://swagger.io/docs/specification/serialization/ DeepObject format // paramName[prop1]=value1¶mName[prop2]=value2&.... pub fn options_as_deep_object(&self) -> Vec<(String, String)> { @@ -259,20 +259,20 @@ pub trait Api { /// Get the client's auth secret for authentication. async fn get_auth_info( &self, - extra: &ConsoleReqExtra<'_>, + extra: &ConsoleReqExtra, creds: &ComputeUserInfo, ) -> Result; async fn get_allowed_ips( &self, - extra: &ConsoleReqExtra<'_>, + extra: &ConsoleReqExtra, creds: &ComputeUserInfo, ) -> Result>, errors::GetAuthInfoError>; /// Wake up the compute node and return the corresponding connection info. async fn wake_compute( &self, - extra: &ConsoleReqExtra<'_>, + extra: &ConsoleReqExtra, creds: &ComputeUserInfo, ) -> Result; } diff --git a/proxy/src/console/provider/mock.rs b/proxy/src/console/provider/mock.rs index 8aad8c06bc..c464b4daf2 100644 --- a/proxy/src/console/provider/mock.rs +++ b/proxy/src/console/provider/mock.rs @@ -144,7 +144,7 @@ impl super::Api for Api { #[tracing::instrument(skip_all)] async fn get_auth_info( &self, - _extra: &ConsoleReqExtra<'_>, + _extra: &ConsoleReqExtra, creds: &ComputeUserInfo, ) -> Result { self.do_get_auth_info(creds).await @@ -152,7 +152,7 @@ impl super::Api for Api { async fn get_allowed_ips( &self, - _extra: &ConsoleReqExtra<'_>, + _extra: &ConsoleReqExtra, creds: &ComputeUserInfo, ) -> Result>, GetAuthInfoError> { Ok(Arc::new(self.do_get_auth_info(creds).await?.allowed_ips)) @@ -161,7 +161,7 @@ impl super::Api for Api { #[tracing::instrument(skip_all)] async fn wake_compute( &self, - _extra: &ConsoleReqExtra<'_>, + _extra: &ConsoleReqExtra, _creds: &ComputeUserInfo, ) -> Result { self.do_wake_compute() diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs index f8c3ee5b58..192252a0df 100644 --- a/proxy/src/console/provider/neon.rs +++ b/proxy/src/console/provider/neon.rs @@ -48,7 +48,7 @@ impl Api { async fn do_get_auth_info( &self, - extra: &ConsoleReqExtra<'_>, + extra: &ConsoleReqExtra, creds: &ComputeUserInfo, ) -> Result { let request_id = uuid::Uuid::new_v4().to_string(); @@ -60,9 +60,9 @@ impl Api { .header("Authorization", format!("Bearer {}", &self.jwt)) .query(&[("session_id", extra.session_id)]) .query(&[ - ("application_name", extra.application_name), - ("project", Some(&creds.endpoint)), - ("role", Some(&creds.inner.user)), + ("application_name", extra.application_name.as_str()), + ("project", creds.endpoint.as_str()), + ("role", creds.inner.user.as_str()), ]) .build()?; @@ -101,7 +101,7 @@ impl Api { async fn do_wake_compute( &self, - extra: &ConsoleReqExtra<'_>, + extra: &ConsoleReqExtra, creds: &ComputeUserInfo, ) -> Result { let request_id = uuid::Uuid::new_v4().to_string(); @@ -113,8 +113,8 @@ impl Api { .header("Authorization", format!("Bearer {}", &self.jwt)) .query(&[("session_id", extra.session_id)]) .query(&[ - ("application_name", extra.application_name), - ("project", Some(&creds.endpoint)), + ("application_name", extra.application_name.as_str()), + ("project", creds.endpoint.as_str()), ]); request_builder = if extra.options.is_empty() { @@ -161,7 +161,7 @@ impl super::Api for Api { #[tracing::instrument(skip_all)] async fn get_auth_info( &self, - extra: &ConsoleReqExtra<'_>, + extra: &ConsoleReqExtra, creds: &ComputeUserInfo, ) -> Result { self.do_get_auth_info(extra, creds).await @@ -169,7 +169,7 @@ impl super::Api for Api { async fn get_allowed_ips( &self, - extra: &ConsoleReqExtra<'_>, + extra: &ConsoleReqExtra, creds: &ComputeUserInfo, ) -> Result>, GetAuthInfoError> { let key: &str = &creds.endpoint; @@ -192,7 +192,7 @@ impl super::Api for Api { #[tracing::instrument(skip_all)] async fn wake_compute( &self, - extra: &ConsoleReqExtra<'_>, + extra: &ConsoleReqExtra, creds: &ComputeUserInfo, ) -> Result { let key: &str = &creds.inner.cache_key; diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 018f774c7e..152c894ca9 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -671,7 +671,7 @@ fn report_error(e: &WakeComputeError, retry: bool) { pub async fn connect_to_compute( mechanism: &M, mut node_info: console::CachedNodeInfo, - extra: &console::ConsoleReqExtra<'_>, + extra: &console::ConsoleReqExtra, creds: &auth::BackendType<'_, auth::backend::ComputeUserInfo>, mut latency_timer: LatencyTimer, ) -> Result @@ -968,13 +968,17 @@ impl Client<'_, S> { allow_self_signed_compute, } = self; + let proto = mode.protocol_label(); let extra = console::ConsoleReqExtra { session_id, // aka this connection's id - application_name: params.get("application_name"), + application_name: format!( + "{}/{}", + params.get("application_name").unwrap_or_default(), + proto + ), options: neon_options(params), }; - - let mut latency_timer = LatencyTimer::new(mode.protocol_label()); + let mut latency_timer = LatencyTimer::new(proto); let user = creds.get_user().to_owned(); let auth_result = match creds @@ -1012,7 +1016,6 @@ impl Client<'_, S> { .or_else(|e| stream.throw_error(e)) .await?; - let proto = mode.protocol_label(); NUM_DB_CONNECTIONS_OPENED_COUNTER .with_label_values(&[proto]) .inc(); diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs index 31c3ad1055..4691abbfb9 100644 --- a/proxy/src/proxy/tests.rs +++ b/proxy/src/proxy/tests.rs @@ -484,13 +484,13 @@ fn helper_create_connect_info( mechanism: &TestConnectMechanism, ) -> ( CachedNodeInfo, - console::ConsoleReqExtra<'static>, + console::ConsoleReqExtra, auth::BackendType<'_, ComputeUserInfo>, ) { let cache = helper_create_cached_node_info(); let extra = console::ConsoleReqExtra { session_id: uuid::Uuid::new_v4(), - application_name: Some("TEST"), + application_name: "TEST".into(), options: vec![], }; let creds = auth::BackendType::Test(mechanism); diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index 734df11368..4f3b31b9be 100644 --- a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -37,7 +37,7 @@ use crate::proxy::ConnectMechanism; use tracing::{error, warn, Span}; use tracing::{info, info_span, Instrument}; -pub const APP_NAME: &str = "sql_over_http"; +pub const APP_NAME: &str = "/sql_over_http"; const MAX_CONNS_PER_ENDPOINT: usize = 20; #[derive(Debug, Clone)] @@ -432,7 +432,7 @@ async fn connect_to_compute( let extra = console::ConsoleReqExtra { session_id: uuid::Uuid::new_v4(), - application_name: Some(APP_NAME), + application_name: APP_NAME.to_string(), options: console_options, }; // TODO(anna): this is a bit hacky way, consider using console notification listener. From 8bb4a13192c853cad2ab46a50e351324563e1ce2 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Tue, 12 Dec 2023 14:23:45 +0200 Subject: [PATCH 12/18] Do not materialize null images in PS (#5979) ## Problem PG16 is writing null images during relation extension. And page server implements optimisation which replace WAL record with FPI with page image. So instead of WAL record ~30 bytes we store 8kb null page image. Ans this image is almost useless, because most likely it will be shortly rewritten with actual page content. ## Summary of changes Do not materialize wal records with null page FPI. ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist Co-authored-by: Konstantin Knizhnik --- pageserver/src/walingest.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 75b29a2fed..738216afa5 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -458,8 +458,10 @@ impl<'a> WalIngest<'a> { && decoded.xl_rmid == pg_constants::RM_XLOG_ID && (decoded.xl_info == pg_constants::XLOG_FPI || decoded.xl_info == pg_constants::XLOG_FPI_FOR_HINT) - // compression of WAL is not yet supported: fall back to storing the original WAL record + // compression of WAL is not yet supported: fall back to storing the original WAL record && !postgres_ffi::bkpimage_is_compressed(blk.bimg_info, self.timeline.pg_version)? + // do not materialize null pages because them most likely be soon replaced with real data + && blk.bimg_len != 0 { // Extract page image from FPI record let img_len = blk.bimg_len as usize; From aec1acdbacd760833935f1140e245c0533182c6f Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Tue, 12 Dec 2023 14:24:21 +0200 Subject: [PATCH 13/18] Do not inherite replication slots in branch (#5898) ## Problem See https://github.com/neondatabase/company_projects/issues/111 https://neondb.slack.com/archives/C03H1K0PGKH/p1700166126954079 ## Summary of changes Do not search for AUX_FILES_KEY in parent timelines ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --------- Co-authored-by: Konstantin Knizhnik Co-authored-by: Arseny Sher --- pageserver/src/pgdatadir_mapping.rs | 25 +++++++++++------ pageserver/src/tenant/timeline.rs | 4 +-- .../regress/test_logical_replication.py | 27 +++++++++++++++++++ 3 files changed, 46 insertions(+), 10 deletions(-) diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index c653f0b7ea..b81037ae47 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -822,10 +822,7 @@ impl<'a> DatadirModification<'a> { self.put(DBDIR_KEY, Value::Image(buf.into())); // Create AuxFilesDirectory - let buf = AuxFilesDirectory::ser(&AuxFilesDirectory { - files: HashMap::new(), - })?; - self.put(AUX_FILES_KEY, Value::Image(Bytes::from(buf))); + self.init_aux_dir()?; let buf = TwoPhaseDirectory::ser(&TwoPhaseDirectory { xids: HashSet::new(), @@ -933,10 +930,7 @@ impl<'a> DatadirModification<'a> { self.put(DBDIR_KEY, Value::Image(buf.into())); // Create AuxFilesDirectory as well - let buf = AuxFilesDirectory::ser(&AuxFilesDirectory { - files: HashMap::new(), - })?; - self.put(AUX_FILES_KEY, Value::Image(Bytes::from(buf))); + self.init_aux_dir()?; } if r.is_none() { // Create RelDirectory @@ -1261,6 +1255,14 @@ impl<'a> DatadirModification<'a> { Ok(()) } + pub fn init_aux_dir(&mut self) -> anyhow::Result<()> { + let buf = AuxFilesDirectory::ser(&AuxFilesDirectory { + files: HashMap::new(), + })?; + self.put(AUX_FILES_KEY, Value::Image(Bytes::from(buf))); + Ok(()) + } + pub async fn put_file( &mut self, path: &str, @@ -1767,6 +1769,13 @@ const AUX_FILES_KEY: Key = Key { // Reverse mappings for a few Keys. // These are needed by WAL redo manager. +// AUX_FILES currently stores only data for logical replication (slots etc), and +// we don't preserve these on a branch because safekeepers can't follow timeline +// switch (and generally it likely should be optional), so ignore these. +pub fn is_inherited_key(key: Key) -> bool { + key != AUX_FILES_KEY +} + pub fn key_to_rel_block(key: Key) -> anyhow::Result<(RelTag, BlockNumber)> { Ok(match key.field1 { 0x00 => ( diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index f3907a6d2b..81dbc04793 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -66,7 +66,7 @@ use crate::metrics::{ TimelineMetrics, MATERIALIZED_PAGE_CACHE_HIT, MATERIALIZED_PAGE_CACHE_HIT_DIRECT, }; use crate::pgdatadir_mapping::LsnForTimestamp; -use crate::pgdatadir_mapping::{is_rel_fsm_block_key, is_rel_vm_block_key}; +use crate::pgdatadir_mapping::{is_inherited_key, is_rel_fsm_block_key, is_rel_vm_block_key}; use crate::pgdatadir_mapping::{BlockNumber, CalculateLogicalSizeError}; use crate::tenant::config::{EvictionPolicy, TenantConfOpt}; use pageserver_api::reltag::RelTag; @@ -2278,7 +2278,7 @@ impl Timeline { } // Recurse into ancestor if needed - if Lsn(cont_lsn.0 - 1) <= timeline.ancestor_lsn { + if is_inherited_key(key) && Lsn(cont_lsn.0 - 1) <= timeline.ancestor_lsn { trace!( "going into ancestor {}, cont_lsn is {}", timeline.ancestor_lsn, diff --git a/test_runner/regress/test_logical_replication.py b/test_runner/regress/test_logical_replication.py index d2d8d71e3f..51e358e60d 100644 --- a/test_runner/regress/test_logical_replication.py +++ b/test_runner/regress/test_logical_replication.py @@ -236,3 +236,30 @@ def test_wal_page_boundary_start(neon_simple_env: NeonEnv, vanilla_pg): assert vanilla_pg.safe_psql( "select sum(somedata) from replication_example" ) == endpoint.safe_psql("select sum(somedata) from replication_example") + + +# +# Check that slots are not inherited in brnach +# +def test_slots_and_branching(neon_simple_env: NeonEnv): + env = neon_simple_env + + tenant, timeline = env.neon_cli.create_tenant() + env.pageserver.http_client() + + main_branch = env.endpoints.create_start("main", tenant_id=tenant) + main_cur = main_branch.connect().cursor() + + # Create table and insert some data + main_cur.execute("select pg_create_logical_replication_slot('my_slot', 'pgoutput')") + + wait_for_last_flush_lsn(env, main_branch, tenant, timeline) + + # Create branch ws. + env.neon_cli.create_branch("ws", "main", tenant_id=tenant) + ws_branch = env.endpoints.create_start("ws", tenant_id=tenant) + log.info("postgres is running on 'ws' branch") + + # Check that we can create slot with the same name + ws_cur = ws_branch.connect().cursor() + ws_cur.execute("select pg_create_logical_replication_slot('my_slot', 'pgoutput')") From 6acbee23680317b317bc87dc86b504fcabaa8fd3 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Tue, 12 Dec 2023 16:24:13 +0000 Subject: [PATCH 14/18] test_runner: add `from_repo_dir` method (#6087) ## Problem We need a reliable way to restore a project state (in this context, I mean data on pageservers, safekeepers, and remote storage) from a snapshot. The existing method (that we use in `test_compatibility`) heavily relies on config files, which makes it harder to add/change fields in the config. The proposed solution uses config file only to get `default_tenant_id` and `branch_name_mappings`. ## Summary of changes - Add `NeonEnvBuilder#from_repo_dir` method, which allows using the `neon_env_builder` fixture with data from a snapshot. - Use `NeonEnvBuilder#from_repo_dir` in compatibility tests Requires for https://github.com/neondatabase/neon/issues/6033 --- test_runner/fixtures/neon_fixtures.py | 60 +++++ test_runner/regress/test_compatibility.py | 264 ++++------------------ 2 files changed, 101 insertions(+), 223 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index fb6cea5713..4b23650960 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -507,6 +507,66 @@ class NeonEnvBuilder: return env + def from_repo_dir( + self, + repo_dir: Path, + neon_binpath: Optional[Path] = None, + pg_distrib_dir: Optional[Path] = None, + ) -> NeonEnv: + """ + A simple method to import data into the current NeonEnvBuilder from a snapshot of a repo dir. + """ + + # Setting custom `neon_binpath` and `pg_distrib_dir` is useful for compatibility tests + self.neon_binpath = neon_binpath or self.neon_binpath + self.pg_distrib_dir = pg_distrib_dir or self.pg_distrib_dir + + # Get the initial tenant and timeline from the snapshot config + snapshot_config_toml = repo_dir / "config" + with snapshot_config_toml.open("r") as f: + snapshot_config = toml.load(f) + + self.initial_tenant = TenantId(snapshot_config["default_tenant_id"]) + self.initial_timeline = TimelineId( + dict(snapshot_config["branch_name_mappings"][DEFAULT_BRANCH_NAME])[ + str(self.initial_tenant) + ] + ) + self.env = self.init_configs() + + for ps_dir in repo_dir.glob("pageserver_*"): + tenants_from_dir = ps_dir / "tenants" + tenants_to_dir = self.repo_dir / ps_dir.name / "tenants" + + log.info(f"Copying pageserver tenants directory {tenants_from_dir} to {tenants_to_dir}") + shutil.copytree(tenants_from_dir, tenants_to_dir) + + for sk_from_dir in (repo_dir / "safekeepers").glob("sk*"): + sk_to_dir = self.repo_dir / "safekeepers" / sk_from_dir.name + log.info(f"Copying safekeeper directory {sk_from_dir} to {sk_to_dir}") + sk_to_dir.rmdir() + shutil.copytree(sk_from_dir, sk_to_dir, ignore=shutil.ignore_patterns("*.log", "*.pid")) + + shutil.rmtree(self.repo_dir / "local_fs_remote_storage", ignore_errors=True) + shutil.copytree( + repo_dir / "local_fs_remote_storage", self.repo_dir / "local_fs_remote_storage" + ) + + if (attachments_json := Path(repo_dir / "attachments.json")).exists(): + shutil.copyfile(attachments_json, self.repo_dir / attachments_json.name) + + # Update the config with info about tenants and timelines + with (self.repo_dir / "config").open("r") as f: + config = toml.load(f) + + config["default_tenant_id"] = snapshot_config["default_tenant_id"] + config["branch_name_mappings"] = snapshot_config["branch_name_mappings"] + + with (self.repo_dir / "config").open("w") as f: + toml.dump(config, f) + + return self.env + def enable_scrub_on_exit(self): """ Call this if you would like the fixture to automatically run diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index 35963c0d41..3f5de100fd 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -1,30 +1,25 @@ -import copy import os import shutil import subprocess import tempfile from pathlib import Path -from typing import Any, List, Optional +from typing import List, Optional import pytest import toml -from fixtures.log_helper import log from fixtures.neon_fixtures import ( - NeonCli, + NeonEnv, NeonEnvBuilder, PgBin, ) -from fixtures.pageserver.http import PageserverHttpClient from fixtures.pageserver.utils import ( timeline_delete_wait_completed, wait_for_last_record_lsn, wait_for_upload, ) from fixtures.pg_version import PgVersion -from fixtures.port_distributor import PortDistributor -from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind, RemoteStorageUser +from fixtures.remote_storage import RemoteStorageKind from fixtures.types import Lsn -from pytest import FixtureRequest # # A test suite that help to prevent unintentionally breaking backward or forward compatibility between Neon releases. @@ -37,8 +32,8 @@ from pytest import FixtureRequest # If the breakage is intentional, the test can be xfaild with setting ALLOW_FORWARD_COMPATIBILITY_BREAKAGE=true. # # The file contains a couple of helper functions: -# - prepare_snapshot copies the snapshot, cleans it up and makes it ready for the current version of Neon (replaces paths and ports in config files). # - check_neon_works performs the test itself, feel free to add more checks there. +# - dump_differs compares two SQL dumps and writes the diff to a file. # # # How to run `test_backward_compatibility` locally: @@ -46,6 +41,7 @@ from pytest import FixtureRequest # export DEFAULT_PG_VERSION=15 # export BUILD_TYPE=release # export CHECK_ONDISK_DATA_COMPATIBILITY=true +# export COMPATIBILITY_SNAPSHOT_DIR=test_output/compatibility_snapshot_pgv${DEFAULT_PG_VERSION} # # # Build previous version of binaries and create a data snapshot: # rm -rf pg_install target @@ -59,8 +55,7 @@ from pytest import FixtureRequest # CARGO_BUILD_FLAGS="--features=testing" make -s -j`nproc` # # # Run backward compatibility test -# COMPATIBILITY_SNAPSHOT_DIR=test_output/compatibility_snapshot_pgv${DEFAULT_PG_VERSION} \ -# ./scripts/pytest -k test_backward_compatibility +# ./scripts/pytest -k test_backward_compatibility # # # How to run `test_forward_compatibility` locally: @@ -68,6 +63,8 @@ from pytest import FixtureRequest # export DEFAULT_PG_VERSION=15 # export BUILD_TYPE=release # export CHECK_ONDISK_DATA_COMPATIBILITY=true +# export COMPATIBILITY_NEON_BIN=neon_previous/target/${BUILD_TYPE} +# export COMPATIBILITY_POSTGRES_DISTRIB_DIR=neon_previous/pg_install # # # Build previous version of binaries and store them somewhere: # rm -rf pg_install target @@ -84,9 +81,7 @@ from pytest import FixtureRequest # ./scripts/pytest -k test_create_snapshot # # # Run forward compatibility test -# COMPATIBILITY_NEON_BIN=neon_previous/target/${BUILD_TYPE} \ -# COMPATIBILITY_POSTGRES_DISTRIB_DIR=neon_previous/pg_install \ -# ./scripts/pytest -k test_forward_compatibility +# ./scripts/pytest -k test_forward_compatibility # check_ondisk_data_compatibility_if_enabled = pytest.mark.skipif( @@ -155,13 +150,9 @@ def test_create_snapshot( @pytest.mark.xdist_group("compatibility") @pytest.mark.order(after="test_create_snapshot") def test_backward_compatibility( - pg_bin: PgBin, - port_distributor: PortDistributor, + neon_env_builder: NeonEnvBuilder, test_output_dir: Path, - neon_binpath: Path, - pg_distrib_dir: Path, pg_version: PgVersion, - request: FixtureRequest, ): """ Test that the new binaries can read old data @@ -177,23 +168,15 @@ def test_backward_compatibility( ) try: - # Copy the snapshot to current directory, and prepare for the test - prepare_snapshot( - from_dir=compatibility_snapshot_dir, - to_dir=test_output_dir / "compatibility_snapshot", - port_distributor=port_distributor, - ) + neon_env_builder.num_safekeepers = 3 + env = neon_env_builder.from_repo_dir(compatibility_snapshot_dir / "repo") + neon_env_builder.start() check_neon_works( - test_output_dir / "compatibility_snapshot" / "repo", - neon_binpath, - neon_binpath, - pg_distrib_dir, - pg_version, - port_distributor, - test_output_dir, - pg_bin, - request, + env, + test_output_dir=test_output_dir, + sql_dump_path=compatibility_snapshot_dir / "dump.sql", + repo_dir=env.repo_dir, ) except Exception: if breaking_changes_allowed: @@ -212,12 +195,10 @@ def test_backward_compatibility( @pytest.mark.xdist_group("compatibility") @pytest.mark.order(after="test_create_snapshot") def test_forward_compatibility( + neon_env_builder: NeonEnvBuilder, test_output_dir: Path, top_output_dir: Path, - port_distributor: PortDistributor, pg_version: PgVersion, - request: FixtureRequest, - neon_binpath: Path, ): """ Test that the old binaries can read new data @@ -244,24 +225,19 @@ def test_forward_compatibility( ) try: - # Copy the snapshot to current directory, and prepare for the test - prepare_snapshot( - from_dir=compatibility_snapshot_dir, - to_dir=test_output_dir / "compatibility_snapshot", - port_distributor=port_distributor, + neon_env_builder.num_safekeepers = 3 + env = neon_env_builder.from_repo_dir( + compatibility_snapshot_dir / "repo", + neon_binpath=compatibility_neon_bin, pg_distrib_dir=compatibility_postgres_distrib_dir, ) + neon_env_builder.start() check_neon_works( - test_output_dir / "compatibility_snapshot" / "repo", - compatibility_neon_bin, - neon_binpath, - compatibility_postgres_distrib_dir, - pg_version, - port_distributor, - test_output_dir, - PgBin(test_output_dir, compatibility_postgres_distrib_dir, pg_version), - request, + env, + test_output_dir=test_output_dir, + sql_dump_path=compatibility_snapshot_dir / "dump.sql", + repo_dir=env.repo_dir, ) except Exception: if breaking_changes_allowed: @@ -276,189 +252,26 @@ def test_forward_compatibility( ), "Breaking changes are allowed by ALLOW_FORWARD_COMPATIBILITY_BREAKAGE, but the test has passed without any breakage" -def prepare_snapshot( - from_dir: Path, - to_dir: Path, - port_distributor: PortDistributor, - pg_distrib_dir: Optional[Path] = None, -): - assert from_dir.exists(), f"Snapshot '{from_dir}' doesn't exist" - assert (from_dir / "repo").exists(), f"Snapshot '{from_dir}' doesn't contain a repo directory" - assert (from_dir / "dump.sql").exists(), f"Snapshot '{from_dir}' doesn't contain a dump.sql" +def check_neon_works(env: NeonEnv, test_output_dir: Path, sql_dump_path: Path, repo_dir: Path): + ep = env.endpoints.create_start("main") + pg_bin = PgBin(test_output_dir, env.pg_distrib_dir, env.pg_version) - log.info(f"Copying snapshot from {from_dir} to {to_dir}") - shutil.copytree(from_dir, to_dir) - - repo_dir = to_dir / "repo" - - snapshot_config_toml = repo_dir / "config" - snapshot_config = toml.load(snapshot_config_toml) - - # Remove old logs to avoid confusion in test artifacts - for logfile in repo_dir.glob("**/*.log"): - logfile.unlink() - - # Remove old computes in 'endpoints'. Old versions of the control plane used a directory - # called "pgdatadirs". Delete it, too. - if (repo_dir / "endpoints").exists(): - shutil.rmtree(repo_dir / "endpoints") - if (repo_dir / "pgdatadirs").exists(): - shutil.rmtree(repo_dir / "pgdatadirs") - os.mkdir(repo_dir / "endpoints") - - # Update paths and ports in config files - legacy_pageserver_toml = repo_dir / "pageserver.toml" - legacy_bundle = os.path.exists(legacy_pageserver_toml) - - path_to_config: dict[Path, dict[Any, Any]] = {} - if legacy_bundle: - os.mkdir(repo_dir / "pageserver_1") - path_to_config[repo_dir / "pageserver_1" / "pageserver.toml"] = toml.load( - legacy_pageserver_toml - ) - os.remove(legacy_pageserver_toml) - os.rename(repo_dir / "tenants", repo_dir / "pageserver_1" / "tenants") - else: - for ps_conf in snapshot_config["pageservers"]: - config_path = repo_dir / f"pageserver_{ps_conf['id']}" / "pageserver.toml" - path_to_config[config_path] = toml.load(config_path) - - # For each pageserver config, edit it and rewrite - for config_path, pageserver_config in path_to_config.items(): - pageserver_config["remote_storage"]["local_path"] = str( - LocalFsStorage.component_path(repo_dir, RemoteStorageUser.PAGESERVER) - ) - - for param in ("listen_http_addr", "listen_pg_addr", "broker_endpoint"): - pageserver_config[param] = port_distributor.replace_with_new_port( - pageserver_config[param] - ) - - # We don't use authentication in compatibility tests - # so just remove authentication related settings. - pageserver_config.pop("pg_auth_type", None) - pageserver_config.pop("http_auth_type", None) - - if pg_distrib_dir: - pageserver_config["pg_distrib_dir"] = str(pg_distrib_dir) - - with config_path.open("w") as f: - toml.dump(pageserver_config, f) - - # neon_local config doesn't have to be backward compatible. If we're using a dump from before - # it supported multiple pageservers, fix it up. - if "pageservers" not in snapshot_config: - snapshot_config["pageservers"] = [snapshot_config["pageserver"]] - del snapshot_config["pageserver"] - - for param in ("listen_http_addr", "listen_pg_addr"): - for pageserver in snapshot_config["pageservers"]: - pageserver[param] = port_distributor.replace_with_new_port(pageserver[param]) - snapshot_config["broker"]["listen_addr"] = port_distributor.replace_with_new_port( - snapshot_config["broker"]["listen_addr"] - ) - for sk in snapshot_config["safekeepers"]: - for param in ("http_port", "pg_port", "pg_tenant_only_port"): - sk[param] = port_distributor.replace_with_new_port(sk[param]) - - if pg_distrib_dir: - snapshot_config["pg_distrib_dir"] = str(pg_distrib_dir) - - with snapshot_config_toml.open("w") as f: - toml.dump(snapshot_config, f) - - # Ensure that snapshot doesn't contain references to the original path - rv = subprocess.run( - [ - "grep", - "--recursive", - "--binary-file=without-match", - "--files-with-matches", - "test_create_snapshot/repo", - str(repo_dir), - ], - capture_output=True, - text=True, - ) - assert ( - rv.returncode != 0 - ), f"there're files referencing `test_create_snapshot/repo`, this path should be replaced with {repo_dir}:\n{rv.stdout}" - - -def check_neon_works( - repo_dir: Path, - neon_target_binpath: Path, - neon_current_binpath: Path, - pg_distrib_dir: Path, - pg_version: PgVersion, - port_distributor: PortDistributor, - test_output_dir: Path, - pg_bin: PgBin, - request: FixtureRequest, -): - snapshot_config_toml = repo_dir / "config" - snapshot_config = toml.load(snapshot_config_toml) - snapshot_config["neon_distrib_dir"] = str(neon_target_binpath) - snapshot_config["postgres_distrib_dir"] = str(pg_distrib_dir) - with (snapshot_config_toml).open("w") as f: - toml.dump(snapshot_config, f) - - # TODO: replace with NeonEnvBuilder / NeonEnv - config: Any = type("NeonEnvStub", (object,), {}) - config.rust_log_override = None - config.repo_dir = repo_dir - config.pg_version = pg_version - config.initial_tenant = snapshot_config["default_tenant_id"] - config.pg_distrib_dir = pg_distrib_dir - config.remote_storage = None - config.safekeepers_remote_storage = None - - # Use the "target" binaries to launch the storage nodes - config_target = config - config_target.neon_binpath = neon_target_binpath - # We are using maybe-old binaries for neon services, but want to use current - # binaries for test utilities like neon_local - config_target.neon_local_binpath = neon_current_binpath - cli_target = NeonCli(config_target) - - # And the current binaries to launch computes - snapshot_config["neon_distrib_dir"] = str(neon_current_binpath) - with (snapshot_config_toml).open("w") as f: - toml.dump(snapshot_config, f) - config_current = copy.copy(config) - config_current.neon_binpath = neon_current_binpath - cli_current = NeonCli(config_current) - - cli_target.raw_cli(["start"]) - request.addfinalizer(lambda: cli_target.raw_cli(["stop"])) - - pg_port = port_distributor.get_port() - http_port = port_distributor.get_port() - cli_current.endpoint_create( - branch_name="main", pg_port=pg_port, http_port=http_port, endpoint_id="ep-main" - ) - cli_current.endpoint_start("ep-main") - request.addfinalizer(lambda: cli_current.endpoint_stop("ep-main")) - - connstr = f"host=127.0.0.1 port={pg_port} user=cloud_admin dbname=postgres" + connstr = ep.connstr() pg_bin.run_capture( ["pg_dumpall", f"--dbname={connstr}", f"--file={test_output_dir / 'dump.sql'}"] ) initial_dump_differs = dump_differs( - repo_dir.parent / "dump.sql", + sql_dump_path, test_output_dir / "dump.sql", test_output_dir / "dump.filediff", ) # Check that project can be recovered from WAL # loosely based on https://www.notion.so/neondatabase/Storage-Recovery-from-WAL-d92c0aac0ebf40df892b938045d7d720 - tenant_id = snapshot_config["default_tenant_id"] - timeline_id = dict(snapshot_config["branch_name_mappings"]["main"])[tenant_id] - pageserver_port = snapshot_config["pageservers"][0]["listen_http_addr"].split(":")[-1] - pageserver_http = PageserverHttpClient( - port=pageserver_port, - is_testing_enabled_or_skip=lambda: True, # TODO: check if testing really enabled - ) + pageserver_http = env.pageserver.http_client() + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + pg_version = env.pg_version shutil.rmtree(repo_dir / "local_fs_remote_storage") timeline_delete_wait_completed(pageserver_http, tenant_id, timeline_id) @@ -494,6 +307,11 @@ def dump_differs( Returns True if the dumps differ and produced diff is not allowed, False otherwise (in most cases we want it to return False). """ + if not first.exists(): + raise FileNotFoundError(f"{first} doesn't exist") + if not second.exists(): + raise FileNotFoundError(f"{second} doesn't exist") + with output.open("w") as stdout: res = subprocess.run( [ From dfb0a6fdaf8eecbfacf5b1d9a5dc62ec26bd64c9 Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 12 Dec 2023 16:53:08 +0000 Subject: [PATCH 15/18] scrubber: handle initdb files, fix an issue with prefixes (#6079) - The code for calculating the prefix in the bucket was expecting a trailing slash (as it is in the tests), but that's an awkward expectation to impose for use in the field: make the code more flexible by only trimming a trailing character if it is indeed a slash. - initdb archives were detected by the scrubber as malformed layer files. Teach it to recognize and ignore them. --- s3_scrubber/src/checks.rs | 22 +++++++++++++++++++++- s3_scrubber/src/lib.rs | 4 +++- s3_scrubber/src/main.rs | 13 ++++++++++++- s3_scrubber/src/scan_metadata.rs | 4 ++++ 4 files changed, 40 insertions(+), 3 deletions(-) diff --git a/s3_scrubber/src/checks.rs b/s3_scrubber/src/checks.rs index 510a128663..a15a908212 100644 --- a/s3_scrubber/src/checks.rs +++ b/s3_scrubber/src/checks.rs @@ -142,7 +142,9 @@ pub(crate) async fn branch_cleanup_and_check_errors( .collect(); if !orphan_layers.is_empty() { - result.errors.push(format!( + // An orphan layer is not an error: it's arguably not even a warning, but it is helpful to report + // these as a hint that there is something worth cleaning up here. + result.warnings.push(format!( "index_part.json does not contain layers from S3: {:?}", orphan_layers .iter() @@ -170,6 +172,7 @@ pub(crate) async fn branch_cleanup_and_check_errors( )); } } + BlobDataParseResult::Relic => {} BlobDataParseResult::Incorrect(parse_errors) => result.errors.extend( parse_errors .into_iter() @@ -215,6 +218,8 @@ pub(crate) enum BlobDataParseResult { index_part_generation: Generation, s3_layers: HashSet<(LayerFileName, Generation)>, }, + /// The remains of a deleted Timeline (i.e. an initdb archive only) + Relic, Incorrect(Vec), } @@ -245,6 +250,7 @@ pub(crate) async fn list_timeline_blobs( timeline_dir_target.delimiter = String::new(); let mut index_parts: Vec = Vec::new(); + let mut initdb_archive: bool = false; let stream = stream_listing(s3_client, &timeline_dir_target); pin_mut!(stream); @@ -258,6 +264,10 @@ pub(crate) async fn list_timeline_blobs( tracing::info!("Index key {key}"); index_parts.push(obj) } + Some("initdb.tar.zst") => { + tracing::info!("initdb archive {key}"); + initdb_archive = true; + } Some(maybe_layer_name) => match parse_layer_object_name(maybe_layer_name) { Ok((new_layer, gen)) => { tracing::info!("Parsed layer key: {} {:?}", new_layer, gen); @@ -279,6 +289,16 @@ pub(crate) async fn list_timeline_blobs( } } + if index_parts.is_empty() && s3_layers.is_empty() && initdb_archive { + tracing::info!( + "Timeline is empty apart from initdb archive: expected post-deletion state." + ); + return Ok(S3TimelineBlobData { + blob_data: BlobDataParseResult::Relic, + keys_to_remove: Vec::new(), + }); + } + // Choose the index_part with the highest generation let (index_part_object, index_part_generation) = match index_parts .iter() diff --git a/s3_scrubber/src/lib.rs b/s3_scrubber/src/lib.rs index e5465952fb..6607db21e6 100644 --- a/s3_scrubber/src/lib.rs +++ b/s3_scrubber/src/lib.rs @@ -86,7 +86,9 @@ impl S3Target { if new_self.prefix_in_bucket.is_empty() { new_self.prefix_in_bucket = format!("/{}/", new_segment); } else { - let _ = new_self.prefix_in_bucket.pop(); + if new_self.prefix_in_bucket.ends_with('/') { + new_self.prefix_in_bucket.pop(); + } new_self.prefix_in_bucket = [&new_self.prefix_in_bucket, new_segment, ""].join(&new_self.delimiter); } diff --git a/s3_scrubber/src/main.rs b/s3_scrubber/src/main.rs index 1f0ceebdaf..ef020edc2a 100644 --- a/s3_scrubber/src/main.rs +++ b/s3_scrubber/src/main.rs @@ -57,7 +57,7 @@ async fn main() -> anyhow::Result<()> { )); match cli.command { - Command::ScanMetadata { json } => match scan_metadata(bucket_config).await { + Command::ScanMetadata { json } => match scan_metadata(bucket_config.clone()).await { Err(e) => { tracing::error!("Failed: {e}"); Err(e) @@ -70,6 +70,17 @@ async fn main() -> anyhow::Result<()> { } if summary.is_fatal() { Err(anyhow::anyhow!("Fatal scrub errors detected")) + } else if summary.is_empty() { + // Strictly speaking an empty bucket is a valid bucket, but if someone ran the + // scrubber they were likely expecting to scan something, and if we see no timelines + // at all then it's likely due to some configuration issues like a bad prefix + Err(anyhow::anyhow!( + "No timelines found in bucket {} prefix {}", + bucket_config.bucket, + bucket_config + .prefix_in_bucket + .unwrap_or("".to_string()) + )) } else { Ok(()) } diff --git a/s3_scrubber/src/scan_metadata.rs b/s3_scrubber/src/scan_metadata.rs index ad82db1e76..228f8d6763 100644 --- a/s3_scrubber/src/scan_metadata.rs +++ b/s3_scrubber/src/scan_metadata.rs @@ -174,6 +174,10 @@ Timeline layer count: {6} pub fn is_fatal(&self) -> bool { !self.with_errors.is_empty() } + + pub fn is_empty(&self) -> bool { + self.count == 0 + } } /// Scan the pageserver metadata in an S3 bucket, reporting errors and statistics. From 5820faaa876c35b0504b1f5136d851437651a765 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Tue, 12 Dec 2023 18:00:37 +0100 Subject: [PATCH 16/18] Use extend instead of groups of append calls in tests (#6109) Repeated calls to `.append` don't line up as nicely as they might get formatted in different ways. Also, it is more characters and the lines might be longer. Saw this while working on #5912. --- test_runner/regress/test_auth.py | 11 ++-- test_runner/regress/test_branch_behind.py | 5 +- test_runner/regress/test_branching.py | 20 +++--- test_runner/regress/test_import.py | 13 ++-- .../test_pageserver_metric_collection.py | 28 ++++---- test_runner/regress/test_remote_storage.py | 25 +++---- test_runner/regress/test_tenant_delete.py | 14 ++-- test_runner/regress/test_tenant_detach.py | 12 ++-- test_runner/regress/test_tenant_relocation.py | 17 ++--- .../test_tenants_with_remote_storage.py | 23 ++++--- .../regress/test_threshold_based_eviction.py | 13 ++-- test_runner/regress/test_timeline_delete.py | 66 ++++++++++--------- 12 files changed, 133 insertions(+), 114 deletions(-) diff --git a/test_runner/regress/test_auth.py b/test_runner/regress/test_auth.py index 7487106c44..bd87ff3efd 100644 --- a/test_runner/regress/test_auth.py +++ b/test_runner/regress/test_auth.py @@ -92,8 +92,9 @@ def test_compute_auth_to_pageserver(neon_env_builder: NeonEnvBuilder): def test_pageserver_multiple_keys(neon_env_builder: NeonEnvBuilder): neon_env_builder.auth_enabled = True env = neon_env_builder.init_start() - env.pageserver.allowed_errors.append(".*Authentication error: InvalidSignature.*") - env.pageserver.allowed_errors.append(".*Unauthorized: malformed jwt token.*") + env.pageserver.allowed_errors.extend( + [".*Authentication error: InvalidSignature.*", ".*Unauthorized: malformed jwt token.*"] + ) pageserver_token_old = env.auth_keys.generate_pageserver_token() pageserver_http_client_old = env.pageserver.http_client(pageserver_token_old) @@ -145,9 +146,9 @@ def test_pageserver_multiple_keys(neon_env_builder: NeonEnvBuilder): def test_pageserver_key_reload(neon_env_builder: NeonEnvBuilder): neon_env_builder.auth_enabled = True env = neon_env_builder.init_start() - env.pageserver.allowed_errors.append(".*Authentication error: InvalidSignature.*") - env.pageserver.allowed_errors.append(".*Unauthorized: malformed jwt token.*") - + env.pageserver.allowed_errors.extend( + [".*Authentication error: InvalidSignature.*", ".*Unauthorized: malformed jwt token.*"] + ) pageserver_token_old = env.auth_keys.generate_pageserver_token() pageserver_http_client_old = env.pageserver.http_client(pageserver_token_old) diff --git a/test_runner/regress/test_branch_behind.py b/test_runner/regress/test_branch_behind.py index a19b2862f8..9879254897 100644 --- a/test_runner/regress/test_branch_behind.py +++ b/test_runner/regress/test_branch_behind.py @@ -14,8 +14,9 @@ def test_branch_behind(neon_env_builder: NeonEnvBuilder): neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}" env = neon_env_builder.init_start() - env.pageserver.allowed_errors.append(".*invalid branch start lsn.*") - env.pageserver.allowed_errors.append(".*invalid start lsn .* for ancestor timeline.*") + env.pageserver.allowed_errors.extend( + [".*invalid branch start lsn.*", ".*invalid start lsn .* for ancestor timeline.*"] + ) # Branch at the point where only 100 rows were inserted branch_behind_timeline_id = env.neon_cli.create_branch("test_branch_behind") diff --git a/test_runner/regress/test_branching.py b/test_runner/regress/test_branching.py index a908dd713a..82ca985d01 100644 --- a/test_runner/regress/test_branching.py +++ b/test_runner/regress/test_branching.py @@ -148,11 +148,11 @@ def test_cannot_create_endpoint_on_non_uploaded_timeline(neon_env_builder: NeonE env = neon_env_builder.init_configs() env.start() - env.pageserver.allowed_errors.append( - ".*request{method=POST path=/v1/tenant/.*/timeline request_id=.*}: request was dropped before completing.*" - ) - env.pageserver.allowed_errors.append( - ".*page_service_conn_main.*: query handler for 'basebackup .* is not active, state: Loading" + env.pageserver.allowed_errors.extend( + [ + ".*request{method=POST path=/v1/tenant/.*/timeline request_id=.*}: request was dropped before completing.*", + ".*page_service_conn_main.*: query handler for 'basebackup .* is not active, state: Loading", + ] ) ps_http = env.pageserver.http_client() @@ -247,11 +247,11 @@ def test_competing_branchings_from_loading_race_to_ok_or_err(neon_env_builder: N env = neon_env_builder.init_configs() env.start() - env.pageserver.allowed_errors.append( - ".*request{method=POST path=/v1/tenant/.*/timeline request_id=.*}: request was dropped before completing.*" - ) - env.pageserver.allowed_errors.append( - ".*Error processing HTTP request: InternalServerError\\(Timeline .*/.* already exists in pageserver's memory" + env.pageserver.allowed_errors.extend( + [ + ".*request{method=POST path=/v1/tenant/.*/timeline request_id=.*}: request was dropped before completing.*", + ".*Error processing HTTP request: InternalServerError\\(Timeline .*/.* already exists in pageserver's memory", + ] ) ps_http = env.pageserver.http_client() diff --git a/test_runner/regress/test_import.py b/test_runner/regress/test_import.py index 920e8d0b72..faedf5d944 100644 --- a/test_runner/regress/test_import.py +++ b/test_runner/regress/test_import.py @@ -99,12 +99,13 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build ] ) - # FIXME: we should clean up pageserver to not print this - env.pageserver.allowed_errors.append(".*exited with error: unexpected message type: CopyData.*") - - # FIXME: Is this expected? - env.pageserver.allowed_errors.append( - ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*" + env.pageserver.allowed_errors.extend( + [ + # FIXME: we should clean up pageserver to not print this + ".*exited with error: unexpected message type: CopyData.*", + # FIXME: Is this expected? + ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*", + ] ) def import_tar(base, wal): diff --git a/test_runner/regress/test_pageserver_metric_collection.py b/test_runner/regress/test_pageserver_metric_collection.py index b76dbbee03..042961baa5 100644 --- a/test_runner/regress/test_pageserver_metric_collection.py +++ b/test_runner/regress/test_pageserver_metric_collection.py @@ -64,13 +64,13 @@ def test_metric_collection( # spin up neon, after http server is ready env = neon_env_builder.init_start(initial_tenant_conf={"pitr_interval": "0 sec"}) # httpserver is shut down before pageserver during passing run - env.pageserver.allowed_errors.append(".*metrics endpoint refused the sent metrics*") - # we have a fast rate of calculation, these can happen at shutdown - env.pageserver.allowed_errors.append( - ".*synthetic_size_worker:calculate_synthetic_size.*:gather_size_inputs.*: failed to calculate logical size at .*: cancelled.*" - ) - env.pageserver.allowed_errors.append( - ".*synthetic_size_worker: failed to calculate synthetic size for tenant .*: failed to calculate some logical_sizes" + env.pageserver.allowed_errors.extend( + [ + ".*metrics endpoint refused the sent metrics*", + # we have a fast rate of calculation, these can happen at shutdown + ".*synthetic_size_worker:calculate_synthetic_size.*:gather_size_inputs.*: failed to calculate logical size at .*: cancelled.*", + ".*synthetic_size_worker: failed to calculate synthetic size for tenant .*: failed to calculate some logical_sizes", + ] ) tenant_id = env.initial_tenant @@ -212,13 +212,13 @@ def test_metric_collection_cleans_up_tempfile( pageserver_http = env.pageserver.http_client() # httpserver is shut down before pageserver during passing run - env.pageserver.allowed_errors.append(".*metrics endpoint refused the sent metrics*") - # we have a fast rate of calculation, these can happen at shutdown - env.pageserver.allowed_errors.append( - ".*synthetic_size_worker:calculate_synthetic_size.*:gather_size_inputs.*: failed to calculate logical size at .*: cancelled.*" - ) - env.pageserver.allowed_errors.append( - ".*synthetic_size_worker: failed to calculate synthetic size for tenant .*: failed to calculate some logical_sizes" + env.pageserver.allowed_errors.extend( + [ + ".*metrics endpoint refused the sent metrics*", + # we have a fast rate of calculation, these can happen at shutdown + ".*synthetic_size_worker:calculate_synthetic_size.*:gather_size_inputs.*: failed to calculate logical size at .*: cancelled.*", + ".*synthetic_size_worker: failed to calculate synthetic size for tenant .*: failed to calculate some logical_sizes", + ] ) tenant_id = env.initial_tenant diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index 0a5046e219..3004d69f50 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -73,19 +73,20 @@ def test_remote_storage_backup_and_restore( ##### First start, insert data and upload it to the remote storage env = neon_env_builder.init_start() - # FIXME: Is this expected? - env.pageserver.allowed_errors.append( - ".*marking .* as locally complete, while it doesnt exist in remote index.*" + env.pageserver.allowed_errors.extend( + [ + # FIXME: Is this expected? + ".*marking .* as locally complete, while it doesnt exist in remote index.*", + ".*No timelines to attach received.*", + ".*Failed to get local tenant state.*", + # FIXME retry downloads without throwing errors + ".*failed to load remote timeline.*", + # we have a bunch of pytest.raises for these below + ".*tenant .*? already exists, state:.*", + ".*tenant directory already exists.*", + ".*simulated failure of remote operation.*", + ] ) - env.pageserver.allowed_errors.append(".*No timelines to attach received.*") - - env.pageserver.allowed_errors.append(".*Failed to get local tenant state.*") - # FIXME retry downloads without throwing errors - env.pageserver.allowed_errors.append(".*failed to load remote timeline.*") - # we have a bunch of pytest.raises for these below - env.pageserver.allowed_errors.append(".*tenant .*? already exists, state:.*") - env.pageserver.allowed_errors.append(".*tenant directory already exists.*") - env.pageserver.allowed_errors.append(".*simulated failure of remote operation.*") pageserver_http = env.pageserver.http_client() endpoint = env.endpoints.create_start("main") diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py index 48f5682371..fece876459 100644 --- a/test_runner/regress/test_tenant_delete.py +++ b/test_runner/regress/test_tenant_delete.py @@ -395,13 +395,13 @@ def test_long_timeline_create_cancelled_by_tenant_delete(neon_env_builder: NeonE env.start() pageserver_http = env.pageserver.http_client() - # happens with the cancellation bailing flushing loop earlier, leaving disk_consistent_lsn at zero - env.pageserver.allowed_errors.append( - ".*Timeline got dropped without initializing, cleaning its files" - ) - # the response hit_pausable_failpoint_and_later_fail - env.pageserver.allowed_errors.append( - f".*Error processing HTTP request: InternalServerError\\(new timeline {env.initial_tenant}/{env.initial_timeline} has invalid disk_consistent_lsn" + env.pageserver.allowed_errors.extend( + [ + # happens with the cancellation bailing flushing loop earlier, leaving disk_consistent_lsn at zero + ".*Timeline got dropped without initializing, cleaning its files", + # the response hit_pausable_failpoint_and_later_fail + f".*Error processing HTTP request: InternalServerError\\(new timeline {env.initial_tenant}/{env.initial_timeline} has invalid disk_consistent_lsn", + ] ) env.pageserver.tenant_create(env.initial_tenant) diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py index 5b63bd6161..0dcbb23ad4 100644 --- a/test_runner/regress/test_tenant_detach.py +++ b/test_runner/regress/test_tenant_detach.py @@ -307,10 +307,14 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder): bogus_timeline_id = TimelineId.generate() pageserver_http.timeline_gc(tenant_id, bogus_timeline_id, 0) - # the error will be printed to the log too - env.pageserver.allowed_errors.append(".*gc target timeline does not exist.*") - # Timelines get stopped during detach, ignore the gc calls that error, witnessing that - env.pageserver.allowed_errors.append(".*InternalServerError\\(timeline is Stopping.*") + env.pageserver.allowed_errors.extend( + [ + # the error will be printed to the log too + ".*gc target timeline does not exist.*", + # Timelines get stopped during detach, ignore the gc calls that error, witnessing that + ".*InternalServerError\\(timeline is Stopping.*", + ] + ) # Detach while running manual GC. # It should wait for manual GC to finish because it runs in a task associated with the tenant. diff --git a/test_runner/regress/test_tenant_relocation.py b/test_runner/regress/test_tenant_relocation.py index feacdcc802..dcd7232b1b 100644 --- a/test_runner/regress/test_tenant_relocation.py +++ b/test_runner/regress/test_tenant_relocation.py @@ -216,16 +216,17 @@ def test_tenant_relocation( tenant_id = TenantId("74ee8b079a0e437eb0afea7d26a07209") - # FIXME: Is this expected? - env.pageservers[0].allowed_errors.append( - ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*" + env.pageservers[0].allowed_errors.extend( + [ + # FIXME: Is this expected? + ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*", + # Needed for detach polling on the original pageserver + f".*NotFound: tenant {tenant_id}.*", + # We will dual-attach in this test, so stale generations are expected + ".*Dropped remote consistent LSN updates.*", + ] ) - # Needed for detach polling on the original pageserver - env.pageservers[0].allowed_errors.append(f".*NotFound: tenant {tenant_id}.*") - # We will dual-attach in this test, so stale generations are expected - env.pageservers[0].allowed_errors.append(".*Dropped remote consistent LSN updates.*") - assert isinstance(env.pageserver_remote_storage, LocalFsStorage) # we use two branches to check that they are both relocated diff --git a/test_runner/regress/test_tenants_with_remote_storage.py b/test_runner/regress/test_tenants_with_remote_storage.py index b7b4e2be0b..07fb6dc5ca 100644 --- a/test_runner/regress/test_tenants_with_remote_storage.py +++ b/test_runner/regress/test_tenants_with_remote_storage.py @@ -117,10 +117,12 @@ def test_tenants_attached_after_download(neon_env_builder: NeonEnvBuilder): ##### First start, insert secret data and upload it to the remote storage env = neon_env_builder.init_start() - # FIXME: Are these expected? - env.pageserver.allowed_errors.append(".*No timelines to attach received.*") - env.pageserver.allowed_errors.append( - ".*marking .* as locally complete, while it doesnt exist in remote index.*" + env.pageserver.allowed_errors.extend( + [ + # FIXME: Are these expected? + ".*No timelines to attach received.*", + ".*marking .* as locally complete, while it doesnt exist in remote index.*", + ] ) pageserver_http = env.pageserver.http_client() @@ -218,13 +220,14 @@ def test_tenant_redownloads_truncated_file_on_startup( assert isinstance(env.pageserver_remote_storage, LocalFsStorage) - env.pageserver.allowed_errors.append(".*removing local file .* because .*") - - # FIXME: Are these expected? - env.pageserver.allowed_errors.append( - ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*" + env.pageserver.allowed_errors.extend( + [ + ".*removing local file .* because .*", + # FIXME: Are these expected? + ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*", + ".*No timelines to attach received.*", + ] ) - env.pageserver.allowed_errors.append(".*No timelines to attach received.*") pageserver_http = env.pageserver.http_client() endpoint = env.endpoints.create_start("main") diff --git a/test_runner/regress/test_threshold_based_eviction.py b/test_runner/regress/test_threshold_based_eviction.py index 27d5cce5f2..5f72cfd747 100644 --- a/test_runner/regress/test_threshold_based_eviction.py +++ b/test_runner/regress/test_threshold_based_eviction.py @@ -36,12 +36,13 @@ def test_threshold_based_eviction( ".*metrics_collection:.* upload consumption_metrics (still failed|failed, will retry).*" ) env = neon_env_builder.init_start() - env.pageserver.allowed_errors.append(metrics_refused_log_line) - - # these can happen whenever we run consumption metrics collection - env.pageserver.allowed_errors.append(r".*failed to calculate logical size at \S+: cancelled") - env.pageserver.allowed_errors.append( - r".*failed to calculate synthetic size for tenant \S+: failed to calculate some logical_sizes" + env.pageserver.allowed_errors.extend( + [ + metrics_refused_log_line, + # these can happen whenever we run consumption metrics collection + r".*failed to calculate logical size at \S+: cancelled", + r".*failed to calculate synthetic size for tenant \S+: failed to calculate some logical_sizes", + ] ) tenant_id, timeline_id = env.initial_tenant, env.initial_timeline diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py index 17113a6bc5..c6d578a7a2 100644 --- a/test_runner/regress/test_timeline_delete.py +++ b/test_runner/regress/test_timeline_delete.py @@ -39,10 +39,14 @@ from urllib3.util.retry import Retry def test_timeline_delete(neon_simple_env: NeonEnv): env = neon_simple_env - env.pageserver.allowed_errors.append(".*Timeline .* was not found.*") - env.pageserver.allowed_errors.append(".*timeline not found.*") - env.pageserver.allowed_errors.append(".*Cannot delete timeline which has child timelines.*") - env.pageserver.allowed_errors.append(".*Precondition failed: Requested tenant is missing.*") + env.pageserver.allowed_errors.extend( + [ + ".*Timeline .* was not found.*", + ".*timeline not found.*", + ".*Cannot delete timeline which has child timelines.*", + ".*Precondition failed: Requested tenant is missing.*", + ] + ) ps_http = env.pageserver.http_client() @@ -198,22 +202,22 @@ def test_delete_timeline_exercise_crash_safety_failpoints( ), ) - env.pageserver.allowed_errors.append(f".*{timeline_id}.*failpoint: {failpoint}") - # It appears when we stopped flush loop during deletion and then pageserver is stopped - env.pageserver.allowed_errors.append( - ".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", + env.pageserver.allowed_errors.extend( + [ + f".*{timeline_id}.*failpoint: {failpoint}", + # It appears when we stopped flush loop during deletion and then pageserver is stopped + ".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", + # This happens when we fail before scheduling background operation. + # Timeline is left in stopping state and retry tries to stop it again. + ".*Ignoring new state, equal to the existing one: Stopping", + # This happens when we retry delete requests for broken timelines + ".*Ignoring state update Stopping for broken timeline", + # This happens when timeline remains are cleaned up during loading + ".*Timeline dir entry become invalid.*", + # In one of the branches we poll for tenant to become active. Polls can generate this log message: + f".*Tenant {env.initial_tenant} is not active*", + ] ) - # This happens when we fail before scheduling background operation. - # Timeline is left in stopping state and retry tries to stop it again. - env.pageserver.allowed_errors.append( - ".*Ignoring new state, equal to the existing one: Stopping" - ) - # This happens when we retry delete requests for broken timelines - env.pageserver.allowed_errors.append(".*Ignoring state update Stopping for broken timeline") - # This happens when timeline remains are cleaned up during loading - env.pageserver.allowed_errors.append(".*Timeline dir entry become invalid.*") - # In one of the branches we poll for tenant to become active. Polls can generate this log message: - env.pageserver.allowed_errors.append(f".*Tenant {env.initial_tenant} is not active*") ps_http.configure_failpoints((failpoint, "return")) @@ -398,13 +402,13 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild env = neon_env_builder.init_start() - env.pageserver.allowed_errors.append(".*failpoint: timeline-delete-before-rm") - env.pageserver.allowed_errors.append( - ".*Ignoring new state, equal to the existing one: Stopping" - ) - # this happens, because the stuck timeline is visible to shutdown - env.pageserver.allowed_errors.append( - ".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", + env.pageserver.allowed_errors.extend( + [ + ".*failpoint: timeline-delete-before-rm", + ".*Ignoring new state, equal to the existing one: Stopping", + # this happens, because the stuck timeline is visible to shutdown + ".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited", + ] ) ps_http = env.pageserver.http_client() @@ -551,10 +555,12 @@ def test_concurrent_timeline_delete_stuck_on( with pytest.raises(PageserverApiException, match=error_msg_re) as second_call_err: ps_http.timeline_delete(env.initial_tenant, child_timeline_id) assert second_call_err.value.status_code == 409 - env.pageserver.allowed_errors.append(f".*{child_timeline_id}.*{error_msg_re}.*") - # the second call will try to transition the timeline into Stopping state as well - env.pageserver.allowed_errors.append( - f".*{child_timeline_id}.*Ignoring new state, equal to the existing one: Stopping" + env.pageserver.allowed_errors.extend( + [ + f".*{child_timeline_id}.*{error_msg_re}.*", + # the second call will try to transition the timeline into Stopping state as well + f".*{child_timeline_id}.*Ignoring new state, equal to the existing one: Stopping", + ] ) log.info("second call failed as expected") From 7c2c87a5abb8f7710c4efa2c8ba60c36a17a1cc4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Tue, 12 Dec 2023 18:20:12 +0100 Subject: [PATCH 17/18] Update azure SDK to 0.18 and use open range support (#6103) * Update `azure-*` crates to 0.18 * Use new open ranges support added by upstream in https://github.com/Azure/azure-sdk-for-rust/pull/1482 Part of #5567. Prior update PR: #6081 --- Cargo.lock | 42 ++++++++++++++++++--------- Cargo.toml | 8 ++--- libs/remote_storage/src/azure_blob.rs | 15 ++++------ 3 files changed, 38 insertions(+), 27 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index da480b8e0c..1b6b423444 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -694,9 +694,9 @@ dependencies = [ [[package]] name = "azure_core" -version = "0.17.0" +version = "0.18.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4ccd63c07d1fbfb3d4543d7ea800941bf5a30db1911b9b9e4db3b2c4210a434f" +checksum = "a6218987c374650fdad0b476bfc675729762c28dfb35f58608a38a2b1ea337dd" dependencies = [ "async-trait", "base64 0.21.1", @@ -704,8 +704,10 @@ dependencies = [ "dyn-clone", "futures", "getrandom 0.2.11", + "hmac", "http-types", "log", + "once_cell", "paste", "pin-project", "quick-xml", @@ -714,6 +716,7 @@ dependencies = [ "rustc_version", "serde", "serde_json", + "sha2", "time", "url", "uuid", @@ -721,9 +724,9 @@ dependencies = [ [[package]] name = "azure_identity" -version = "0.17.0" +version = "0.18.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8bd7ea32ca7eb66ff4757f83baac702ff11d469e5de365b6bc6f79f9c25d3436" +checksum = "9e1eacc4f7fb2a73d57c39139d0fc3aed78435606055779ddaef4b43cdf919a8" dependencies = [ "async-lock", "async-trait", @@ -733,7 +736,6 @@ dependencies = [ "oauth2", "pin-project", "serde", - "serde_json", "time", "tz-rs", "url", @@ -742,21 +744,18 @@ dependencies = [ [[package]] name = "azure_storage" -version = "0.17.0" +version = "0.18.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "83ca0a07f89fd72a006da4713e93af3d6c44a693e61a1c3c2e7985de39c182e8" +checksum = "ade8f2653e408de88b9eafec9f48c3c26b94026375e88adbd34523a7dd9795a1" dependencies = [ "RustyXML", + "async-lock", "async-trait", "azure_core", "bytes", - "futures", - "hmac", "log", "serde", "serde_derive", - "serde_json", - "sha2", "time", "url", "uuid", @@ -764,13 +763,14 @@ dependencies = [ [[package]] name = "azure_storage_blobs" -version = "0.17.0" +version = "0.18.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8096c04d370118323c42b2752aa1883e4880a56ef65239f317b359f263b6e194" +checksum = "025701c7cc5b523100f0f3b2b01723564ec5a86c03236521c06826337047e872" dependencies = [ "RustyXML", "azure_core", "azure_storage", + "azure_svc_blobstorage", "bytes", "futures", "log", @@ -782,6 +782,22 @@ dependencies = [ "uuid", ] +[[package]] +name = "azure_svc_blobstorage" +version = "0.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76051e5bb67cea1055abe5e530a0878feac7e0ab4cbbcb4a6adc953a58993389" +dependencies = [ + "azure_core", + "bytes", + "futures", + "log", + "once_cell", + "serde", + "serde_json", + "time", +] + [[package]] name = "backtrace" version = "0.3.67" diff --git a/Cargo.toml b/Cargo.toml index b5eece5e35..496a9d7839 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -38,10 +38,10 @@ license = "Apache-2.0" anyhow = { version = "1.0", features = ["backtrace"] } arc-swap = "1.6" async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] } -azure_core = "0.17" -azure_identity = "0.17" -azure_storage = "0.17" -azure_storage_blobs = "0.17" +azure_core = "0.18" +azure_identity = "0.18" +azure_storage = "0.18" +azure_storage_blobs = "0.18" flate2 = "1.0.26" async-stream = "0.3" async-trait = "0.1" diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs index e559d00ded..548bde02f6 100644 --- a/libs/remote_storage/src/azure_blob.rs +++ b/libs/remote_storage/src/azure_blob.rs @@ -271,17 +271,12 @@ impl RemoteStorage for AzureBlobStorage { let mut builder = blob_client.get(); - if let Some(end_exclusive) = end_exclusive { - builder = builder.range(Range::new(start_inclusive, end_exclusive)); + let range: Range = if let Some(end_exclusive) = end_exclusive { + (start_inclusive..end_exclusive).into() } else { - // Open ranges are not supported by the SDK so we work around - // by setting the upper limit extremely high (but high enough - // to still be representable by signed 64 bit integers). - // TODO remove workaround once the SDK adds open range support - // https://github.com/Azure/azure-sdk-for-rust/issues/1438 - let end_exclusive = u64::MAX / 4; - builder = builder.range(Range::new(start_inclusive, end_exclusive)); - } + (start_inclusive..).into() + }; + builder = builder.range(range); self.download_for_builder(builder).await } From 8460654f61d59dc639db282473148e2b8ef50b09 Mon Sep 17 00:00:00 2001 From: Stas Kelvich Date: Wed, 13 Dec 2023 03:52:10 +0200 Subject: [PATCH 18/18] Add per-endpoint rate limiter to proxy --- proxy/src/auth.rs | 8 ++++ proxy/src/bin/proxy.rs | 4 ++ proxy/src/config.rs | 1 + proxy/src/proxy.rs | 21 +++++++++ proxy/src/rate_limiter.rs | 1 + proxy/src/rate_limiter/limiter.rs | 71 +++++++++++++++++++++++++++++++ proxy/src/serverless.rs | 7 +++ proxy/src/serverless/websocket.rs | 4 ++ 8 files changed, 117 insertions(+) diff --git a/proxy/src/auth.rs b/proxy/src/auth.rs index 7d79d34045..eadb9abd43 100644 --- a/proxy/src/auth.rs +++ b/proxy/src/auth.rs @@ -62,6 +62,9 @@ pub enum AuthErrorImpl { Please add it to the allowed list in the Neon console." )] IpAddressNotAllowed, + + #[error("Too many connections to this endpoint. Please try again later.")] + TooManyConnections, } #[derive(Debug, Error)] @@ -80,6 +83,10 @@ impl AuthError { pub fn ip_address_not_allowed() -> Self { AuthErrorImpl::IpAddressNotAllowed.into() } + + pub fn too_many_connections() -> Self { + AuthErrorImpl::TooManyConnections.into() + } } impl> From for AuthError { @@ -102,6 +109,7 @@ impl UserFacingError for AuthError { MissingEndpointName => self.to_string(), Io(_) => "Internal error".to_string(), IpAddressNotAllowed => self.to_string(), + TooManyConnections => self.to_string(), } } } diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index fc1c44809a..1fa2d5599f 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -112,6 +112,9 @@ struct ProxyCliArgs { /// Timeout for rate limiter. If it didn't manage to aquire a permit in this time, it will return an error. #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)] rate_limiter_timeout: tokio::time::Duration, + /// Endpoint rate limiter max number of requests per second. + #[clap(long, default_value_t = 300)] + endpoint_rps_limit: u32, /// Initial limit for dynamic rate limiter. Makes sense only if `rate_limit_algorithm` is *not* `None`. #[clap(long, default_value_t = 100)] initial_limit: usize, @@ -317,6 +320,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { authentication_config, require_client_ip: args.require_client_ip, disable_ip_check_for_http: args.disable_ip_check_for_http, + endpoint_rps_limit: args.endpoint_rps_limit, })); Ok(config) diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 182d71f9be..dea446eb22 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -20,6 +20,7 @@ pub struct ProxyConfig { pub authentication_config: AuthenticationConfig, pub require_client_ip: bool, pub disable_ip_check_for_http: bool, + pub endpoint_rps_limit: u32, } #[derive(Debug)] diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 152c894ca9..ae8b294841 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -9,6 +9,7 @@ use crate::{ console::{self, errors::WakeComputeError, messages::MetricsAuxInfo, Api}, http::StatusCode, protocol2::WithClientIp, + rate_limiter::EndpointRateLimiter, stream::{PqStream, Stream}, usage_metrics::{Ids, USAGE_METRICS}, }; @@ -307,6 +308,7 @@ pub async fn task_main( let connections = tokio_util::task::task_tracker::TaskTracker::new(); let cancel_map = Arc::new(CancelMap::default()); + let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new(config.endpoint_rps_limit)); while let Some(accept_result) = run_until_cancelled(listener.accept(), &cancellation_token).await @@ -315,6 +317,8 @@ pub async fn task_main( let session_id = uuid::Uuid::new_v4(); let cancel_map = Arc::clone(&cancel_map); + let endpoint_rate_limiter = endpoint_rate_limiter.clone(); + connections.spawn( async move { info!("accepted postgres client connection"); @@ -340,6 +344,7 @@ pub async fn task_main( socket, ClientMode::Tcp, peer_addr.ip(), + endpoint_rate_limiter, ) .await } @@ -415,6 +420,7 @@ pub async fn handle_client( stream: S, mode: ClientMode, peer_addr: IpAddr, + endpoint_rate_limiter: Arc, ) -> anyhow::Result<()> { info!( protocol = mode.protocol_label(), @@ -463,6 +469,7 @@ pub async fn handle_client( ¶ms, session_id, mode.allow_self_signed_compute(config), + endpoint_rate_limiter, ); cancel_map .with_session(|session| client.connect_to_db(session, mode, &config.authentication_config)) @@ -928,6 +935,8 @@ struct Client<'a, S> { session_id: uuid::Uuid, /// Allow self-signed certificates (for testing). allow_self_signed_compute: bool, + /// Rate limiter for endpoints + endpoint_rate_limiter: Arc, } impl<'a, S> Client<'a, S> { @@ -938,6 +947,7 @@ impl<'a, S> Client<'a, S> { params: &'a StartupMessageParams, session_id: uuid::Uuid, allow_self_signed_compute: bool, + endpoint_rate_limiter: Arc, ) -> Self { Self { stream, @@ -945,6 +955,7 @@ impl<'a, S> Client<'a, S> { params, session_id, allow_self_signed_compute, + endpoint_rate_limiter, } } } @@ -966,8 +977,18 @@ impl Client<'_, S> { params, session_id, allow_self_signed_compute, + endpoint_rate_limiter, } = self; + // check rate limit + if let Some(ep) = creds.get_endpoint() { + if !endpoint_rate_limiter.check(ep) { + return stream + .throw_error(auth::AuthError::too_many_connections()) + .await; + } + } + let proto = mode.protocol_label(); let extra = console::ConsoleReqExtra { session_id, // aka this connection's id diff --git a/proxy/src/rate_limiter.rs b/proxy/src/rate_limiter.rs index 5622c44a68..f40b8dbd1c 100644 --- a/proxy/src/rate_limiter.rs +++ b/proxy/src/rate_limiter.rs @@ -3,4 +3,5 @@ mod limit_algorithm; mod limiter; pub use aimd::Aimd; pub use limit_algorithm::{AimdConfig, Fixed, RateLimitAlgorithm, RateLimiterConfig}; +pub use limiter::EndpointRateLimiter; pub use limiter::Limiter; diff --git a/proxy/src/rate_limiter/limiter.rs b/proxy/src/rate_limiter/limiter.rs index 3a9fed3919..9d28bb67b3 100644 --- a/proxy/src/rate_limiter/limiter.rs +++ b/proxy/src/rate_limiter/limiter.rs @@ -6,6 +6,9 @@ use std::{ time::Duration, }; +use dashmap::DashMap; +use parking_lot::Mutex; +use smol_str::SmolStr; use tokio::sync::{Mutex as AsyncMutex, Semaphore, SemaphorePermit}; use tokio::time::{timeout, Instant}; use tracing::info; @@ -15,6 +18,74 @@ use super::{ RateLimiterConfig, }; +// Simple per-endpoint rate limiter. +// +// Check that number of connections to the endpoint is below `max_rps` rps. +// Purposefully ignore user name and database name as clients can reconnect +// with different names, so we'll end up sending some http requests to +// the control plane. +// +// We also may save quite a lot of CPU (I think) by bailing out right after we +// saw SNI, before doing TLS handshake. User-side error messages in that case +// does not look very nice (`SSL SYSCALL error: Undefined error: 0`), so for now +// I went with a more expensive way that yields user-friendlier error messages. +// +// TODO: add a better bucketing here, e.g. not more than 300 requests per second, +// and not more than 1000 requests per 10 seconds, etc. Short bursts of reconnects +// are noramal during redeployments, so we should not block them. +pub struct EndpointRateLimiter { + map: DashMap>>, + max_rps: u32, + access_count: AtomicUsize, +} + +impl EndpointRateLimiter { + pub fn new(max_rps: u32) -> Self { + Self { + map: DashMap::new(), + max_rps, + access_count: AtomicUsize::new(1), // start from 1 to avoid GC on the first request + } + } + + /// Check that number of connections to the endpoint is below `max_rps` rps. + pub fn check(&self, endpoint: SmolStr) -> bool { + // do GC every 100k requests (worst case memory usage is about 10MB) + if self.access_count.fetch_add(1, Ordering::AcqRel) % 100_000 == 0 { + self.do_gc(); + } + + let now = chrono::Utc::now().naive_utc().time(); + let entry = self + .map + .entry(endpoint) + .or_insert_with(|| Arc::new(Mutex::new((now, 0)))); + let mut entry = entry.lock(); + let (last_time, count) = *entry; + + if now - last_time < chrono::Duration::seconds(1) { + if count >= self.max_rps { + return false; + } + *entry = (last_time, count + 1); + } else { + *entry = (now, 1); + } + true + } + + /// Clean the map. Simple strategy: remove all entries. At worst, we'll + /// double the effective max_rps during the cleanup. But that way deletion + /// does not aquire mutex on each entry access. + pub fn do_gc(&self) { + info!( + "cleaning up endpoint rate limiter, current size = {}", + self.map.len() + ); + self.map.clear(); + } +} + /// Limits the number of concurrent jobs. /// /// Concurrency is limited through the use of [Token]s. Acquire a token to run a job, and release the diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs index cd496ff01e..92d6e2d851 100644 --- a/proxy/src/serverless.rs +++ b/proxy/src/serverless.rs @@ -14,6 +14,7 @@ use tokio_util::task::TaskTracker; use crate::protocol2::{ProxyProtocolAccept, WithClientIp}; use crate::proxy::{NUM_CLIENT_CONNECTION_CLOSED_COUNTER, NUM_CLIENT_CONNECTION_OPENED_COUNTER}; +use crate::rate_limiter::EndpointRateLimiter; use crate::{cancellation::CancelMap, config::ProxyConfig}; use futures::StreamExt; use hyper::{ @@ -43,6 +44,7 @@ pub async fn task_main( } let conn_pool = conn_pool::GlobalConnPool::new(config); + let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new(config.endpoint_rps_limit)); // shutdown the connection pool tokio::spawn({ @@ -91,6 +93,7 @@ pub async fn task_main( let sni_name = tls.server_name().map(|s| s.to_string()); let conn_pool = conn_pool.clone(); let ws_connections = ws_connections.clone(); + let endpoint_rate_limiter = endpoint_rate_limiter.clone(); async move { let peer_addr = match client_addr { @@ -103,6 +106,7 @@ pub async fn task_main( let sni_name = sni_name.clone(); let conn_pool = conn_pool.clone(); let ws_connections = ws_connections.clone(); + let endpoint_rate_limiter = endpoint_rate_limiter.clone(); async move { let cancel_map = Arc::new(CancelMap::default()); @@ -117,6 +121,7 @@ pub async fn task_main( session_id, sni_name, peer_addr.ip(), + endpoint_rate_limiter, ) .instrument(info_span!( "serverless", @@ -190,6 +195,7 @@ async fn request_handler( session_id: uuid::Uuid, sni_hostname: Option, peer_addr: IpAddr, + endpoint_rate_limiter: Arc, ) -> Result, ApiError> { let host = request .headers() @@ -214,6 +220,7 @@ async fn request_handler( session_id, host, peer_addr, + endpoint_rate_limiter, ) .await { diff --git a/proxy/src/serverless/websocket.rs b/proxy/src/serverless/websocket.rs index 199b03550d..cd6184cdee 100644 --- a/proxy/src/serverless/websocket.rs +++ b/proxy/src/serverless/websocket.rs @@ -3,6 +3,7 @@ use crate::{ config::ProxyConfig, error::io_error, proxy::{handle_client, ClientMode}, + rate_limiter::EndpointRateLimiter, }; use bytes::{Buf, Bytes}; use futures::{Sink, Stream}; @@ -13,6 +14,7 @@ use pin_project_lite::pin_project; use std::{ net::IpAddr, pin::Pin, + sync::Arc, task::{ready, Context, Poll}, }; use tokio::io::{self, AsyncBufRead, AsyncRead, AsyncWrite, ReadBuf}; @@ -134,6 +136,7 @@ pub async fn serve_websocket( session_id: uuid::Uuid, hostname: Option, peer_addr: IpAddr, + endpoint_rate_limiter: Arc, ) -> anyhow::Result<()> { let websocket = websocket.await?; handle_client( @@ -143,6 +146,7 @@ pub async fn serve_websocket( WebSocketRw::new(websocket), ClientMode::Websockets { hostname }, peer_addr, + endpoint_rate_limiter, ) .await?; Ok(())