From 426b1c5f0887f45cc731f8786c457fb02573e0cc Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 10 Oct 2024 12:26:43 +0100 Subject: [PATCH 01/48] storage controller: use 'infra' JWT scope for node registration (#9343) ## Problem Storage controller `/control` API mostly requires admin tokens, for interactive use by engineers. But for endpoints used by scripts, we should not require admin tokens. Discussion at https://neondb.slack.com/archives/C033RQ5SPDH/p1728550081788989?thread_ts=1728548232.265019&cid=C033RQ5SPDH ## Summary of changes - Introduce the 'infra' JWT scope, which was not previously used in the neon repo - For pageserver & safekeeper node registrations, require infra scope instead of admin Note that admin will still work, as the controller auth checks permit admin tokens for all endpoints irrespective of what scope they require. --- libs/utils/src/auth.rs | 5 ++++- pageserver/src/auth.rs | 23 ++++++++++++++--------- safekeeper/src/auth.rs | 23 ++++++++++++++--------- storage_controller/src/http.rs | 4 ++-- 4 files changed, 34 insertions(+), 21 deletions(-) diff --git a/libs/utils/src/auth.rs b/libs/utils/src/auth.rs index 7b735875b7..5bd6f4bedc 100644 --- a/libs/utils/src/auth.rs +++ b/libs/utils/src/auth.rs @@ -31,9 +31,12 @@ pub enum Scope { /// The scope used by pageservers in upcalls to storage controller and cloud control plane #[serde(rename = "generations_api")] GenerationsApi, - /// Allows access to control plane managment API and some storage controller endpoints. + /// Allows access to control plane managment API and all storage controller endpoints. Admin, + /// Allows access to control plane & storage controller endpoints used in infrastructure automation (e.g. node registration) + Infra, + /// Allows access to storage controller APIs used by the scrubber, to interrogate the state /// of a tenant & post scrub results. 
Scrubber, diff --git a/pageserver/src/auth.rs b/pageserver/src/auth.rs index 9e3dedb75a..5c931fcfdb 100644 --- a/pageserver/src/auth.rs +++ b/pageserver/src/auth.rs @@ -14,14 +14,19 @@ pub fn check_permission(claims: &Claims, tenant_id: Option) -> Result< } (Scope::PageServerApi, None) => Ok(()), // access to management api for PageServerApi scope (Scope::PageServerApi, Some(_)) => Ok(()), // access to tenant api using PageServerApi scope - (Scope::Admin | Scope::SafekeeperData | Scope::GenerationsApi | Scope::Scrubber, _) => { - Err(AuthError( - format!( - "JWT scope '{:?}' is ineligible for Pageserver auth", - claims.scope - ) - .into(), - )) - } + ( + Scope::Admin + | Scope::SafekeeperData + | Scope::GenerationsApi + | Scope::Infra + | Scope::Scrubber, + _, + ) => Err(AuthError( + format!( + "JWT scope '{:?}' is ineligible for Pageserver auth", + claims.scope + ) + .into(), + )), } } diff --git a/safekeeper/src/auth.rs b/safekeeper/src/auth.rs index c5c9393c00..fdd0830b02 100644 --- a/safekeeper/src/auth.rs +++ b/safekeeper/src/auth.rs @@ -15,15 +15,20 @@ pub fn check_permission(claims: &Claims, tenant_id: Option) -> Result< } Ok(()) } - (Scope::Admin | Scope::PageServerApi | Scope::GenerationsApi | Scope::Scrubber, _) => { - Err(AuthError( - format!( - "JWT scope '{:?}' is ineligible for Safekeeper auth", - claims.scope - ) - .into(), - )) - } + ( + Scope::Admin + | Scope::PageServerApi + | Scope::GenerationsApi + | Scope::Infra + | Scope::Scrubber, + _, + ) => Err(AuthError( + format!( + "JWT scope '{:?}' is ineligible for Safekeeper auth", + claims.scope + ) + .into(), + )), (Scope::SafekeeperData, _) => Ok(()), } } diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index 4dd8badd03..46b6f4f2bf 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -636,7 +636,7 @@ async fn handle_tenant_list( } async fn handle_node_register(req: Request) -> Result, ApiError> { - check_permissions(&req, Scope::Admin)?; + 
check_permissions(&req, Scope::Infra)?; let mut req = match maybe_forward(req).await { ForwardOutcome::Forwarded(res) => { @@ -1182,7 +1182,7 @@ async fn handle_get_safekeeper(req: Request) -> Result, Api /// Assumes information is only relayed to storage controller after first selecting an unique id on /// control plane database, which means we have an id field in the request and payload. async fn handle_upsert_safekeeper(mut req: Request) -> Result, ApiError> { - check_permissions(&req, Scope::Admin)?; + check_permissions(&req, Scope::Infra)?; let body = json_request::(&mut req).await?; let id = parse_request_param::(&req, "id")?; From c2623ffef454378b2602f494e459b32028aa04a0 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Thu, 10 Oct 2024 12:40:35 +0100 Subject: [PATCH 02/48] CODEOWNERS: assign `storage_scrubber` to storage (#9346) --- CODEOWNERS | 1 + 1 file changed, 1 insertion(+) diff --git a/CODEOWNERS b/CODEOWNERS index 606dbb4e22..f8ed4be816 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -1,5 +1,6 @@ /compute_tools/ @neondatabase/control-plane @neondatabase/compute /storage_controller @neondatabase/storage +/storage_scrubber @neondatabase/storage /libs/pageserver_api/ @neondatabase/storage /libs/postgres_ffi/ @neondatabase/compute @neondatabase/storage /libs/remote_storage/ @neondatabase/storage From 9dd80b9b4ce94addc0acc6200be22cd7b09ba562 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Thu, 10 Oct 2024 14:09:53 +0100 Subject: [PATCH 03/48] storage_scrubber: fix faulty assertion when no timelines (#9345) When there are no timelines in remote storage, the storage scrubber would incorrectly trip an assertion with "Must be set if results are present", referring to the last processed tenant ID. When there are no timelines we don't expect there to be a tenant ID either. The assertion was introduced in 37aa6fd. Only apply the assertion when any timelines are present. 
--- storage_scrubber/src/scan_pageserver_metadata.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/storage_scrubber/src/scan_pageserver_metadata.rs b/storage_scrubber/src/scan_pageserver_metadata.rs index c1ea589f7f..cb3299d413 100644 --- a/storage_scrubber/src/scan_pageserver_metadata.rs +++ b/storage_scrubber/src/scan_pageserver_metadata.rs @@ -317,9 +317,8 @@ pub async fn scan_pageserver_metadata( tenant_timeline_results.push((ttid, data)); } - let tenant_id = tenant_id.expect("Must be set if results are present"); - if !tenant_timeline_results.is_empty() { + let tenant_id = tenant_id.expect("Must be set if results are present"); analyze_tenant( &remote_client, tenant_id, From 264c34dfb7b90e619677fc04fdb957b270c40e8e Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Thu, 10 Oct 2024 10:26:23 -0500 Subject: [PATCH 04/48] Move path-related fixtures into their own module (#9304) neon_fixtures.py has grown into quite a beast. Signed-off-by: Tristan Partin --- test_runner/conftest.py | 1 + test_runner/fixtures/neon_fixtures.py | 240 +---------------------- test_runner/fixtures/paths.py | 269 ++++++++++++++++++++++++++ 3 files changed, 273 insertions(+), 237 deletions(-) create mode 100644 test_runner/fixtures/paths.py diff --git a/test_runner/conftest.py b/test_runner/conftest.py index d6e7fcf7ca..4a3194c691 100644 --- a/test_runner/conftest.py +++ b/test_runner/conftest.py @@ -6,6 +6,7 @@ pytest_plugins = ( "fixtures.httpserver", "fixtures.compute_reconfigure", "fixtures.storage_controller_proxy", + "fixtures.paths", "fixtures.neon_fixtures", "fixtures.benchmark_fixture", "fixtures.pg_stats", diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index f81bc3f5a6..9a60de922c 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -18,7 +18,6 @@ from contextlib import closing, contextmanager from dataclasses import dataclass from datetime import datetime from enum 
import Enum -from fcntl import LOCK_EX, LOCK_UN, flock from functools import cached_property from pathlib import Path from types import TracebackType @@ -59,6 +58,7 @@ from fixtures.pageserver.http import PageserverHttpClient from fixtures.pageserver.utils import ( wait_for_last_record_lsn, ) +from fixtures.paths import get_test_repo_dir, shared_snapshot_dir from fixtures.pg_version import PgVersion from fixtures.port_distributor import PortDistributor from fixtures.remote_storage import ( @@ -76,7 +76,6 @@ from fixtures.safekeeper.utils import wait_walreceivers_absent from fixtures.utils import ( ATTACHMENT_NAME_REGEX, allure_add_grafana_links, - allure_attach_from_dir, assert_no_errors, get_dir_size, print_gc_result, @@ -96,6 +95,8 @@ if TYPE_CHECKING: Union, ) + from fixtures.paths import SnapshotDirLocked + T = TypeVar("T") @@ -118,65 +119,11 @@ put directly-importable functions into utils.py or another separate file. Env = dict[str, str] -DEFAULT_OUTPUT_DIR: str = "test_output" DEFAULT_BRANCH_NAME: str = "main" BASE_PORT: int = 15000 -@pytest.fixture(scope="session") -def base_dir() -> Iterator[Path]: - # find the base directory (currently this is the git root) - base_dir = Path(__file__).parents[2] - log.info(f"base_dir is {base_dir}") - - yield base_dir - - -@pytest.fixture(scope="function") -def neon_binpath(base_dir: Path, build_type: str) -> Iterator[Path]: - if os.getenv("REMOTE_ENV"): - # we are in remote env and do not have neon binaries locally - # this is the case for benchmarks run on self-hosted runner - return - - # Find the neon binaries. 
- if env_neon_bin := os.environ.get("NEON_BIN"): - binpath = Path(env_neon_bin) - else: - binpath = base_dir / "target" / build_type - log.info(f"neon_binpath is {binpath}") - - if not (binpath / "pageserver").exists(): - raise Exception(f"neon binaries not found at '{binpath}'") - - yield binpath - - -@pytest.fixture(scope="session") -def pg_distrib_dir(base_dir: Path) -> Iterator[Path]: - if env_postgres_bin := os.environ.get("POSTGRES_DISTRIB_DIR"): - distrib_dir = Path(env_postgres_bin).resolve() - else: - distrib_dir = base_dir / "pg_install" - - log.info(f"pg_distrib_dir is {distrib_dir}") - yield distrib_dir - - -@pytest.fixture(scope="session") -def top_output_dir(base_dir: Path) -> Iterator[Path]: - # Compute the top-level directory for all tests. - if env_test_output := os.environ.get("TEST_OUTPUT"): - output_dir = Path(env_test_output).resolve() - else: - output_dir = base_dir / DEFAULT_OUTPUT_DIR - output_dir.mkdir(exist_ok=True) - - log.info(f"top_output_dir is {output_dir}") - yield output_dir - - @pytest.fixture(scope="session") def neon_api_key() -> str: api_key = os.getenv("NEON_API_KEY") @@ -4246,44 +4193,6 @@ class StorageScrubber: raise -def _get_test_dir(request: FixtureRequest, top_output_dir: Path, prefix: str) -> Path: - """Compute the path to a working directory for an individual test.""" - test_name = request.node.name - test_dir = top_output_dir / f"{prefix}{test_name.replace('/', '-')}" - - # We rerun flaky tests multiple times, use a separate directory for each run. - if (suffix := getattr(request.node, "execution_count", None)) is not None: - test_dir = test_dir.parent / f"{test_dir.name}-{suffix}" - - log.info(f"get_test_output_dir is {test_dir}") - # make mypy happy - assert isinstance(test_dir, Path) - return test_dir - - -def get_test_output_dir(request: FixtureRequest, top_output_dir: Path) -> Path: - """ - The working directory for a test. 
- """ - return _get_test_dir(request, top_output_dir, "") - - -def get_test_overlay_dir(request: FixtureRequest, top_output_dir: Path) -> Path: - """ - Directory that contains `upperdir` and `workdir` for overlayfs mounts - that a test creates. See `NeonEnvBuilder.overlay_mount`. - """ - return _get_test_dir(request, top_output_dir, "overlay-") - - -def get_shared_snapshot_dir_path(top_output_dir: Path, snapshot_name: str) -> Path: - return top_output_dir / "shared-snapshots" / snapshot_name - - -def get_test_repo_dir(request: FixtureRequest, top_output_dir: Path) -> Path: - return get_test_output_dir(request, top_output_dir) / "repo" - - def pytest_addoption(parser: Parser): parser.addoption( "--preserve-database-files", @@ -4298,149 +4207,6 @@ SMALL_DB_FILE_NAME_REGEX: re.Pattern[str] = re.compile( ) -# This is autouse, so the test output directory always gets created, even -# if a test doesn't put anything there. -# -# NB: we request the overlay dir fixture so the fixture does its cleanups -@pytest.fixture(scope="function", autouse=True) -def test_output_dir( - request: FixtureRequest, top_output_dir: Path, test_overlay_dir: Path -) -> Iterator[Path]: - """Create the working directory for an individual test.""" - - # one directory per test - test_dir = get_test_output_dir(request, top_output_dir) - log.info(f"test_output_dir is {test_dir}") - shutil.rmtree(test_dir, ignore_errors=True) - test_dir.mkdir() - - yield test_dir - - # Allure artifacts creation might involve the creation of `.tar.zst` archives, - # which aren't going to be used if Allure results collection is not enabled - # (i.e. --alluredir is not set). - # Skip `allure_attach_from_dir` in this case - if not request.config.getoption("--alluredir"): - return - - preserve_database_files = False - for k, v in request.node.user_properties: - # NB: the neon_env_builder fixture uses this fixture (test_output_dir). - # So, neon_env_builder's cleanup runs before here. 
- # The cleanup propagates NeonEnvBuilder.preserve_database_files into this user property. - if k == "preserve_database_files": - assert isinstance(v, bool) - preserve_database_files = v - - allure_attach_from_dir(test_dir, preserve_database_files) - - -class FileAndThreadLock: - def __init__(self, path: Path): - self.path = path - self.thread_lock = threading.Lock() - self.fd: Optional[int] = None - - def __enter__(self): - self.fd = os.open(self.path, os.O_CREAT | os.O_WRONLY) - # lock thread lock before file lock so that there's no race - # around flocking / funlocking the file lock - self.thread_lock.acquire() - flock(self.fd, LOCK_EX) - - def __exit__(self, exc_type, exc_value, exc_traceback): - assert self.fd is not None - assert self.thread_lock.locked() # ... by us - flock(self.fd, LOCK_UN) - self.thread_lock.release() - os.close(self.fd) - self.fd = None - - -class SnapshotDirLocked: - def __init__(self, parent: SnapshotDir): - self._parent = parent - - def is_initialized(self): - # TODO: in the future, take a `tag` as argument and store it in the marker in set_initialized. - # Then, in this function, compare marker file contents with the tag to invalidate the snapshot if the tag changed. 
- return self._parent._marker_file_path.exists() - - def set_initialized(self): - self._parent._marker_file_path.write_text("") - - @property - def path(self) -> Path: - return self._parent._path / "snapshot" - - -class SnapshotDir: - _path: Path - - def __init__(self, path: Path): - self._path = path - assert self._path.is_dir() - self._lock = FileAndThreadLock(self._lock_file_path) - - @property - def _lock_file_path(self) -> Path: - return self._path / "initializing.flock" - - @property - def _marker_file_path(self) -> Path: - return self._path / "initialized.marker" - - def __enter__(self) -> SnapshotDirLocked: - self._lock.__enter__() - return SnapshotDirLocked(self) - - def __exit__(self, exc_type, exc_value, exc_traceback): - self._lock.__exit__(exc_type, exc_value, exc_traceback) - - -def shared_snapshot_dir(top_output_dir, ident: str) -> SnapshotDir: - snapshot_dir_path = get_shared_snapshot_dir_path(top_output_dir, ident) - snapshot_dir_path.mkdir(exist_ok=True, parents=True) - return SnapshotDir(snapshot_dir_path) - - -@pytest.fixture(scope="function") -def test_overlay_dir(request: FixtureRequest, top_output_dir: Path) -> Optional[Path]: - """ - Idempotently create a test's overlayfs mount state directory. - If the functionality isn't enabled via env var, returns None. - - The procedure cleans up after previous runs that were aborted (e.g. due to Ctrl-C, OOM kills, etc). 
- """ - - if os.getenv("NEON_ENV_BUILDER_USE_OVERLAYFS_FOR_SNAPSHOTS") is None: - return None - - overlay_dir = get_test_overlay_dir(request, top_output_dir) - log.info(f"test_overlay_dir is {overlay_dir}") - - overlay_dir.mkdir(exist_ok=True) - # unmount stale overlayfs mounts which subdirectories of `overlay_dir/*` as the overlayfs `upperdir` and `workdir` - for mountpoint in overlayfs.iter_mounts_beneath(get_test_output_dir(request, top_output_dir)): - cmd = ["sudo", "umount", str(mountpoint)] - log.info( - f"Unmounting stale overlayfs mount probably created during earlier test run: {cmd}" - ) - subprocess.run(cmd, capture_output=True, check=True) - # the overlayfs `workdir`` is owned by `root`, shutil.rmtree won't work. - cmd = ["sudo", "rm", "-rf", str(overlay_dir)] - subprocess.run(cmd, capture_output=True, check=True) - - overlay_dir.mkdir() - - return overlay_dir - - # no need to clean up anything: on clean shutdown, - # NeonEnvBuilder.overlay_cleanup_teardown takes care of cleanup - # and on unclean shutdown, this function will take care of it - # on the next test run - - SKIP_DIRS = frozenset( ( "pg_wal", diff --git a/test_runner/fixtures/paths.py b/test_runner/fixtures/paths.py new file mode 100644 index 0000000000..0712d241db --- /dev/null +++ b/test_runner/fixtures/paths.py @@ -0,0 +1,269 @@ +from __future__ import annotations + +import os +import shutil +import subprocess +import threading +from fcntl import LOCK_EX, LOCK_UN, flock +from pathlib import Path +from types import TracebackType +from typing import TYPE_CHECKING + +import pytest +from pytest import FixtureRequest + +from fixtures import overlayfs +from fixtures.log_helper import log +from fixtures.utils import allure_attach_from_dir + +if TYPE_CHECKING: + from collections.abc import Iterator + from typing import Optional + + +DEFAULT_OUTPUT_DIR: str = "test_output" + + +def get_test_dir( + request: FixtureRequest, top_output_dir: Path, prefix: Optional[str] = None +) -> Path: + """Compute 
the path to a working directory for an individual test.""" + test_name = request.node.name + test_dir = top_output_dir / f"{prefix or ''}{test_name.replace('/', '-')}" + + # We rerun flaky tests multiple times, use a separate directory for each run. + if (suffix := getattr(request.node, "execution_count", None)) is not None: + test_dir = test_dir.parent / f"{test_dir.name}-{suffix}" + + return test_dir + + +def get_test_output_dir(request: FixtureRequest, top_output_dir: Path) -> Path: + """ + The working directory for a test. + """ + return get_test_dir(request, top_output_dir) + + +def get_test_overlay_dir(request: FixtureRequest, top_output_dir: Path) -> Path: + """ + Directory that contains `upperdir` and `workdir` for overlayfs mounts + that a test creates. See `NeonEnvBuilder.overlay_mount`. + """ + return get_test_dir(request, top_output_dir, "overlay-") + + +def get_shared_snapshot_dir_path(top_output_dir: Path, snapshot_name: str) -> Path: + return top_output_dir / "shared-snapshots" / snapshot_name + + +def get_test_repo_dir(request: FixtureRequest, top_output_dir: Path) -> Path: + return get_test_output_dir(request, top_output_dir) / "repo" + + +@pytest.fixture(scope="session") +def base_dir() -> Iterator[Path]: + # find the base directory (currently this is the git root) + base_dir = Path(__file__).parents[2] + log.info(f"base_dir is {base_dir}") + + yield base_dir + + +@pytest.fixture(scope="function") +def neon_binpath(base_dir: Path, build_type: str) -> Iterator[Path]: + if os.getenv("REMOTE_ENV"): + # we are in remote env and do not have neon binaries locally + # this is the case for benchmarks run on self-hosted runner + return + + # Find the neon binaries. 
+ if env_neon_bin := os.environ.get("NEON_BIN"): + binpath = Path(env_neon_bin) + else: + binpath = base_dir / "target" / build_type + log.info(f"neon_binpath is {binpath}") + + if not (binpath / "pageserver").exists(): + raise Exception(f"neon binaries not found at '{binpath}'") + + yield binpath + + +@pytest.fixture(scope="session") +def pg_distrib_dir(base_dir: Path) -> Iterator[Path]: + if env_postgres_bin := os.environ.get("POSTGRES_DISTRIB_DIR"): + distrib_dir = Path(env_postgres_bin).resolve() + else: + distrib_dir = base_dir / "pg_install" + + log.info(f"pg_distrib_dir is {distrib_dir}") + yield distrib_dir + + +@pytest.fixture(scope="session") +def top_output_dir(base_dir: Path) -> Iterator[Path]: + # Compute the top-level directory for all tests. + if env_test_output := os.environ.get("TEST_OUTPUT"): + output_dir = Path(env_test_output).resolve() + else: + output_dir = base_dir / DEFAULT_OUTPUT_DIR + output_dir.mkdir(exist_ok=True) + + log.info(f"top_output_dir is {output_dir}") + yield output_dir + + +# This is autouse, so the test output directory always gets created, even +# if a test doesn't put anything there. +# +# NB: we request the overlay dir fixture so the fixture does its cleanups +@pytest.fixture(scope="function", autouse=True) +def test_output_dir(request: pytest.FixtureRequest, top_output_dir: Path) -> Iterator[Path]: + """Create the working directory for an individual test.""" + + # one directory per test + test_dir = get_test_output_dir(request, top_output_dir) + log.info(f"test_output_dir is {test_dir}") + shutil.rmtree(test_dir, ignore_errors=True) + test_dir.mkdir() + + yield test_dir + + # Allure artifacts creation might involve the creation of `.tar.zst` archives, + # which aren't going to be used if Allure results collection is not enabled + # (i.e. --alluredir is not set). 
+ # Skip `allure_attach_from_dir` in this case + if not request.config.getoption("--alluredir"): + return + + preserve_database_files = False + for k, v in request.node.user_properties: + # NB: the neon_env_builder fixture uses this fixture (test_output_dir). + # So, neon_env_builder's cleanup runs before here. + # The cleanup propagates NeonEnvBuilder.preserve_database_files into this user property. + if k == "preserve_database_files": + assert isinstance(v, bool) + preserve_database_files = v + + allure_attach_from_dir(test_dir, preserve_database_files) + + +class FileAndThreadLock: + def __init__(self, path: Path): + self.path = path + self.thread_lock = threading.Lock() + self.fd: Optional[int] = None + + def __enter__(self): + self.fd = os.open(self.path, os.O_CREAT | os.O_WRONLY) + # lock thread lock before file lock so that there's no race + # around flocking / funlocking the file lock + self.thread_lock.acquire() + flock(self.fd, LOCK_EX) + + def __exit__( + self, + exc_type: Optional[type[BaseException]], + exc_value: Optional[BaseException], + exc_traceback: Optional[TracebackType], + ): + assert self.fd is not None + assert self.thread_lock.locked() # ... by us + flock(self.fd, LOCK_UN) + self.thread_lock.release() + os.close(self.fd) + self.fd = None + + +class SnapshotDirLocked: + def __init__(self, parent: SnapshotDir): + self._parent = parent + + def is_initialized(self): + # TODO: in the future, take a `tag` as argument and store it in the marker in set_initialized. + # Then, in this function, compare marker file contents with the tag to invalidate the snapshot if the tag changed. 
+ return self._parent.marker_file_path.exists() + + def set_initialized(self): + self._parent.marker_file_path.write_text("") + + @property + def path(self) -> Path: + return self._parent.path / "snapshot" + + +class SnapshotDir: + _path: Path + + def __init__(self, path: Path): + self._path = path + assert self._path.is_dir() + self._lock = FileAndThreadLock(self.lock_file_path) + + @property + def path(self) -> Path: + return self._path + + @property + def lock_file_path(self) -> Path: + return self._path / "initializing.flock" + + @property + def marker_file_path(self) -> Path: + return self._path / "initialized.marker" + + def __enter__(self) -> SnapshotDirLocked: + self._lock.__enter__() + return SnapshotDirLocked(self) + + def __exit__( + self, + exc_type: Optional[type[BaseException]], + exc_value: Optional[BaseException], + exc_traceback: Optional[TracebackType], + ): + self._lock.__exit__(exc_type, exc_value, exc_traceback) + + +def shared_snapshot_dir(top_output_dir: Path, ident: str) -> SnapshotDir: + snapshot_dir_path = get_shared_snapshot_dir_path(top_output_dir, ident) + snapshot_dir_path.mkdir(exist_ok=True, parents=True) + return SnapshotDir(snapshot_dir_path) + + +@pytest.fixture(scope="function") +def test_overlay_dir(request: FixtureRequest, top_output_dir: Path) -> Optional[Path]: + """ + Idempotently create a test's overlayfs mount state directory. + If the functionality isn't enabled via env var, returns None. + + The procedure cleans up after previous runs that were aborted (e.g. due to Ctrl-C, OOM kills, etc). 
+ """ + + if os.getenv("NEON_ENV_BUILDER_USE_OVERLAYFS_FOR_SNAPSHOTS") is None: + return None + + overlay_dir = get_test_overlay_dir(request, top_output_dir) + log.info(f"test_overlay_dir is {overlay_dir}") + + overlay_dir.mkdir(exist_ok=True) + # unmount stale overlayfs mounts which subdirectories of `overlay_dir/*` as the overlayfs `upperdir` and `workdir` + for mountpoint in overlayfs.iter_mounts_beneath(get_test_output_dir(request, top_output_dir)): + cmd = ["sudo", "umount", str(mountpoint)] + log.info( + f"Unmounting stale overlayfs mount probably created during earlier test run: {cmd}" + ) + subprocess.run(cmd, capture_output=True, check=True) + # the overlayfs `workdir`` is owned by `root`, shutil.rmtree won't work. + cmd = ["sudo", "rm", "-rf", str(overlay_dir)] + subprocess.run(cmd, capture_output=True, check=True) + + overlay_dir.mkdir() + + return overlay_dir + + # no need to clean up anything: on clean shutdown, + # NeonEnvBuilder.overlay_cleanup_teardown takes care of cleanup + # and on unclean shutdown, this function will take care of it + # on the next test run From 07c714343f793eeb866232e23b4c1c7409fa7f61 Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 10 Oct 2024 17:06:42 +0100 Subject: [PATCH 05/48] tests: allow a log warning in test_cli_start_stop_multi (#9320) ## Problem This test restarts services in an undefined order (whatever neon_local does), which means we should be tolerant of warnings that come from restarting the storage controller while a pageserver is running. We can see failures with warnings from dropped requests, e.g. 
https://neon-github-public-dev.s3.amazonaws.com/reports/pr-9307/11229000712/index.html#/testresult/d33d5cb206331e28 ``` WARN request{method=GET path=/v1/location_config request_id=b7dbda15-6efb-4610-8b19-a3772b65455f}: request was dropped before completing\n') ``` ## Summary of changes - allow-list the `request was dropped before completing` message on pageservers before restarting services --- test_runner/regress/test_neon_cli.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/test_runner/regress/test_neon_cli.py b/test_runner/regress/test_neon_cli.py index 3a0a4b10bf..783fb813cf 100644 --- a/test_runner/regress/test_neon_cli.py +++ b/test_runner/regress/test_neon_cli.py @@ -162,6 +162,11 @@ def test_cli_start_stop_multi(neon_env_builder: NeonEnvBuilder): env.neon_cli.pageserver_stop(env.BASE_PAGESERVER_ID) env.neon_cli.pageserver_stop(env.BASE_PAGESERVER_ID + 1) + # We will stop the storage controller while it may have requests in + # flight, and the pageserver complains when requests are abandoned. + for ps in env.pageservers: + ps.allowed_errors.append(".*request was dropped before completing.*") + # Keep NeonEnv state up to date, it usually owns starting/stopping services env.pageservers[0].running = False env.pageservers[1].running = False From 1f7904c917503a95f6297ae9df705e22fd5daba4 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Thu, 10 Oct 2024 12:40:30 -0500 Subject: [PATCH 06/48] Enable cargo caching in check-codestyle-rust This job takes an extraordinary amount of time for what I understand it to do. The obvious win is caching dependencies. Rory disabled caching in cd5732d9d8ccd291f39ed41250072acdce3012e6. I assume this was to get gen3 runners up and running. 
Signed-off-by: Tristan Partin --- .github/workflows/build_and_test.yml | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index a759efb56c..e7193cfe19 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -193,16 +193,15 @@ jobs: with: submodules: true -# Disabled for now -# - name: Restore cargo deps cache -# id: cache_cargo -# uses: actions/cache@v4 -# with: -# path: | -# !~/.cargo/registry/src -# ~/.cargo/git/ -# target/ -# key: v1-${{ runner.os }}-${{ runner.arch }}-cargo-clippy-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }} + - name: Cache cargo deps + uses: actions/cache@v4 + with: + path: | + ~/.cargo/registry + !~/.cargo/registry/src + ~/.cargo/git + target + key: v1-${{ runner.os }}-${{ runner.arch }}-cargo-${{ hashFiles('./Cargo.lock') }}-${{ hashFiles('./rust-toolchain.toml') }}-rust # Some of our rust modules use FFI and need those to be checked - name: Get postgres headers From 006d9dfb6bde9473c14719cab8ecebec77dd65c7 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Thu, 10 Oct 2024 12:43:40 -0500 Subject: [PATCH 07/48] Add compute_config_dir fixture Allows easy access to various compute config files. Signed-off-by: Tristan Partin --- test_runner/fixtures/paths.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/test_runner/fixtures/paths.py b/test_runner/fixtures/paths.py index 0712d241db..cffeb47ee8 100644 --- a/test_runner/fixtures/paths.py +++ b/test_runner/fixtures/paths.py @@ -70,6 +70,14 @@ def base_dir() -> Iterator[Path]: yield base_dir +@pytest.fixture(scope="session") +def compute_config_dir(base_dir: Path) -> Iterator[Path]: + """ + Retrieve the path to the compute configuration directory. 
+ """ + yield base_dir / "compute" / "etc" + + @pytest.fixture(scope="function") def neon_binpath(base_dir: Path, build_type: str) -> Iterator[Path]: if os.getenv("REMOTE_ENV"): From 53147b51f90ba854605e49edd28b7d7895930c92 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Thu, 10 Oct 2024 13:00:25 -0500 Subject: [PATCH 08/48] Use valid type hints for Python 3.9 I have no idea how this made it past the linters. Signed-off-by: Tristan Partin --- test_runner/fixtures/neon_api.py | 6 +++--- test_runner/fixtures/pageserver/http.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/test_runner/fixtures/neon_api.py b/test_runner/fixtures/neon_api.py index 683ea3af44..5934baccff 100644 --- a/test_runner/fixtures/neon_api.py +++ b/test_runner/fixtures/neon_api.py @@ -185,8 +185,8 @@ class NeonAPI: def get_connection_uri( self, project_id: str, - branch_id: str | None = None, - endpoint_id: str | None = None, + branch_id: Optional[str] = None, + endpoint_id: Optional[str] = None, database_name: str = "neondb", role_name: str = "neondb_owner", pooled: bool = True, @@ -262,7 +262,7 @@ class NeonAPI: class NeonApiEndpoint: - def __init__(self, neon_api: NeonAPI, pg_version: PgVersion, project_id: str | None): + def __init__(self, neon_api: NeonAPI, pg_version: PgVersion, project_id: Optional[str]): self.neon_api = neon_api if project_id is None: project = neon_api.create_project(pg_version) diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 84a7e5f0a2..aa4435af4e 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -886,7 +886,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, - batch_size: int | None = None, + batch_size: Optional[int] = None, **kwargs, ) -> set[TimelineId]: params = {} From b2ecbf3e80804123b216cb3242d0e165936db120 Mon Sep 17 00:00:00 2001 From: Ivan 
Efremov Date: Fri, 11 Oct 2024 10:45:55 +0300 Subject: [PATCH 09/48] Introduce "quota" ErrorKind (#9300) ## Problem Fixes #8340 ## Summary of changes Introduced ErrorKind::quota to handle quota-related errors ## Checklist before requesting a review - [x] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --- proxy/src/control_plane/provider/mod.rs | 16 ++++++++-------- proxy/src/error.rs | 5 +++++ 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/proxy/src/control_plane/provider/mod.rs b/proxy/src/control_plane/provider/mod.rs index 01d93dee43..6cc525a324 100644 --- a/proxy/src/control_plane/provider/mod.rs +++ b/proxy/src/control_plane/provider/mod.rs @@ -81,12 +81,12 @@ pub(crate) mod errors { Reason::EndpointNotFound => ErrorKind::User, Reason::BranchNotFound => ErrorKind::User, Reason::RateLimitExceeded => ErrorKind::ServiceRateLimit, - Reason::NonDefaultBranchComputeTimeExceeded => ErrorKind::User, - Reason::ActiveTimeQuotaExceeded => ErrorKind::User, - Reason::ComputeTimeQuotaExceeded => ErrorKind::User, - Reason::WrittenDataQuotaExceeded => ErrorKind::User, - Reason::DataTransferQuotaExceeded => ErrorKind::User, - Reason::LogicalSizeQuotaExceeded => ErrorKind::User, + Reason::NonDefaultBranchComputeTimeExceeded => ErrorKind::Quota, + Reason::ActiveTimeQuotaExceeded => ErrorKind::Quota, + Reason::ComputeTimeQuotaExceeded => ErrorKind::Quota, + Reason::WrittenDataQuotaExceeded => ErrorKind::Quota, + Reason::DataTransferQuotaExceeded => ErrorKind::Quota, + Reason::LogicalSizeQuotaExceeded => ErrorKind::Quota, Reason::ConcurrencyLimitReached => ErrorKind::ControlPlane, 
Reason::LockAlreadyTaken => ErrorKind::ControlPlane, Reason::RunningOperations => ErrorKind::ControlPlane, @@ -103,7 +103,7 @@ pub(crate) mod errors { } if error .contains("compute time quota of non-primary branches is exceeded") => { - crate::error::ErrorKind::User + crate::error::ErrorKind::Quota } ControlPlaneError { http_status_code: http::StatusCode::LOCKED, @@ -112,7 +112,7 @@ pub(crate) mod errors { } if error.contains("quota exceeded") || error.contains("the limit for current plan reached") => { - crate::error::ErrorKind::User + crate::error::ErrorKind::Quota } ControlPlaneError { http_status_code: http::StatusCode::TOO_MANY_REQUESTS, diff --git a/proxy/src/error.rs b/proxy/src/error.rs index 53f9f75c5b..1cd4dc2c22 100644 --- a/proxy/src/error.rs +++ b/proxy/src/error.rs @@ -49,6 +49,10 @@ pub enum ErrorKind { #[label(rename = "serviceratelimit")] ServiceRateLimit, + /// Proxy quota limit violation + #[label(rename = "quota")] + Quota, + /// internal errors Service, @@ -70,6 +74,7 @@ impl ErrorKind { ErrorKind::ClientDisconnect => "clientdisconnect", ErrorKind::RateLimit => "ratelimit", ErrorKind::ServiceRateLimit => "serviceratelimit", + ErrorKind::Quota => "quota", ErrorKind::Service => "service", ErrorKind::ControlPlane => "controlplane", ErrorKind::Postgres => "postgres", From 184935619e55bbd9c025b5a057f36362b1a60dd2 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 11 Oct 2024 09:41:08 +0100 Subject: [PATCH 10/48] tests: stabilize test_storage_controller_heartbeats (#9347) ## Problem This could fail with `reconciliation in progress` if running on a slow test node such that background reconciliation happens at the same time as we call consistency_check. 
Example: https://neon-github-public-dev.s3.amazonaws.com/reports/main/11258171952/index.html#/testresult/54889c9469afb232 ## Summary of changes - Call reconcile_until_idle before calling consistency check once, rather than calling consistency check until it passes --- test_runner/regress/test_storage_controller.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 7be4d2ce0c..1dcf0b254d 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -1300,11 +1300,11 @@ def test_storage_controller_heartbeats( node_to_tenants = build_node_to_tenants_map(env) log.info(f"Back online: {node_to_tenants=}") - # ... expecting the storage controller to reach a consistent state - def storage_controller_consistent(): - env.storage_controller.consistency_check() + # ... background reconciliation may need to run to clean up the location on the node that was offline + env.storage_controller.reconcile_until_idle() - wait_until(30, 1, storage_controller_consistent) + # ... 
expecting the storage controller to reach a consistent state + env.storage_controller.consistency_check() def test_storage_controller_re_attach(neon_env_builder: NeonEnvBuilder): From 6baf1aae3315c10b20f8e5e27239d3604484b895 Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Fri, 11 Oct 2024 11:29:08 +0200 Subject: [PATCH 11/48] proxy: Demote some errors to warnings in logs (#9354) --- proxy/src/control_plane/provider/neon.rs | 4 ++-- proxy/src/proxy/mod.rs | 12 ++++++------ proxy/src/proxy/passthrough.rs | 2 +- .../redis/connection_with_credentials_provider.rs | 6 +++--- proxy/src/redis/notifications.rs | 2 +- proxy/src/serverless/mod.rs | 6 +++--- proxy/src/serverless/sql_over_http.rs | 4 ++-- proxy/src/usage_metrics.rs | 4 ++-- 8 files changed, 20 insertions(+), 20 deletions(-) diff --git a/proxy/src/control_plane/provider/neon.rs b/proxy/src/control_plane/provider/neon.rs index e5f8b5c741..d01878741c 100644 --- a/proxy/src/control_plane/provider/neon.rs +++ b/proxy/src/control_plane/provider/neon.rs @@ -22,7 +22,7 @@ use futures::TryFutureExt; use std::{sync::Arc, time::Duration}; use tokio::time::Instant; use tokio_postgres::config::SslMode; -use tracing::{debug, error, info, info_span, warn, Instrument}; +use tracing::{debug, info, info_span, warn, Instrument}; const X_REQUEST_ID: HeaderName = HeaderName::from_static("x-request-id"); @@ -456,7 +456,7 @@ async fn parse_body serde::Deserialize<'a>>( }); body.http_status_code = status; - error!("console responded with an error ({status}): {body:?}"); + warn!("console responded with an error ({status}): {body:?}"); Err(ApiError::ControlPlane(body)) } diff --git a/proxy/src/proxy/mod.rs b/proxy/src/proxy/mod.rs index 7003af2aba..9e1af88f41 100644 --- a/proxy/src/proxy/mod.rs +++ b/proxy/src/proxy/mod.rs @@ -35,7 +35,7 @@ use std::sync::Arc; use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt}; use tokio_util::sync::CancellationToken; -use tracing::{error, info, Instrument}; +use 
tracing::{error, info, warn, Instrument}; use self::{ connect_compute::{connect_to_compute, TcpMechanism}, @@ -95,15 +95,15 @@ pub async fn task_main( connections.spawn(async move { let (socket, peer_addr) = match read_proxy_protocol(socket).await { Err(e) => { - error!("per-client task finished with an error: {e:#}"); + warn!("per-client task finished with an error: {e:#}"); return; } Ok((_socket, None)) if config.proxy_protocol_v2 == ProxyProtocolV2::Required => { - error!("missing required proxy protocol header"); + warn!("missing required proxy protocol header"); return; } Ok((_socket, Some(_))) if config.proxy_protocol_v2 == ProxyProtocolV2::Rejected => { - error!("proxy protocol header not supported"); + warn!("proxy protocol header not supported"); return; } Ok((socket, Some(addr))) => (socket, addr.ip()), @@ -144,7 +144,7 @@ pub async fn task_main( Err(e) => { // todo: log and push to ctx the error kind ctx.set_error_kind(e.get_error_kind()); - error!(parent: &span, "per-client task finished with an error: {e:#}"); + warn!(parent: &span, "per-client task finished with an error: {e:#}"); } Ok(None) => { ctx.set_success(); @@ -155,7 +155,7 @@ pub async fn task_main( match p.proxy_pass().instrument(span.clone()).await { Ok(()) => {} Err(ErrorSource::Client(e)) => { - error!(parent: &span, "per-client task finished with an IO error from the client: {e:#}"); + warn!(parent: &span, "per-client task finished with an IO error from the client: {e:#}"); } Err(ErrorSource::Compute(e)) => { error!(parent: &span, "per-client task finished with an IO error from the compute: {e:#}"); diff --git a/proxy/src/proxy/passthrough.rs b/proxy/src/proxy/passthrough.rs index bbea47f8af..497cf4bfd5 100644 --- a/proxy/src/proxy/passthrough.rs +++ b/proxy/src/proxy/passthrough.rs @@ -71,7 +71,7 @@ impl ProxyPassthrough { pub(crate) async fn proxy_pass(self) -> Result<(), ErrorSource> { let res = proxy_pass(self.client, self.compute.stream, self.aux).await; if let Err(err) = 
self.compute.cancel_closure.try_cancel_query().await { - tracing::error!(?err, "could not cancel the query in the database"); + tracing::warn!(?err, "could not cancel the query in the database"); } res } diff --git a/proxy/src/redis/connection_with_credentials_provider.rs b/proxy/src/redis/connection_with_credentials_provider.rs index 2de66b58b1..ccd48f1481 100644 --- a/proxy/src/redis/connection_with_credentials_provider.rs +++ b/proxy/src/redis/connection_with_credentials_provider.rs @@ -6,7 +6,7 @@ use redis::{ ConnectionInfo, IntoConnectionInfo, RedisConnectionInfo, RedisResult, }; use tokio::task::JoinHandle; -use tracing::{debug, error, info}; +use tracing::{debug, error, info, warn}; use super::elasticache::CredentialsProvider; @@ -89,7 +89,7 @@ impl ConnectionWithCredentialsProvider { return Ok(()); } Err(e) => { - error!("Error during PING: {e:?}"); + warn!("Error during PING: {e:?}"); } } } else { @@ -121,7 +121,7 @@ impl ConnectionWithCredentialsProvider { info!("Connection succesfully established"); } Err(e) => { - error!("Connection is broken. Error during PING: {e:?}"); + warn!("Connection is broken. 
Error during PING: {e:?}"); } } self.con = Some(con); diff --git a/proxy/src/redis/notifications.rs b/proxy/src/redis/notifications.rs index 36a3443603..c3af6740cb 100644 --- a/proxy/src/redis/notifications.rs +++ b/proxy/src/redis/notifications.rs @@ -146,7 +146,7 @@ impl MessageHandler { { Ok(()) => {} Err(e) => { - tracing::error!("failed to cancel session: {e}"); + tracing::warn!("failed to cancel session: {e}"); } } } diff --git a/proxy/src/serverless/mod.rs b/proxy/src/serverless/mod.rs index 9be6b592bd..b5820b0535 100644 --- a/proxy/src/serverless/mod.rs +++ b/proxy/src/serverless/mod.rs @@ -48,7 +48,7 @@ use std::pin::{pin, Pin}; use std::sync::Arc; use tokio::net::{TcpListener, TcpStream}; use tokio_util::sync::CancellationToken; -use tracing::{error, info, warn, Instrument}; +use tracing::{info, warn, Instrument}; use utils::http::error::ApiError; pub(crate) const SERVERLESS_DRIVER_SNI: &str = "api"; @@ -241,7 +241,7 @@ async fn connection_startup( let (conn, peer) = match read_proxy_protocol(conn).await { Ok(c) => c, Err(e) => { - tracing::error!(?session_id, %peer_addr, "failed to accept TCP connection: invalid PROXY protocol V2 header: {e:#}"); + tracing::warn!(?session_id, %peer_addr, "failed to accept TCP connection: invalid PROXY protocol V2 header: {e:#}"); return None; } }; @@ -405,7 +405,7 @@ async fn request_handler( ) .await { - error!("error in websocket connection: {e:#}"); + warn!("error in websocket connection: {e:#}"); } } .instrument(span), diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index f7c3b26917..646e7f8a52 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -831,7 +831,7 @@ impl QueryData { Either::Right((_cancelled, query)) => { tracing::info!("cancelling query"); if let Err(err) = cancel_token.cancel_query(NoTls).await { - tracing::error!(?err, "could not cancel query"); + tracing::warn!(?err, "could not cancel query"); } // wait for the 
query cancellation match time::timeout(time::Duration::from_millis(100), query).await { @@ -920,7 +920,7 @@ impl BatchQueryData { } Err(SqlOverHttpError::Cancelled(_)) => { if let Err(err) = cancel_token.cancel_query(NoTls).await { - tracing::error!(?err, "could not cancel query"); + tracing::warn!(?err, "could not cancel query"); } // TODO: after cancelling, wait to see if we can get a status. maybe the connection is still safe. discard.discard(); diff --git a/proxy/src/usage_metrics.rs b/proxy/src/usage_metrics.rs index bd3e62bc12..ee36ed462d 100644 --- a/proxy/src/usage_metrics.rs +++ b/proxy/src/usage_metrics.rs @@ -27,7 +27,7 @@ use std::{ }; use tokio::io::AsyncWriteExt; use tokio_util::sync::CancellationToken; -use tracing::{error, info, instrument, trace}; +use tracing::{error, info, instrument, trace, warn}; use utils::backoff; use uuid::{NoContext, Timestamp}; @@ -346,7 +346,7 @@ async fn collect_metrics_iteration( error!("metrics endpoint refused the sent metrics: {:?}", res); for metric in chunk.events.iter().filter(|e| e.value > (1u64 << 40)) { // Report if the metric value is suspiciously large - error!("potentially abnormal metric value: {:?}", metric); + warn!("potentially abnormal metric value: {:?}", metric); } } } From 326cd80f0dd8b60e5780d184bd55dab769a9f0b1 Mon Sep 17 00:00:00 2001 From: Fedor Dikarev Date: Fri, 11 Oct 2024 14:46:45 +0200 Subject: [PATCH 12/48] ci: gh-workflow-stats-action v0.1.4: remove debug output and proper pagination (#9356) ## Problem In previous version pagination didn't work so we collect information only for first 30 jobs in WorkflowRun --- .github/workflows/report-workflow-stats.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/report-workflow-stats.yml b/.github/workflows/report-workflow-stats.yml index 1afe896600..6abeff7695 100644 --- a/.github/workflows/report-workflow-stats.yml +++ b/.github/workflows/report-workflow-stats.yml @@ -33,7 +33,7 @@ jobs: actions: read steps: - 
name: Export GH Workflow Stats - uses: fedordikarev/gh-workflow-stats-action@v0.1.2 + uses: neondatabase/gh-workflow-stats-action@v0.1.4 with: DB_URI: ${{ secrets.GH_REPORT_STATS_DB_RW_CONNSTR }} DB_TABLE: "gh_workflow_stats_neon" From 091a175a3e02d319468efe15bb9765a3a9e29f4b Mon Sep 17 00:00:00 2001 From: a-masterov <72613290+a-masterov@users.noreply.github.com> Date: Fri, 11 Oct 2024 15:29:54 +0200 Subject: [PATCH 13/48] Test versions mismatch (#9167) ## Problem We faced the problem of incompatibility of the different components of different versions. This should be detected automatically to prevent production bugs. ## Summary of changes The test for this situation was implemented Co-authored-by: Alexander Bayandin --- test_runner/README.md | 12 +++ test_runner/fixtures/neon_fixtures.py | 52 +++++++++++ test_runner/fixtures/paths.py | 37 +++++++- test_runner/fixtures/utils.py | 33 +++++++ test_runner/regress/test_compatibility.py | 91 +++++++++++++------ .../regress/test_storage_controller.py | 12 ++- 6 files changed, 206 insertions(+), 31 deletions(-) diff --git a/test_runner/README.md b/test_runner/README.md index d754e60d17..e087241c1f 100644 --- a/test_runner/README.md +++ b/test_runner/README.md @@ -64,10 +64,12 @@ By default performance tests are excluded. To run them explicitly pass performan Useful environment variables: `NEON_BIN`: The directory where neon binaries can be found. +`COMPATIBILITY_NEON_BIN`: The directory where the previous version of Neon binaries can be found `POSTGRES_DISTRIB_DIR`: The directory where postgres distribution can be found. Since pageserver supports several postgres versions, `POSTGRES_DISTRIB_DIR` must contain a subdirectory for each version with naming convention `v{PG_VERSION}/`. Inside that dir, a `bin/postgres` binary should be present. +`COMPATIBILITY_POSTGRES_DISTRIB_DIR`: The directory where the previous version of postgres distribution can be found.
`DEFAULT_PG_VERSION`: The version of Postgres to use, This is used to construct full path to the postgres binaries. Format is 2-digit major version nubmer, i.e. `DEFAULT_PG_VERSION=16` @@ -294,6 +296,16 @@ def test_foobar2(neon_env_builder: NeonEnvBuilder): client.timeline_detail(tenant_id=tenant_id, timeline_id=timeline_id) ``` +All tests that rely on NeonEnvBuilder can check the various version combinations of the components. +To do this you may want to add the parametrize decorator with the function fixtures.utils.allpairs_versions() +E.g. + +```python +@pytest.mark.parametrize(**fixtures.utils.allpairs_versions()) +def test_something( +... +``` + For more information about pytest fixtures, see https://docs.pytest.org/en/stable/fixture.html At the end of a test, all the nodes in the environment are automatically stopped, so you diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 9a60de922c..7789855fe4 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -75,6 +75,7 @@ from fixtures.safekeeper.http import SafekeeperHttpClient from fixtures.safekeeper.utils import wait_walreceivers_absent from fixtures.utils import ( ATTACHMENT_NAME_REGEX, + COMPONENT_BINARIES, allure_add_grafana_links, assert_no_errors, get_dir_size, @@ -316,11 +317,14 @@ class NeonEnvBuilder: run_id: uuid.UUID, mock_s3_server: MockS3Server, neon_binpath: Path, + compatibility_neon_binpath: Path, pg_distrib_dir: Path, + compatibility_pg_distrib_dir: Path, pg_version: PgVersion, test_name: str, top_output_dir: Path, test_output_dir: Path, + combination, test_overlay_dir: Optional[Path] = None, pageserver_remote_storage: Optional[RemoteStorage] = None, # toml that will be decomposed into `--config-override` flags during `pageserver --init` @@ -402,6 +406,19 @@ class NeonEnvBuilder: "test_" ), "Unexpectedly instantiated from outside a test function" self.test_name = test_name +
self.compatibility_neon_binpath = compatibility_neon_binpath + self.compatibility_pg_distrib_dir = compatibility_pg_distrib_dir + self.version_combination = combination + self.mixdir = self.test_output_dir / "mixdir_neon" + if self.version_combination is not None: + assert ( + self.compatibility_neon_binpath is not None + ), "the environment variable COMPATIBILITY_NEON_BIN is required when using mixed versions" + assert ( + self.compatibility_pg_distrib_dir is not None + ), "the environment variable COMPATIBILITY_POSTGRES_DISTRIB_DIR is required when using mixed versions" + self.mixdir.mkdir(mode=0o755, exist_ok=True) + self._mix_versions() def init_configs(self, default_remote_storage_if_missing: bool = True) -> NeonEnv: # Cannot create more than one environment from one builder @@ -602,6 +619,21 @@ class NeonEnvBuilder: return self.env + def _mix_versions(self): + assert self.version_combination is not None, "version combination must be set" + for component, paths in COMPONENT_BINARIES.items(): + directory = ( + self.neon_binpath + if self.version_combination[component] == "new" + else self.compatibility_neon_binpath + ) + for filename in paths: + destination = self.mixdir / filename + destination.symlink_to(directory / filename) + if self.version_combination["compute"] == "old": + self.pg_distrib_dir = self.compatibility_pg_distrib_dir + self.neon_binpath = self.mixdir + def overlay_mount(self, ident: str, srcdir: Path, dstdir: Path): """ Mount `srcdir` as an overlayfs mount at `dstdir`. 
@@ -1350,7 +1382,9 @@ def neon_simple_env( top_output_dir: Path, test_output_dir: Path, neon_binpath: Path, + compatibility_neon_binpath: Path, pg_distrib_dir: Path, + compatibility_pg_distrib_dir: Path, pg_version: PgVersion, pageserver_virtual_file_io_engine: str, pageserver_aux_file_policy: Optional[AuxFileStore], @@ -1365,6 +1399,11 @@ def neon_simple_env( # Create the environment in the per-test output directory repo_dir = get_test_repo_dir(request, top_output_dir) + combination = ( + request._pyfuncitem.callspec.params["combination"] + if "combination" in request._pyfuncitem.callspec.params + else None + ) with NeonEnvBuilder( top_output_dir=top_output_dir, @@ -1372,7 +1411,9 @@ def neon_simple_env( port_distributor=port_distributor, mock_s3_server=mock_s3_server, neon_binpath=neon_binpath, + compatibility_neon_binpath=compatibility_neon_binpath, pg_distrib_dir=pg_distrib_dir, + compatibility_pg_distrib_dir=compatibility_pg_distrib_dir, pg_version=pg_version, run_id=run_id, preserve_database_files=cast(bool, pytestconfig.getoption("--preserve-database-files")), @@ -1382,6 +1423,7 @@ def neon_simple_env( pageserver_aux_file_policy=pageserver_aux_file_policy, pageserver_default_tenant_config_compaction_algorithm=pageserver_default_tenant_config_compaction_algorithm, pageserver_virtual_file_io_mode=pageserver_virtual_file_io_mode, + combination=combination, ) as builder: env = builder.init_start() @@ -1395,7 +1437,9 @@ def neon_env_builder( port_distributor: PortDistributor, mock_s3_server: MockS3Server, neon_binpath: Path, + compatibility_neon_binpath: Path, pg_distrib_dir: Path, + compatibility_pg_distrib_dir: Path, pg_version: PgVersion, run_id: uuid.UUID, request: FixtureRequest, @@ -1422,6 +1466,11 @@ def neon_env_builder( # Create the environment in the test-specific output dir repo_dir = os.path.join(test_output_dir, "repo") + combination = ( + request._pyfuncitem.callspec.params["combination"] + if "combination" in request._pyfuncitem.callspec.params + 
else None + ) # Return the builder to the caller with NeonEnvBuilder( @@ -1430,7 +1479,10 @@ def neon_env_builder( port_distributor=port_distributor, mock_s3_server=mock_s3_server, neon_binpath=neon_binpath, + compatibility_neon_binpath=compatibility_neon_binpath, pg_distrib_dir=pg_distrib_dir, + compatibility_pg_distrib_dir=compatibility_pg_distrib_dir, + combination=combination, pg_version=pg_version, run_id=run_id, preserve_database_files=cast(bool, pytestconfig.getoption("--preserve-database-files")), diff --git a/test_runner/fixtures/paths.py b/test_runner/fixtures/paths.py index cffeb47ee8..65f8e432b0 100644 --- a/test_runner/fixtures/paths.py +++ b/test_runner/fixtures/paths.py @@ -95,7 +95,29 @@ def neon_binpath(base_dir: Path, build_type: str) -> Iterator[Path]: if not (binpath / "pageserver").exists(): raise Exception(f"neon binaries not found at '{binpath}'") - yield binpath + yield binpath.absolute() + + +@pytest.fixture(scope="session") +def compatibility_snapshot_dir() -> Iterator[Path]: + if os.getenv("REMOTE_ENV"): + return + compatibility_snapshot_dir_env = os.environ.get("COMPATIBILITY_SNAPSHOT_DIR") + assert ( + compatibility_snapshot_dir_env is not None + ), "COMPATIBILITY_SNAPSHOT_DIR is not set. 
It should be set to `compatibility_snapshot_pg(PG_VERSION)` path generateted by test_create_snapshot (ideally generated by the previous version of Neon)" + compatibility_snapshot_dir = Path(compatibility_snapshot_dir_env).resolve() + yield compatibility_snapshot_dir + + +@pytest.fixture(scope="session") +def compatibility_neon_binpath() -> Optional[Iterator[Path]]: + if os.getenv("REMOTE_ENV"): + return + comp_binpath = None + if env_compatibility_neon_binpath := os.environ.get("COMPATIBILITY_NEON_BIN"): + comp_binpath = Path(env_compatibility_neon_binpath).resolve().absolute() + yield comp_binpath @pytest.fixture(scope="session") @@ -109,6 +131,19 @@ def pg_distrib_dir(base_dir: Path) -> Iterator[Path]: yield distrib_dir +@pytest.fixture(scope="session") +def compatibility_pg_distrib_dir() -> Optional[Iterator[Path]]: + compat_distrib_dir = None + if env_compat_postgres_bin := os.environ.get("COMPATIBILITY_POSTGRES_DISTRIB_DIR"): + compat_distrib_dir = Path(env_compat_postgres_bin).resolve() + if not compat_distrib_dir.exists(): + raise Exception(f"compatibility postgres directory not found at {compat_distrib_dir}") + + if compat_distrib_dir: + log.info(f"compatibility_pg_distrib_dir is {compat_distrib_dir}") + yield compat_distrib_dir + + @pytest.fixture(scope="session") def top_output_dir(base_dir: Path) -> Iterator[Path]: # Compute the top-level directory for all tests. 
diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index ca1be35880..76575d330c 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -37,6 +37,23 @@ if TYPE_CHECKING: Fn = TypeVar("Fn", bound=Callable[..., Any]) +COMPONENT_BINARIES = { + "storage_controller": ("storage_controller",), + "storage_broker": ("storage_broker",), + "compute": ("compute_ctl",), + "safekeeper": ("safekeeper",), + "pageserver": ("pageserver", "pagectl"), +} +# Disable auto-formatting for better readability +# fmt: off +VERSIONS_COMBINATIONS = ( + {"storage_controller": "new", "storage_broker": "new", "compute": "new", "safekeeper": "new", "pageserver": "new"}, + {"storage_controller": "new", "storage_broker": "new", "compute": "old", "safekeeper": "old", "pageserver": "old"}, + {"storage_controller": "new", "storage_broker": "new", "compute": "old", "safekeeper": "old", "pageserver": "new"}, + {"storage_controller": "new", "storage_broker": "new", "compute": "old", "safekeeper": "new", "pageserver": "new"}, + {"storage_controller": "old", "storage_broker": "old", "compute": "new", "safekeeper": "new", "pageserver": "new"}, +) +# fmt: on def subprocess_capture( @@ -607,3 +624,19 @@ def human_bytes(amt: float) -> str: amt = amt / 1024 raise RuntimeError("unreachable") + + +def allpairs_versions(): + """ + Returns a dictionary with arguments for pytest parametrize + to test the compatibility with the previous version of Neon components + combinations were pre-computed to test all the pairs of the components with + the different versions. 
+ """ + ids = [] + for pair in VERSIONS_COMBINATIONS: + cur_id = [] + for component in sorted(pair.keys()): + cur_id.append(pair[component][0]) + ids.append(f"combination_{''.join(cur_id)}") + return {"argnames": "combination", "argvalues": VERSIONS_COMBINATIONS, "ids": ids} diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index 791e38383e..96ba3dd5a4 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -9,6 +9,7 @@ from dataclasses import dataclass from pathlib import Path from typing import TYPE_CHECKING +import fixtures.utils import pytest import toml from fixtures.common_types import TenantId, TimelineId @@ -93,6 +94,34 @@ if TYPE_CHECKING: # # Run forward compatibility test # ./scripts/pytest -k test_forward_compatibility # +# +# How to run `test_version_mismatch` locally: +# +# export DEFAULT_PG_VERSION=16 +# export BUILD_TYPE=release +# export CHECK_ONDISK_DATA_COMPATIBILITY=true +# export COMPATIBILITY_NEON_BIN=neon_previous/target/${BUILD_TYPE} +# export COMPATIBILITY_POSTGRES_DISTRIB_DIR=neon_previous/pg_install +# export NEON_BIN=target/release +# export POSTGRES_DISTRIB_DIR=pg_install +# +# # Build previous version of binaries and store them somewhere: +# rm -rf pg_install target +# git checkout +# CARGO_BUILD_FLAGS="--features=testing" make -s -j`nproc` +# mkdir -p neon_previous/target +# cp -a target/${BUILD_TYPE} ./neon_previous/target/${BUILD_TYPE} +# cp -a pg_install ./neon_previous/pg_install +# +# # Build current version of binaries and create a data snapshot: +# rm -rf pg_install target +# git checkout +# CARGO_BUILD_FLAGS="--features=testing" make -s -j`nproc` +# ./scripts/pytest -k test_create_snapshot +# +# # Run the version mismatch test +# ./scripts/pytest -k test_version_mismatch + check_ondisk_data_compatibility_if_enabled = pytest.mark.skipif( os.environ.get("CHECK_ONDISK_DATA_COMPATIBILITY") is None, @@ -166,16 +195,11 @@ def 
test_backward_compatibility( neon_env_builder: NeonEnvBuilder, test_output_dir: Path, pg_version: PgVersion, + compatibility_snapshot_dir: Path, ): """ Test that the new binaries can read old data """ - compatibility_snapshot_dir_env = os.environ.get("COMPATIBILITY_SNAPSHOT_DIR") - assert ( - compatibility_snapshot_dir_env is not None - ), f"COMPATIBILITY_SNAPSHOT_DIR is not set. It should be set to `compatibility_snapshot_pg{pg_version.v_prefixed}` path generateted by test_create_snapshot (ideally generated by the previous version of Neon)" - compatibility_snapshot_dir = Path(compatibility_snapshot_dir_env).resolve() - breaking_changes_allowed = ( os.environ.get("ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE", "false").lower() == "true" ) @@ -214,27 +238,11 @@ def test_forward_compatibility( test_output_dir: Path, top_output_dir: Path, pg_version: PgVersion, + compatibility_snapshot_dir: Path, ): """ Test that the old binaries can read new data """ - compatibility_neon_bin_env = os.environ.get("COMPATIBILITY_NEON_BIN") - assert compatibility_neon_bin_env is not None, ( - "COMPATIBILITY_NEON_BIN is not set. It should be set to a path with Neon binaries " - "(ideally generated by the previous version of Neon)" - ) - compatibility_neon_bin = Path(compatibility_neon_bin_env).resolve() - - compatibility_postgres_distrib_dir_env = os.environ.get("COMPATIBILITY_POSTGRES_DISTRIB_DIR") - assert ( - compatibility_postgres_distrib_dir_env is not None - ), "COMPATIBILITY_POSTGRES_DISTRIB_DIR is not set. 
It should be set to a pg_install directrory (ideally generated by the previous version of Neon)" - compatibility_postgres_distrib_dir = Path(compatibility_postgres_distrib_dir_env).resolve() - - compatibility_snapshot_dir = ( - top_output_dir / f"compatibility_snapshot_pg{pg_version.v_prefixed}" - ) - breaking_changes_allowed = ( os.environ.get("ALLOW_FORWARD_COMPATIBILITY_BREAKAGE", "false").lower() == "true" ) @@ -245,9 +253,14 @@ def test_forward_compatibility( # Use previous version's production binaries (pageserver, safekeeper, pg_distrib_dir, etc.). # But always use the current version's neon_local binary. # This is because we want to test the compatibility of the data format, not the compatibility of the neon_local CLI. - neon_env_builder.neon_binpath = compatibility_neon_bin - neon_env_builder.pg_distrib_dir = compatibility_postgres_distrib_dir - neon_env_builder.neon_local_binpath = neon_env_builder.neon_local_binpath + assert ( + neon_env_builder.compatibility_neon_binpath is not None + ), "the environment variable COMPATIBILITY_NEON_BIN is required" + assert ( + neon_env_builder.compatibility_pg_distrib_dir is not None + ), "the environment variable COMPATIBILITY_POSTGRES_DISTRIB_DIR is required" + neon_env_builder.neon_binpath = neon_env_builder.compatibility_neon_binpath + neon_env_builder.pg_distrib_dir = neon_env_builder.compatibility_pg_distrib_dir env = neon_env_builder.from_repo_dir( compatibility_snapshot_dir / "repo", @@ -558,3 +571,29 @@ def test_historic_storage_formats( env.pageserver.http_client().timeline_compact( dataset.tenant_id, existing_timeline_id, force_image_layer_creation=True ) + + +@check_ondisk_data_compatibility_if_enabled +@pytest.mark.xdist_group("compatibility") +@pytest.mark.parametrize(**fixtures.utils.allpairs_versions()) +def test_versions_mismatch( + neon_env_builder: NeonEnvBuilder, + test_output_dir: Path, + pg_version: PgVersion, + compatibility_snapshot_dir, + combination, +): + """ + Checks compatibility of 
different combinations of versions of the components + """ + neon_env_builder.num_safekeepers = 3 + env = neon_env_builder.from_repo_dir( + compatibility_snapshot_dir / "repo", + ) + env.pageserver.allowed_errors.extend( + [".*ingesting record with timestamp lagging more than wait_lsn_timeout.+"] + ) + env.start() + check_neon_works( + env, test_output_dir, compatibility_snapshot_dir / "dump.sql", test_output_dir / "repo" + ) diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 1dcf0b254d..1dcc37c407 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -9,6 +9,7 @@ from datetime import datetime, timezone from enum import Enum from typing import TYPE_CHECKING +import fixtures.utils import pytest from fixtures.auth_tokens import TokenScope from fixtures.common_types import TenantId, TenantShardId, TimelineId @@ -38,7 +39,11 @@ from fixtures.pg_version import PgVersion, run_only_on_default_postgres from fixtures.port_distributor import PortDistributor from fixtures.remote_storage import RemoteStorageKind, s3_storage from fixtures.storage_controller_proxy import StorageControllerProxy -from fixtures.utils import run_pg_bench_small, subprocess_capture, wait_until +from fixtures.utils import ( + run_pg_bench_small, + subprocess_capture, + wait_until, +) from fixtures.workload import Workload from mypy_boto3_s3.type_defs import ( ObjectTypeDef, @@ -60,9 +65,8 @@ def get_node_shard_counts(env: NeonEnv, tenant_ids): return counts -def test_storage_controller_smoke( - neon_env_builder: NeonEnvBuilder, -): +@pytest.mark.parametrize(**fixtures.utils.allpairs_versions()) +def test_storage_controller_smoke(neon_env_builder: NeonEnvBuilder, combination): """ Test the basic lifecycle of a storage controller: - Restarting From 5ef805e12c0e14e222609c51337cf9afcddf3b92 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Fri, 11 Oct 2024 16:58:41 +0100 
Subject: [PATCH 14/48] CI(run-python-test-set): allow to skip missing compatibility snapshot (#9365) ## Problem Action `run-python-test-set` fails if it is not used for `regress_tests` on release PR, because it expects `test_compatibility.py::test_create_snapshot` to generate a snapshot, and the test exists only in `regress_tests` suite. For example, in https://github.com/neondatabase/neon/pull/9291 [`test-postgres-client-libs`](https://github.com/neondatabase/neon/actions/runs/11209615321/job/31155111544) job failed. ## Summary of changes - Add `skip-if-does-not-exist` input to `.github/actions/upload` action (the same way we do for `.github/actions/download`) - Set `skip-if-does-not-exist=true` for "Upload compatibility snapshot" step in `run-python-test-set` action --- .github/actions/run-python-test-set/action.yml | 3 +++ .github/actions/upload/action.yml | 18 ++++++++++++++++-- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index 4008cd0d36..330e875d56 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -218,6 +218,9 @@ runs: name: compatibility-snapshot-${{ runner.arch }}-${{ inputs.build_type }}-pg${{ inputs.pg_version }} # Directory is created by test_compatibility.py::test_create_snapshot, keep the path in sync with the test path: /tmp/test_output/compatibility_snapshot_pg${{ inputs.pg_version }}/ + # The lack of compatibility snapshot shouldn't fail the job + # (for example if we didn't run the test for non build-and-test workflow) + skip-if-does-not-exist: true - name: Upload test results if: ${{ !cancelled() }} diff --git a/.github/actions/upload/action.yml b/.github/actions/upload/action.yml index edcece7d2b..8a4cfe2eff 100644 --- a/.github/actions/upload/action.yml +++ b/.github/actions/upload/action.yml @@ -7,6 +7,10 @@ inputs: path: description: "A directory or file to upload" 
required: true + skip-if-does-not-exist: + description: "Allow to skip if path doesn't exist, fail otherwise" + default: false + required: false prefix: description: "S3 prefix. Default is '${GITHUB_SHA}/${GITHUB_RUN_ID}/${GITHUB_RUN_ATTEMPT}'" required: false @@ -15,10 +19,12 @@ runs: using: "composite" steps: - name: Prepare artifact + id: prepare-artifact shell: bash -euxo pipefail {0} env: SOURCE: ${{ inputs.path }} ARCHIVE: /tmp/uploads/${{ inputs.name }}.tar.zst + SKIP_IF_DOES_NOT_EXIST: ${{ inputs.skip-if-does-not-exist }} run: | mkdir -p $(dirname $ARCHIVE) @@ -33,14 +39,22 @@ runs: elif [ -f ${SOURCE} ]; then time tar -cf ${ARCHIVE} --zstd ${SOURCE} elif ! ls ${SOURCE} > /dev/null 2>&1; then - echo >&2 "${SOURCE} does not exist" - exit 2 + if [ "${SKIP_IF_DOES_NOT_EXIST}" = "true" ]; then + echo 'SKIPPED=true' >> $GITHUB_OUTPUT + exit 0 + else + echo >&2 "${SOURCE} does not exist" + exit 2 + fi else echo >&2 "${SOURCE} is neither a directory nor a file, do not know how to handle it" exit 3 fi + echo 'SKIPPED=false' >> $GITHUB_OUTPUT + - name: Upload artifact + if: ${{ steps.prepare-artifact.outputs.SKIPPED == 'false' }} shell: bash -euxo pipefail {0} env: SOURCE: ${{ inputs.path }} From ab5bbb445bcd76410d884f3431a4dcba3ec8fb37 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Fri, 11 Oct 2024 21:14:52 +0200 Subject: [PATCH 15/48] proxy: refactor auth backends (#9271) preliminary for #9270 The auth::Backend didn't need to be in the mega ProxyConfig object, so I split it off and passed it manually in the few places it was necessary. I've also refined some of the uses of config I saw while doing this small refactor. I've also followed the trend and made the console redirect backend its own struct, same as LocalBackend and ControlPlaneBackend.
--- proxy/src/auth/backend/console_redirect.rs | 25 +++- proxy/src/auth/backend/mod.rs | 19 ++- proxy/src/bin/local_proxy.rs | 25 +++- proxy/src/bin/proxy.rs | 154 +++++++++++---------- proxy/src/config.rs | 6 +- proxy/src/proxy/mod.rs | 7 +- proxy/src/serverless/backend.rs | 60 ++++---- proxy/src/serverless/mod.rs | 3 + proxy/src/serverless/sql_over_http.rs | 44 ++---- proxy/src/serverless/websocket.rs | 2 + 10 files changed, 186 insertions(+), 159 deletions(-) diff --git a/proxy/src/auth/backend/console_redirect.rs b/proxy/src/auth/backend/console_redirect.rs index a7cc678187..127be545e1 100644 --- a/proxy/src/auth/backend/console_redirect.rs +++ b/proxy/src/auth/backend/console_redirect.rs @@ -25,6 +25,10 @@ pub(crate) enum WebAuthError { Io(#[from] std::io::Error), } +pub struct ConsoleRedirectBackend { + console_uri: reqwest::Url, +} + impl UserFacingError for WebAuthError { fn to_string_client(&self) -> String { "Internal error".to_string() @@ -57,7 +61,26 @@ pub(crate) fn new_psql_session_id() -> String { hex::encode(rand::random::<[u8; 8]>()) } -pub(super) async fn authenticate( +impl ConsoleRedirectBackend { + pub fn new(console_uri: reqwest::Url) -> Self { + Self { console_uri } + } + + pub(super) fn url(&self) -> &reqwest::Url { + &self.console_uri + } + + pub(crate) async fn authenticate( + &self, + ctx: &RequestMonitoring, + auth_config: &'static AuthenticationConfig, + client: &mut PqStream, + ) -> auth::Result { + authenticate(ctx, auth_config, &self.console_uri, client).await + } +} + +async fn authenticate( ctx: &RequestMonitoring, auth_config: &'static AuthenticationConfig, link_uri: &reqwest::Url, diff --git a/proxy/src/auth/backend/mod.rs b/proxy/src/auth/backend/mod.rs index c9aa5b7e61..27c9f1876e 100644 --- a/proxy/src/auth/backend/mod.rs +++ b/proxy/src/auth/backend/mod.rs @@ -8,6 +8,7 @@ use std::net::IpAddr; use std::sync::Arc; use std::time::Duration; +pub use console_redirect::ConsoleRedirectBackend; pub(crate) use 
console_redirect::WebAuthError; use ipnet::{Ipv4Net, Ipv6Net}; use local::LocalBackend; @@ -36,7 +37,7 @@ use crate::{ provider::{CachedAllowedIps, CachedNodeInfo}, Api, }, - stream, url, + stream, }; use crate::{scram, EndpointCacheKey, EndpointId, RoleName}; @@ -69,7 +70,7 @@ pub enum Backend<'a, T, D> { /// Cloud API (V2). ControlPlane(MaybeOwned<'a, ControlPlaneBackend>, T), /// Authentication via a web browser. - ConsoleRedirect(MaybeOwned<'a, url::ApiUrl>, D), + ConsoleRedirect(MaybeOwned<'a, ConsoleRedirectBackend>, D), /// Local proxy uses configured auth credentials and does not wake compute Local(MaybeOwned<'a, LocalBackend>), } @@ -106,9 +107,9 @@ impl std::fmt::Display for Backend<'_, (), ()> { #[cfg(test)] ControlPlaneBackend::Test(_) => fmt.debug_tuple("ControlPlane::Test").finish(), }, - Self::ConsoleRedirect(url, ()) => fmt + Self::ConsoleRedirect(backend, ()) => fmt .debug_tuple("ConsoleRedirect") - .field(&url.as_str()) + .field(&backend.url().as_str()) .finish(), Self::Local(_) => fmt.debug_tuple("Local").finish(), } @@ -241,7 +242,6 @@ impl AuthenticationConfig { pub(crate) fn check_rate_limit( &self, ctx: &RequestMonitoring, - config: &AuthenticationConfig, secret: AuthSecret, endpoint: &EndpointId, is_cleartext: bool, @@ -265,7 +265,7 @@ impl AuthenticationConfig { let limit_not_exceeded = self.rate_limiter.check( ( endpoint_int, - MaskedIp::new(ctx.peer_addr(), config.rate_limit_ip_subnet), + MaskedIp::new(ctx.peer_addr(), self.rate_limit_ip_subnet), ), password_weight, ); @@ -339,7 +339,6 @@ async fn auth_quirks( let secret = if let Some(secret) = secret { config.check_rate_limit( ctx, - config, secret, &info.endpoint, unauthenticated_password.is_some() || allow_cleartext, @@ -456,12 +455,12 @@ impl<'a> Backend<'a, ComputeUserInfoMaybeEndpoint, &()> { Backend::ControlPlane(api, credentials) } // NOTE: this auth backend doesn't use client credentials. 
- Self::ConsoleRedirect(url, ()) => { + Self::ConsoleRedirect(backend, ()) => { info!("performing web authentication"); - let info = console_redirect::authenticate(ctx, config, &url, client).await?; + let info = backend.authenticate(ctx, config, client).await?; - Backend::ConsoleRedirect(url, info) + Backend::ConsoleRedirect(backend, info) } Self::Local(_) => { return Err(auth::AuthError::bad_auth_method("invalid for local proxy")) diff --git a/proxy/src/bin/local_proxy.rs b/proxy/src/bin/local_proxy.rs index ae8a7f0841..c781af846a 100644 --- a/proxy/src/bin/local_proxy.rs +++ b/proxy/src/bin/local_proxy.rs @@ -6,9 +6,12 @@ use compute_api::spec::LocalProxySpec; use dashmap::DashMap; use futures::future::Either; use proxy::{ - auth::backend::{ - jwt::JwkCache, - local::{LocalBackend, JWKS_ROLE_MAP}, + auth::{ + self, + backend::{ + jwt::JwkCache, + local::{LocalBackend, JWKS_ROLE_MAP}, + }, }, cancellation::CancellationHandlerMain, config::{self, AuthenticationConfig, HttpConfig, ProxyConfig, RetryConfig}, @@ -132,6 +135,7 @@ async fn main() -> anyhow::Result<()> { let args = LocalProxyCliArgs::parse(); let config = build_config(&args)?; + let auth_backend = build_auth_backend(&args)?; // before we bind to any ports, write the process ID to a file // so that compute-ctl can find our process later @@ -193,6 +197,7 @@ async fn main() -> anyhow::Result<()> { let task = serverless::task_main( config, + auth_backend, http_listener, shutdown.clone(), Arc::new(CancellationHandlerMain::new( @@ -257,9 +262,6 @@ fn build_config(args: &LocalProxyCliArgs) -> anyhow::Result<&'static ProxyConfig Ok(Box::leak(Box::new(ProxyConfig { tls_config: None, - auth_backend: proxy::auth::Backend::Local(proxy::auth::backend::MaybeOwned::Owned( - LocalBackend::new(args.compute), - )), metric_collection: None, allow_self_signed_compute: false, http_config, @@ -286,6 +288,17 @@ fn build_config(args: &LocalProxyCliArgs) -> anyhow::Result<&'static ProxyConfig }))) } +/// auth::Backend is created 
at proxy startup, and lives forever. +fn build_auth_backend( + args: &LocalProxyCliArgs, +) -> anyhow::Result<&'static auth::Backend<'static, (), ()>> { + let auth_backend = proxy::auth::Backend::Local(proxy::auth::backend::MaybeOwned::Owned( + LocalBackend::new(args.compute), + )); + + Ok(Box::leak(Box::new(auth_backend))) +} + async fn refresh_config_loop(path: Utf8PathBuf, rx: Arc) { loop { rx.notified().await; diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index 7488cce3c4..3f4c2df809 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -10,6 +10,7 @@ use futures::future::Either; use proxy::auth; use proxy::auth::backend::jwt::JwkCache; use proxy::auth::backend::AuthRateLimiter; +use proxy::auth::backend::ConsoleRedirectBackend; use proxy::auth::backend::MaybeOwned; use proxy::cancellation::CancelMap; use proxy::cancellation::CancellationHandler; @@ -311,8 +312,9 @@ async fn main() -> anyhow::Result<()> { let args = ProxyCliArgs::parse(); let config = build_config(&args)?; + let auth_backend = build_auth_backend(&args)?; - info!("Authentication backend: {}", config.auth_backend); + info!("Authentication backend: {}", auth_backend); info!("Using region: {}", args.aws_region); let region_provider = @@ -462,6 +464,7 @@ async fn main() -> anyhow::Result<()> { if let Some(proxy_listener) = proxy_listener { client_tasks.spawn(proxy::proxy::task_main( config, + auth_backend, proxy_listener, cancellation_token.clone(), cancellation_handler.clone(), @@ -472,6 +475,7 @@ async fn main() -> anyhow::Result<()> { if let Some(serverless_listener) = serverless_listener { client_tasks.spawn(serverless::task_main( config, + auth_backend, serverless_listener, cancellation_token.clone(), cancellation_handler.clone(), @@ -506,7 +510,7 @@ async fn main() -> anyhow::Result<()> { )); } - if let auth::Backend::ControlPlane(api, _) = &config.auth_backend { + if let auth::Backend::ControlPlane(api, _) = auth_backend { if let 
proxy::control_plane::provider::ControlPlaneBackend::Management(api) = &**api { match (redis_notifications_client, regional_redis_client.clone()) { (None, None) => {} @@ -610,6 +614,80 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { bail!("dynamic rate limiter should be disabled"); } + let config::ConcurrencyLockOptions { + shards, + limiter, + epoch, + timeout, + } = args.connect_compute_lock.parse()?; + info!( + ?limiter, + shards, + ?epoch, + "Using NodeLocks (connect_compute)" + ); + let connect_compute_locks = control_plane::locks::ApiLocks::new( + "connect_compute_lock", + limiter, + shards, + timeout, + epoch, + &Metrics::get().proxy.connect_compute_lock, + )?; + + let http_config = HttpConfig { + accept_websockets: !args.is_auth_broker, + pool_options: GlobalConnPoolOptions { + max_conns_per_endpoint: args.sql_over_http.sql_over_http_pool_max_conns_per_endpoint, + gc_epoch: args.sql_over_http.sql_over_http_pool_gc_epoch, + pool_shards: args.sql_over_http.sql_over_http_pool_shards, + idle_timeout: args.sql_over_http.sql_over_http_idle_timeout, + opt_in: args.sql_over_http.sql_over_http_pool_opt_in, + max_total_conns: args.sql_over_http.sql_over_http_pool_max_total_conns, + }, + cancel_set: CancelSet::new(args.sql_over_http.sql_over_http_cancel_set_shards), + client_conn_threshold: args.sql_over_http.sql_over_http_client_conn_threshold, + max_request_size_bytes: args.sql_over_http.sql_over_http_max_request_size_bytes, + max_response_size_bytes: args.sql_over_http.sql_over_http_max_response_size_bytes, + }; + let authentication_config = AuthenticationConfig { + jwks_cache: JwkCache::default(), + thread_pool, + scram_protocol_timeout: args.scram_protocol_timeout, + rate_limiter_enabled: args.auth_rate_limit_enabled, + rate_limiter: AuthRateLimiter::new(args.auth_rate_limit.clone()), + rate_limit_ip_subnet: args.auth_rate_limit_ip_subnet, + ip_allowlist_check_enabled: !args.is_private_access_proxy, + is_auth_broker: 
args.is_auth_broker, + accept_jwts: args.is_auth_broker, + webauth_confirmation_timeout: args.webauth_confirmation_timeout, + }; + + let config = Box::leak(Box::new(ProxyConfig { + tls_config, + metric_collection, + allow_self_signed_compute: args.allow_self_signed_compute, + http_config, + authentication_config, + proxy_protocol_v2: args.proxy_protocol_v2, + handshake_timeout: args.handshake_timeout, + region: args.region.clone(), + wake_compute_retry_config: config::RetryConfig::parse(&args.wake_compute_retry)?, + connect_compute_locks, + connect_to_compute_retry_config: config::RetryConfig::parse( + &args.connect_to_compute_retry, + )?, + })); + + tokio::spawn(config.connect_compute_locks.garbage_collect_worker()); + + Ok(config) +} + +/// auth::Backend is created at proxy startup, and lives forever. +fn build_auth_backend( + args: &ProxyCliArgs, +) -> anyhow::Result<&'static auth::Backend<'static, (), ()>> { let auth_backend = match &args.auth_backend { AuthBackendType::Console => { let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?; @@ -665,7 +743,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { AuthBackendType::Web => { let url = args.uri.parse()?; - auth::Backend::ConsoleRedirect(MaybeOwned::Owned(url), ()) + auth::Backend::ConsoleRedirect(MaybeOwned::Owned(ConsoleRedirectBackend::new(url)), ()) } #[cfg(feature = "testing")] @@ -677,75 +755,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { } }; - let config::ConcurrencyLockOptions { - shards, - limiter, - epoch, - timeout, - } = args.connect_compute_lock.parse()?; - info!( - ?limiter, - shards, - ?epoch, - "Using NodeLocks (connect_compute)" - ); - let connect_compute_locks = control_plane::locks::ApiLocks::new( - "connect_compute_lock", - limiter, - shards, - timeout, - epoch, - &Metrics::get().proxy.connect_compute_lock, - )?; - - let http_config = HttpConfig { - accept_websockets: !args.is_auth_broker, - 
pool_options: GlobalConnPoolOptions { - max_conns_per_endpoint: args.sql_over_http.sql_over_http_pool_max_conns_per_endpoint, - gc_epoch: args.sql_over_http.sql_over_http_pool_gc_epoch, - pool_shards: args.sql_over_http.sql_over_http_pool_shards, - idle_timeout: args.sql_over_http.sql_over_http_idle_timeout, - opt_in: args.sql_over_http.sql_over_http_pool_opt_in, - max_total_conns: args.sql_over_http.sql_over_http_pool_max_total_conns, - }, - cancel_set: CancelSet::new(args.sql_over_http.sql_over_http_cancel_set_shards), - client_conn_threshold: args.sql_over_http.sql_over_http_client_conn_threshold, - max_request_size_bytes: args.sql_over_http.sql_over_http_max_request_size_bytes, - max_response_size_bytes: args.sql_over_http.sql_over_http_max_response_size_bytes, - }; - let authentication_config = AuthenticationConfig { - jwks_cache: JwkCache::default(), - thread_pool, - scram_protocol_timeout: args.scram_protocol_timeout, - rate_limiter_enabled: args.auth_rate_limit_enabled, - rate_limiter: AuthRateLimiter::new(args.auth_rate_limit.clone()), - rate_limit_ip_subnet: args.auth_rate_limit_ip_subnet, - ip_allowlist_check_enabled: !args.is_private_access_proxy, - is_auth_broker: args.is_auth_broker, - accept_jwts: args.is_auth_broker, - webauth_confirmation_timeout: args.webauth_confirmation_timeout, - }; - - let config = Box::leak(Box::new(ProxyConfig { - tls_config, - auth_backend, - metric_collection, - allow_self_signed_compute: args.allow_self_signed_compute, - http_config, - authentication_config, - proxy_protocol_v2: args.proxy_protocol_v2, - handshake_timeout: args.handshake_timeout, - region: args.region.clone(), - wake_compute_retry_config: config::RetryConfig::parse(&args.wake_compute_retry)?, - connect_compute_locks, - connect_to_compute_retry_config: config::RetryConfig::parse( - &args.connect_to_compute_retry, - )?, - })); - - tokio::spawn(config.connect_compute_locks.garbage_collect_worker()); - - Ok(config) + Ok(Box::leak(Box::new(auth_backend))) } 
#[cfg(test)] diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 55d0b6374c..c068fc50fb 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -1,8 +1,5 @@ use crate::{ - auth::{ - self, - backend::{jwt::JwkCache, AuthRateLimiter}, - }, + auth::backend::{jwt::JwkCache, AuthRateLimiter}, control_plane::locks::ApiLocks, rate_limiter::{RateBucketInfo, RateLimitAlgorithm, RateLimiterConfig}, scram::threadpool::ThreadPool, @@ -29,7 +26,6 @@ use x509_parser::oid_registry; pub struct ProxyConfig { pub tls_config: Option, - pub auth_backend: auth::Backend<'static, (), ()>, pub metric_collection: Option, pub allow_self_signed_compute: bool, pub http_config: HttpConfig, diff --git a/proxy/src/proxy/mod.rs b/proxy/src/proxy/mod.rs index 9e1af88f41..3a43ccb74a 100644 --- a/proxy/src/proxy/mod.rs +++ b/proxy/src/proxy/mod.rs @@ -61,6 +61,7 @@ pub async fn run_until_cancelled( pub async fn task_main( config: &'static ProxyConfig, + auth_backend: &'static auth::Backend<'static, (), ()>, listener: tokio::net::TcpListener, cancellation_token: CancellationToken, cancellation_handler: Arc, @@ -129,6 +130,7 @@ pub async fn task_main( let startup = Box::pin( handle_client( config, + auth_backend, &ctx, cancellation_handler, socket, @@ -243,8 +245,10 @@ impl ReportableError for ClientRequestError { } } +#[allow(clippy::too_many_arguments)] pub(crate) async fn handle_client( config: &'static ProxyConfig, + auth_backend: &'static auth::Backend<'static, (), ()>, ctx: &RequestMonitoring, cancellation_handler: Arc, stream: S, @@ -285,8 +289,7 @@ pub(crate) async fn handle_client( let common_names = tls.map(|tls| &tls.common_names); // Extract credentials which we're going to use for auth. 
- let result = config - .auth_backend + let result = auth_backend .as_ref() .map(|()| auth::ComputeUserInfoMaybeEndpoint::parse(ctx, ¶ms, hostname, common_names)) .transpose(); diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index f54476b51d..9e49478cf3 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -13,7 +13,7 @@ use crate::{ check_peer_addr_is_in_list, AuthError, }, compute, - config::{AuthenticationConfig, ProxyConfig}, + config::ProxyConfig, context::RequestMonitoring, control_plane::{ errors::{GetAuthInfoError, WakeComputeError}, @@ -42,6 +42,7 @@ pub(crate) struct PoolingBackend { pub(crate) local_pool: Arc>, pub(crate) pool: Arc>, pub(crate) config: &'static ProxyConfig, + pub(crate) auth_backend: &'static crate::auth::Backend<'static, (), ()>, pub(crate) endpoint_rate_limiter: Arc, } @@ -49,18 +50,13 @@ impl PoolingBackend { pub(crate) async fn authenticate_with_password( &self, ctx: &RequestMonitoring, - config: &AuthenticationConfig, user_info: &ComputeUserInfo, password: &[u8], ) -> Result { let user_info = user_info.clone(); - let backend = self - .config - .auth_backend - .as_ref() - .map(|()| user_info.clone()); + let backend = self.auth_backend.as_ref().map(|()| user_info.clone()); let (allowed_ips, maybe_secret) = backend.get_allowed_ips_and_secret(ctx).await?; - if config.ip_allowlist_check_enabled + if self.config.authentication_config.ip_allowlist_check_enabled && !check_peer_addr_is_in_list(&ctx.peer_addr(), &allowed_ips) { return Err(AuthError::ip_address_not_allowed(ctx.peer_addr())); @@ -79,7 +75,6 @@ impl PoolingBackend { let secret = match cached_secret.value.clone() { Some(secret) => self.config.authentication_config.check_rate_limit( ctx, - config, secret, &user_info.endpoint, true, @@ -91,9 +86,13 @@ impl PoolingBackend { } }; let ep = EndpointIdInt::from(&user_info.endpoint); - let auth_outcome = - crate::auth::validate_password_and_exchange(&config.thread_pool, ep, 
password, secret) - .await?; + let auth_outcome = crate::auth::validate_password_and_exchange( + &self.config.authentication_config.thread_pool, + ep, + password, + secret, + ) + .await?; let res = match auth_outcome { crate::sasl::Outcome::Success(key) => { info!("user successfully authenticated"); @@ -113,13 +112,13 @@ impl PoolingBackend { pub(crate) async fn authenticate_with_jwt( &self, ctx: &RequestMonitoring, - config: &AuthenticationConfig, user_info: &ComputeUserInfo, jwt: String, ) -> Result { - match &self.config.auth_backend { + match &self.auth_backend { crate::auth::Backend::ControlPlane(console, ()) => { - config + self.config + .authentication_config .jwks_cache .check_jwt( ctx, @@ -140,7 +139,9 @@ impl PoolingBackend { "JWT login over web auth proxy is not supported", )), crate::auth::Backend::Local(_) => { - let keys = config + let keys = self + .config + .authentication_config .jwks_cache .check_jwt( ctx, @@ -185,7 +186,7 @@ impl PoolingBackend { let conn_id = uuid::Uuid::new_v4(); tracing::Span::current().record("conn_id", display(conn_id)); info!(%conn_id, "pool: opening a new connection '{conn_info}'"); - let backend = self.config.auth_backend.as_ref().map(|()| keys); + let backend = self.auth_backend.as_ref().map(|()| keys); crate::proxy::connect_compute::connect_to_compute( ctx, &TokioMechanism { @@ -217,21 +218,14 @@ impl PoolingBackend { let conn_id = uuid::Uuid::new_v4(); tracing::Span::current().record("conn_id", display(conn_id)); info!(%conn_id, "pool: opening a new connection '{conn_info}'"); - let backend = self - .config - .auth_backend - .as_ref() - .map(|()| ComputeCredentials { - info: ComputeUserInfo { - user: conn_info.user_info.user.clone(), - endpoint: EndpointId::from(format!( - "{}-local-proxy", - conn_info.user_info.endpoint - )), - options: conn_info.user_info.options.clone(), - }, - keys: crate::auth::backend::ComputeCredentialKeys::None, - }); + let backend = self.auth_backend.as_ref().map(|()| ComputeCredentials { + 
info: ComputeUserInfo { + user: conn_info.user_info.user.clone(), + endpoint: EndpointId::from(format!("{}-local-proxy", conn_info.user_info.endpoint)), + options: conn_info.user_info.options.clone(), + }, + keys: crate::auth::backend::ComputeCredentialKeys::None, + }); crate::proxy::connect_compute::connect_to_compute( ctx, &HyperMechanism { @@ -269,7 +263,7 @@ impl PoolingBackend { tracing::Span::current().record("conn_id", display(conn_id)); info!(%conn_id, "local_pool: opening a new connection '{conn_info}'"); - let mut node_info = match &self.config.auth_backend { + let mut node_info = match &self.auth_backend { auth::Backend::ControlPlane(_, ()) | auth::Backend::ConsoleRedirect(_, ()) => { unreachable!("only local_proxy can connect to local postgres") } diff --git a/proxy/src/serverless/mod.rs b/proxy/src/serverless/mod.rs index b5820b0535..95f64e972c 100644 --- a/proxy/src/serverless/mod.rs +++ b/proxy/src/serverless/mod.rs @@ -55,6 +55,7 @@ pub(crate) const SERVERLESS_DRIVER_SNI: &str = "api"; pub async fn task_main( config: &'static ProxyConfig, + auth_backend: &'static crate::auth::Backend<'static, (), ()>, ws_listener: TcpListener, cancellation_token: CancellationToken, cancellation_handler: Arc, @@ -110,6 +111,7 @@ pub async fn task_main( local_pool, pool: Arc::clone(&conn_pool), config, + auth_backend, endpoint_rate_limiter: Arc::clone(&endpoint_rate_limiter), }); let tls_acceptor: Arc = match config.tls_config.as_ref() { @@ -397,6 +399,7 @@ async fn request_handler( async move { if let Err(e) = websocket::serve_websocket( config, + backend.auth_backend, ctx, websocket, cancellation_handler, diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 646e7f8a52..cf3324926c 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -45,6 +45,7 @@ use crate::auth::backend::ComputeUserInfo; use crate::auth::endpoint_sni; use crate::auth::ComputeUserInfoParseError; use 
crate::config::AuthenticationConfig; +use crate::config::HttpConfig; use crate::config::ProxyConfig; use crate::config::TlsConfig; use crate::context::RequestMonitoring; @@ -554,7 +555,7 @@ async fn handle_inner( match conn_info.auth { AuthData::Jwt(jwt) if config.authentication_config.is_auth_broker => { - handle_auth_broker_inner(config, ctx, request, conn_info.conn_info, jwt, backend).await + handle_auth_broker_inner(ctx, request, conn_info.conn_info, jwt, backend).await } auth => { handle_db_inner( @@ -622,28 +623,17 @@ async fn handle_db_inner( let authenticate_and_connect = Box::pin( async { - let is_local_proxy = - matches!(backend.config.auth_backend, crate::auth::Backend::Local(_)); + let is_local_proxy = matches!(backend.auth_backend, crate::auth::Backend::Local(_)); let keys = match auth { AuthData::Password(pw) => { backend - .authenticate_with_password( - ctx, - &config.authentication_config, - &conn_info.user_info, - &pw, - ) + .authenticate_with_password(ctx, &conn_info.user_info, &pw) .await? } AuthData::Jwt(jwt) => { backend - .authenticate_with_jwt( - ctx, - &config.authentication_config, - &conn_info.user_info, - jwt, - ) + .authenticate_with_jwt(ctx, &conn_info.user_info, jwt) .await? } }; @@ -691,7 +681,7 @@ async fn handle_db_inner( // Now execute the query and return the result. let json_output = match payload { Payload::Single(stmt) => { - stmt.process(config, cancel, &mut client, parsed_headers) + stmt.process(&config.http_config, cancel, &mut client, parsed_headers) .await? } Payload::Batch(statements) => { @@ -709,7 +699,7 @@ async fn handle_db_inner( } statements - .process(config, cancel, &mut client, parsed_headers) + .process(&config.http_config, cancel, &mut client, parsed_headers) .await? 
} }; @@ -749,7 +739,6 @@ static HEADERS_TO_FORWARD: &[&HeaderName] = &[ ]; async fn handle_auth_broker_inner( - config: &'static ProxyConfig, ctx: &RequestMonitoring, request: Request, conn_info: ConnInfo, @@ -757,12 +746,7 @@ async fn handle_auth_broker_inner( backend: Arc, ) -> Result>, SqlOverHttpError> { backend - .authenticate_with_jwt( - ctx, - &config.authentication_config, - &conn_info.user_info, - jwt, - ) + .authenticate_with_jwt(ctx, &conn_info.user_info, jwt) .await .map_err(HttpConnError::from)?; @@ -800,7 +784,7 @@ async fn handle_auth_broker_inner( impl QueryData { async fn process( self, - config: &'static ProxyConfig, + config: &'static HttpConfig, cancel: CancellationToken, client: &mut Client, parsed_headers: HttpHeaders, @@ -874,7 +858,7 @@ impl QueryData { impl BatchQueryData { async fn process( self, - config: &'static ProxyConfig, + config: &'static HttpConfig, cancel: CancellationToken, client: &mut Client, parsed_headers: HttpHeaders, @@ -944,7 +928,7 @@ impl BatchQueryData { } async fn query_batch( - config: &'static ProxyConfig, + config: &'static HttpConfig, cancel: CancellationToken, transaction: &Transaction<'_>, queries: BatchQueryData, @@ -983,7 +967,7 @@ async fn query_batch( } async fn query_to_json( - config: &'static ProxyConfig, + config: &'static HttpConfig, client: &T, data: QueryData, current_size: &mut usize, @@ -1004,9 +988,9 @@ async fn query_to_json( rows.push(row); // we don't have a streaming response support yet so this is to prevent OOM // from a malicious query (eg a cross join) - if *current_size > config.http_config.max_response_size_bytes { + if *current_size > config.max_response_size_bytes { return Err(SqlOverHttpError::ResponseTooLarge( - config.http_config.max_response_size_bytes, + config.max_response_size_bytes, )); } } diff --git a/proxy/src/serverless/websocket.rs b/proxy/src/serverless/websocket.rs index 08d5da9bef..fd0f0cac7f 100644 --- a/proxy/src/serverless/websocket.rs +++ 
b/proxy/src/serverless/websocket.rs @@ -129,6 +129,7 @@ impl AsyncBufRead for WebSocketRw { pub(crate) async fn serve_websocket( config: &'static ProxyConfig, + auth_backend: &'static crate::auth::Backend<'static, (), ()>, ctx: RequestMonitoring, websocket: OnUpgrade, cancellation_handler: Arc, @@ -145,6 +146,7 @@ pub(crate) async fn serve_websocket( let res = Box::pin(handle_client( config, + auth_backend, &ctx, cancellation_handler, WebSocketRw::new(websocket), From cb9ab7463c27f30532f944ff9d3adb0636e42364 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Mon, 14 Oct 2024 12:25:55 +0200 Subject: [PATCH 16/48] proxy: split out the console-redirect backend flow (#9270) Removes the ConsoleRedirect backend from the main auth::Backend enum and copy-pastes the existing crate::proxy::task_main structure to use the ConsoleRedirectBackend exclusively. This makes the logic a bit simpler at the cost of some fairly trivial code duplication. --- proxy/src/auth/backend/console_redirect.rs | 37 +++- proxy/src/auth/backend/mod.rs | 72 ++----- proxy/src/bin/local_proxy.rs | 2 +- proxy/src/bin/proxy.rs | 99 ++++++---- proxy/src/console_redirect_proxy.rs | 217 +++++++++++++++++++++ proxy/src/lib.rs | 1 + proxy/src/proxy/mod.rs | 6 +- proxy/src/proxy/tests/mod.rs | 2 +- proxy/src/serverless/backend.rs | 7 +- proxy/src/serverless/mod.rs | 2 +- proxy/src/serverless/websocket.rs | 2 +- 11 files changed, 334 insertions(+), 113 deletions(-) create mode 100644 proxy/src/console_redirect_proxy.rs diff --git a/proxy/src/auth/backend/console_redirect.rs b/proxy/src/auth/backend/console_redirect.rs index 127be545e1..457410ec8c 100644 --- a/proxy/src/auth/backend/console_redirect.rs +++ b/proxy/src/auth/backend/console_redirect.rs @@ -1,18 +1,24 @@ use crate::{ - auth, compute, + auth, + cache::Cached, + compute, config::AuthenticationConfig, context::RequestMonitoring, - control_plane::{self, provider::NodeInfo}, + control_plane::{self, provider::NodeInfo, CachedNodeInfo},
error::{ReportableError, UserFacingError}, + proxy::connect_compute::ComputeConnectBackend, stream::PqStream, waiters, }; +use async_trait::async_trait; use pq_proto::BeMessage as Be; use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite}; use tokio_postgres::config::SslMode; use tracing::{info, info_span}; +use super::ComputeCredentialKeys; + #[derive(Debug, Error)] pub(crate) enum WebAuthError { #[error(transparent)] @@ -25,6 +31,7 @@ pub(crate) enum WebAuthError { Io(#[from] std::io::Error), } +#[derive(Debug)] pub struct ConsoleRedirectBackend { console_uri: reqwest::Url, } @@ -66,17 +73,31 @@ impl ConsoleRedirectBackend { Self { console_uri } } - pub(super) fn url(&self) -> &reqwest::Url { - &self.console_uri - } - pub(crate) async fn authenticate( &self, ctx: &RequestMonitoring, auth_config: &'static AuthenticationConfig, client: &mut PqStream, - ) -> auth::Result { - authenticate(ctx, auth_config, &self.console_uri, client).await + ) -> auth::Result { + authenticate(ctx, auth_config, &self.console_uri, client) + .await + .map(ConsoleRedirectNodeInfo) + } +} + +pub struct ConsoleRedirectNodeInfo(pub(super) NodeInfo); + +#[async_trait] +impl ComputeConnectBackend for ConsoleRedirectNodeInfo { + async fn wake_compute( + &self, + _ctx: &RequestMonitoring, + ) -> Result { + Ok(Cached::new_uncached(self.0.clone())) + } + + fn get_keys(&self) -> &ComputeCredentialKeys { + &ComputeCredentialKeys::None } } diff --git a/proxy/src/auth/backend/mod.rs b/proxy/src/auth/backend/mod.rs index 27c9f1876e..96e1a787ed 100644 --- a/proxy/src/auth/backend/mod.rs +++ b/proxy/src/auth/backend/mod.rs @@ -22,7 +22,7 @@ use crate::cache::Cached; use crate::context::RequestMonitoring; use crate::control_plane::errors::GetAuthInfoError; use crate::control_plane::provider::{CachedRoleSecret, ControlPlaneBackend}; -use crate::control_plane::{AuthSecret, NodeInfo}; +use crate::control_plane::AuthSecret; use crate::intern::EndpointIdInt; use crate::metrics::Metrics; use 
crate::proxy::connect_compute::ComputeConnectBackend; @@ -66,11 +66,9 @@ impl std::ops::Deref for MaybeOwned<'_, T> { /// * However, when we substitute `T` with [`ComputeUserInfoMaybeEndpoint`], /// this helps us provide the credentials only to those auth /// backends which require them for the authentication process. -pub enum Backend<'a, T, D> { +pub enum Backend<'a, T> { /// Cloud API (V2). ControlPlane(MaybeOwned<'a, ControlPlaneBackend>, T), - /// Authentication via a web browser. - ConsoleRedirect(MaybeOwned<'a, ConsoleRedirectBackend>, D), /// Local proxy uses configured auth credentials and does not wake compute Local(MaybeOwned<'a, LocalBackend>), } @@ -91,7 +89,7 @@ impl Clone for Box { } } -impl std::fmt::Display for Backend<'_, (), ()> { +impl std::fmt::Display for Backend<'_, ()> { fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { Self::ControlPlane(api, ()) => match &**api { @@ -107,46 +105,39 @@ impl std::fmt::Display for Backend<'_, (), ()> { #[cfg(test)] ControlPlaneBackend::Test(_) => fmt.debug_tuple("ControlPlane::Test").finish(), }, - Self::ConsoleRedirect(backend, ()) => fmt - .debug_tuple("ConsoleRedirect") - .field(&backend.url().as_str()) - .finish(), Self::Local(_) => fmt.debug_tuple("Local").finish(), } } } -impl Backend<'_, T, D> { +impl Backend<'_, T> { /// Very similar to [`std::option::Option::as_ref`]. /// This helps us pass structured config to async tasks. - pub(crate) fn as_ref(&self) -> Backend<'_, &T, &D> { + pub(crate) fn as_ref(&self) -> Backend<'_, &T> { match self { Self::ControlPlane(c, x) => Backend::ControlPlane(MaybeOwned::Borrowed(c), x), - Self::ConsoleRedirect(c, x) => Backend::ConsoleRedirect(MaybeOwned::Borrowed(c), x), Self::Local(l) => Backend::Local(MaybeOwned::Borrowed(l)), } } } -impl<'a, T, D> Backend<'a, T, D> { +impl<'a, T> Backend<'a, T> { /// Very similar to [`std::option::Option::map`]. /// Maps [`Backend`] to [`Backend`] by applying /// a function to a contained value. 
- pub(crate) fn map(self, f: impl FnOnce(T) -> R) -> Backend<'a, R, D> { + pub(crate) fn map(self, f: impl FnOnce(T) -> R) -> Backend<'a, R> { match self { Self::ControlPlane(c, x) => Backend::ControlPlane(c, f(x)), - Self::ConsoleRedirect(c, x) => Backend::ConsoleRedirect(c, x), Self::Local(l) => Backend::Local(l), } } } -impl<'a, T, D, E> Backend<'a, Result, D> { +impl<'a, T, E> Backend<'a, Result> { /// Very similar to [`std::option::Option::transpose`]. /// This is most useful for error handling. - pub(crate) fn transpose(self) -> Result, E> { + pub(crate) fn transpose(self) -> Result, E> { match self { Self::ControlPlane(c, x) => x.map(|x| Backend::ControlPlane(c, x)), - Self::ConsoleRedirect(c, x) => Ok(Backend::ConsoleRedirect(c, x)), Self::Local(l) => Ok(Backend::Local(l)), } } @@ -414,12 +405,11 @@ async fn authenticate_with_secret( classic::authenticate(ctx, info, client, config, secret).await } -impl<'a> Backend<'a, ComputeUserInfoMaybeEndpoint, &()> { +impl<'a> Backend<'a, ComputeUserInfoMaybeEndpoint> { /// Get username from the credentials. pub(crate) fn get_user(&self) -> &str { match self { Self::ControlPlane(_, user_info) => &user_info.user, - Self::ConsoleRedirect(_, ()) => "web", Self::Local(_) => "local", } } @@ -433,7 +423,7 @@ impl<'a> Backend<'a, ComputeUserInfoMaybeEndpoint, &()> { allow_cleartext: bool, config: &'static AuthenticationConfig, endpoint_rate_limiter: Arc, - ) -> auth::Result> { + ) -> auth::Result> { let res = match self { Self::ControlPlane(api, user_info) => { info!( @@ -454,14 +444,6 @@ impl<'a> Backend<'a, ComputeUserInfoMaybeEndpoint, &()> { .await?; Backend::ControlPlane(api, credentials) } - // NOTE: this auth backend doesn't use client credentials. 
- Self::ConsoleRedirect(backend, ()) => { - info!("performing web authentication"); - - let info = backend.authenticate(ctx, config, client).await?; - - Backend::ConsoleRedirect(backend, info) - } Self::Local(_) => { return Err(auth::AuthError::bad_auth_method("invalid for local proxy")) } @@ -472,14 +454,13 @@ impl<'a> Backend<'a, ComputeUserInfoMaybeEndpoint, &()> { } } -impl Backend<'_, ComputeUserInfo, &()> { +impl Backend<'_, ComputeUserInfo> { pub(crate) async fn get_role_secret( &self, ctx: &RequestMonitoring, ) -> Result { match self { Self::ControlPlane(api, user_info) => api.get_role_secret(ctx, user_info).await, - Self::ConsoleRedirect(_, ()) => Ok(Cached::new_uncached(None)), Self::Local(_) => Ok(Cached::new_uncached(None)), } } @@ -492,21 +473,19 @@ impl Backend<'_, ComputeUserInfo, &()> { Self::ControlPlane(api, user_info) => { api.get_allowed_ips_and_secret(ctx, user_info).await } - Self::ConsoleRedirect(_, ()) => Ok((Cached::new_uncached(Arc::new(vec![])), None)), Self::Local(_) => Ok((Cached::new_uncached(Arc::new(vec![])), None)), } } } #[async_trait::async_trait] -impl ComputeConnectBackend for Backend<'_, ComputeCredentials, NodeInfo> { +impl ComputeConnectBackend for Backend<'_, ComputeCredentials> { async fn wake_compute( &self, ctx: &RequestMonitoring, ) -> Result { match self { Self::ControlPlane(api, creds) => api.wake_compute(ctx, &creds.info).await, - Self::ConsoleRedirect(_, info) => Ok(Cached::new_uncached(info.clone())), Self::Local(local) => Ok(Cached::new_uncached(local.node_info.clone())), } } @@ -514,31 +493,6 @@ impl ComputeConnectBackend for Backend<'_, ComputeCredentials, NodeInfo> { fn get_keys(&self) -> &ComputeCredentialKeys { match self { Self::ControlPlane(_, creds) => &creds.keys, - Self::ConsoleRedirect(_, _) => &ComputeCredentialKeys::None, - Self::Local(_) => &ComputeCredentialKeys::None, - } - } -} - -#[async_trait::async_trait] -impl ComputeConnectBackend for Backend<'_, ComputeCredentials, &()> { - async fn 
wake_compute( - &self, - ctx: &RequestMonitoring, - ) -> Result { - match self { - Self::ControlPlane(api, creds) => api.wake_compute(ctx, &creds.info).await, - Self::ConsoleRedirect(_, ()) => { - unreachable!("web auth flow doesn't support waking the compute") - } - Self::Local(local) => Ok(Cached::new_uncached(local.node_info.clone())), - } - } - - fn get_keys(&self) -> &ComputeCredentialKeys { - match self { - Self::ControlPlane(_, creds) => &creds.keys, - Self::ConsoleRedirect(_, ()) => &ComputeCredentialKeys::None, Self::Local(_) => &ComputeCredentialKeys::None, } } diff --git a/proxy/src/bin/local_proxy.rs b/proxy/src/bin/local_proxy.rs index c781af846a..c92ebbc51f 100644 --- a/proxy/src/bin/local_proxy.rs +++ b/proxy/src/bin/local_proxy.rs @@ -291,7 +291,7 @@ fn build_config(args: &LocalProxyCliArgs) -> anyhow::Result<&'static ProxyConfig /// auth::Backend is created at proxy startup, and lives forever. fn build_auth_backend( args: &LocalProxyCliArgs, -) -> anyhow::Result<&'static auth::Backend<'static, (), ()>> { +) -> anyhow::Result<&'static auth::Backend<'static, ()>> { let auth_backend = proxy::auth::Backend::Local(proxy::auth::backend::MaybeOwned::Owned( LocalBackend::new(args.compute), )); diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index 3f4c2df809..3c0e66dec3 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -314,7 +314,10 @@ async fn main() -> anyhow::Result<()> { let config = build_config(&args)?; let auth_backend = build_auth_backend(&args)?; - info!("Authentication backend: {}", auth_backend); + match auth_backend { + Either::Left(auth_backend) => info!("Authentication backend: {auth_backend}"), + Either::Right(auth_backend) => info!("Authentication backend: {auth_backend:?}"), + }; info!("Using region: {}", args.aws_region); let region_provider = @@ -461,26 +464,41 @@ async fn main() -> anyhow::Result<()> { // client facing tasks. 
these will exit on error or on cancellation // cancellation returns Ok(()) let mut client_tasks = JoinSet::new(); - if let Some(proxy_listener) = proxy_listener { - client_tasks.spawn(proxy::proxy::task_main( - config, - auth_backend, - proxy_listener, - cancellation_token.clone(), - cancellation_handler.clone(), - endpoint_rate_limiter.clone(), - )); - } + match auth_backend { + Either::Left(auth_backend) => { + if let Some(proxy_listener) = proxy_listener { + client_tasks.spawn(proxy::proxy::task_main( + config, + auth_backend, + proxy_listener, + cancellation_token.clone(), + cancellation_handler.clone(), + endpoint_rate_limiter.clone(), + )); + } - if let Some(serverless_listener) = serverless_listener { - client_tasks.spawn(serverless::task_main( - config, - auth_backend, - serverless_listener, - cancellation_token.clone(), - cancellation_handler.clone(), - endpoint_rate_limiter.clone(), - )); + if let Some(serverless_listener) = serverless_listener { + client_tasks.spawn(serverless::task_main( + config, + auth_backend, + serverless_listener, + cancellation_token.clone(), + cancellation_handler.clone(), + endpoint_rate_limiter.clone(), + )); + } + } + Either::Right(auth_backend) => { + if let Some(proxy_listener) = proxy_listener { + client_tasks.spawn(proxy::console_redirect_proxy::task_main( + config, + auth_backend, + proxy_listener, + cancellation_token.clone(), + cancellation_handler.clone(), + )); + } + } } client_tasks.spawn(proxy::context::parquet::worker( @@ -510,7 +528,7 @@ async fn main() -> anyhow::Result<()> { )); } - if let auth::Backend::ControlPlane(api, _) = auth_backend { + if let Either::Left(auth::Backend::ControlPlane(api, _)) = &auth_backend { if let proxy::control_plane::provider::ControlPlaneBackend::Management(api) = &**api { match (redis_notifications_client, regional_redis_client.clone()) { (None, None) => {} @@ -663,7 +681,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { 
webauth_confirmation_timeout: args.webauth_confirmation_timeout, }; - let config = Box::leak(Box::new(ProxyConfig { + let config = ProxyConfig { tls_config, metric_collection, allow_self_signed_compute: args.allow_self_signed_compute, @@ -677,7 +695,9 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { connect_to_compute_retry_config: config::RetryConfig::parse( &args.connect_to_compute_retry, )?, - })); + }; + + let config = Box::leak(Box::new(config)); tokio::spawn(config.connect_compute_locks.garbage_collect_worker()); @@ -687,8 +707,8 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { /// auth::Backend is created at proxy startup, and lives forever. fn build_auth_backend( args: &ProxyCliArgs, -) -> anyhow::Result<&'static auth::Backend<'static, (), ()>> { - let auth_backend = match &args.auth_backend { +) -> anyhow::Result, &'static ConsoleRedirectBackend>> { + match &args.auth_backend { AuthBackendType::Console => { let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?; let project_info_cache_config: ProjectInfoCacheOptions = @@ -738,12 +758,11 @@ fn build_auth_backend( wake_compute_endpoint_rate_limiter, ); let api = control_plane::provider::ControlPlaneBackend::Management(api); - auth::Backend::ControlPlane(MaybeOwned::Owned(api), ()) - } + let auth_backend = auth::Backend::ControlPlane(MaybeOwned::Owned(api), ()); - AuthBackendType::Web => { - let url = args.uri.parse()?; - auth::Backend::ConsoleRedirect(MaybeOwned::Owned(ConsoleRedirectBackend::new(url)), ()) + let config = Box::leak(Box::new(auth_backend)); + + Ok(Either::Left(config)) } #[cfg(feature = "testing")] @@ -751,11 +770,23 @@ fn build_auth_backend( let url = args.auth_endpoint.parse()?; let api = control_plane::provider::mock::Api::new(url, !args.is_private_access_proxy); let api = control_plane::provider::ControlPlaneBackend::PostgresMock(api); - auth::Backend::ControlPlane(MaybeOwned::Owned(api), ()) - } - 
}; - Ok(Box::leak(Box::new(auth_backend))) + let auth_backend = auth::Backend::ControlPlane(MaybeOwned::Owned(api), ()); + + let config = Box::leak(Box::new(auth_backend)); + + Ok(Either::Left(config)) + } + + AuthBackendType::Web => { + let url = args.uri.parse()?; + let backend = ConsoleRedirectBackend::new(url); + + let config = Box::leak(Box::new(backend)); + + Ok(Either::Right(config)) + } + } } #[cfg(test)] diff --git a/proxy/src/console_redirect_proxy.rs b/proxy/src/console_redirect_proxy.rs new file mode 100644 index 0000000000..9e17976720 --- /dev/null +++ b/proxy/src/console_redirect_proxy.rs @@ -0,0 +1,217 @@ +use crate::auth::backend::ConsoleRedirectBackend; +use crate::config::{ProxyConfig, ProxyProtocolV2}; +use crate::proxy::{ + prepare_client_connection, run_until_cancelled, ClientRequestError, ErrorSource, +}; +use crate::{ + cancellation::{CancellationHandlerMain, CancellationHandlerMainInternal}, + context::RequestMonitoring, + error::ReportableError, + metrics::{Metrics, NumClientConnectionsGuard}, + protocol2::read_proxy_protocol, + proxy::handshake::{handshake, HandshakeData}, +}; +use futures::TryFutureExt; +use std::sync::Arc; +use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt}; +use tokio_util::sync::CancellationToken; +use tracing::{error, info, Instrument}; + +use crate::proxy::{ + connect_compute::{connect_to_compute, TcpMechanism}, + passthrough::ProxyPassthrough, +}; + +pub async fn task_main( + config: &'static ProxyConfig, + backend: &'static ConsoleRedirectBackend, + listener: tokio::net::TcpListener, + cancellation_token: CancellationToken, + cancellation_handler: Arc, +) -> anyhow::Result<()> { + scopeguard::defer! { + info!("proxy has shut down"); + } + + // When set for the server socket, the keepalive setting + // will be inherited by all accepted client sockets. 
+ socket2::SockRef::from(&listener).set_keepalive(true)?; + + let connections = tokio_util::task::task_tracker::TaskTracker::new(); + + while let Some(accept_result) = + run_until_cancelled(listener.accept(), &cancellation_token).await + { + let (socket, peer_addr) = accept_result?; + + let conn_gauge = Metrics::get() + .proxy + .client_connections + .guard(crate::metrics::Protocol::Tcp); + + let session_id = uuid::Uuid::new_v4(); + let cancellation_handler = Arc::clone(&cancellation_handler); + + tracing::info!(protocol = "tcp", %session_id, "accepted new TCP connection"); + + connections.spawn(async move { + let (socket, peer_addr) = match read_proxy_protocol(socket).await { + Err(e) => { + error!("per-client task finished with an error: {e:#}"); + return; + } + Ok((_socket, None)) if config.proxy_protocol_v2 == ProxyProtocolV2::Required => { + error!("missing required proxy protocol header"); + return; + } + Ok((_socket, Some(_))) if config.proxy_protocol_v2 == ProxyProtocolV2::Rejected => { + error!("proxy protocol header not supported"); + return; + } + Ok((socket, Some(addr))) => (socket, addr.ip()), + Ok((socket, None)) => (socket, peer_addr.ip()), + }; + + match socket.inner.set_nodelay(true) { + Ok(()) => {} + Err(e) => { + error!("per-client task finished with an error: failed to set socket option: {e:#}"); + return; + } + }; + + let ctx = RequestMonitoring::new( + session_id, + peer_addr, + crate::metrics::Protocol::Tcp, + &config.region, + ); + let span = ctx.span(); + + let startup = Box::pin( + handle_client( + config, + backend, + &ctx, + cancellation_handler, + socket, + conn_gauge, + ) + .instrument(span.clone()), + ); + let res = startup.await; + + match res { + Err(e) => { + // todo: log and push to ctx the error kind + ctx.set_error_kind(e.get_error_kind()); + error!(parent: &span, "per-client task finished with an error: {e:#}"); + } + Ok(None) => { + ctx.set_success(); + } + Ok(Some(p)) => { + ctx.set_success(); + ctx.log_connect(); + match 
p.proxy_pass().instrument(span.clone()).await { + Ok(()) => {} + Err(ErrorSource::Client(e)) => { + error!(parent: &span, "per-client task finished with an IO error from the client: {e:#}"); + } + Err(ErrorSource::Compute(e)) => { + error!(parent: &span, "per-client task finished with an IO error from the compute: {e:#}"); + } + } + } + } + }); + } + + connections.close(); + drop(listener); + + // Drain connections + connections.wait().await; + + Ok(()) +} + +pub(crate) async fn handle_client( + config: &'static ProxyConfig, + backend: &'static ConsoleRedirectBackend, + ctx: &RequestMonitoring, + cancellation_handler: Arc, + stream: S, + conn_gauge: NumClientConnectionsGuard<'static>, +) -> Result>, ClientRequestError> { + info!( + protocol = %ctx.protocol(), + "handling interactive connection from client" + ); + + let metrics = &Metrics::get().proxy; + let proto = ctx.protocol(); + let request_gauge = metrics.connection_requests.guard(proto); + + let tls = config.tls_config.as_ref(); + + let record_handshake_error = !ctx.has_private_peer_addr(); + let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Client); + let do_handshake = handshake(ctx, stream, tls, record_handshake_error); + let (mut stream, params) = + match tokio::time::timeout(config.handshake_timeout, do_handshake).await?? { + HandshakeData::Startup(stream, params) => (stream, params), + HandshakeData::Cancel(cancel_key_data) => { + return Ok(cancellation_handler + .cancel_session(cancel_key_data, ctx.session_id()) + .await + .map(|()| None)?) 
+ } + }; + drop(pause); + + ctx.set_db_options(params.clone()); + + let user_info = match backend + .authenticate(ctx, &config.authentication_config, &mut stream) + .await + { + Ok(auth_result) => auth_result, + Err(e) => { + return stream.throw_error(e).await?; + } + }; + + let mut node = connect_to_compute( + ctx, + &TcpMechanism { + params: ¶ms, + locks: &config.connect_compute_locks, + }, + &user_info, + config.allow_self_signed_compute, + config.wake_compute_retry_config, + config.connect_to_compute_retry_config, + ) + .or_else(|e| stream.throw_error(e)) + .await?; + + let session = cancellation_handler.get_session(); + prepare_client_connection(&node, &session, &mut stream).await?; + + // Before proxy passing, forward to compute whatever data is left in the + // PqStream input buffer. Normally there is none, but our serverless npm + // driver in pipeline mode sends startup, password and first query + // immediately after opening the connection. + let (stream, read_buf) = stream.into_inner(); + node.stream.write_all(&read_buf).await?; + + Ok(Some(ProxyPassthrough { + client: stream, + aux: node.aux.clone(), + compute: node, + _req: request_gauge, + _conn: conn_gauge, + _cancel: session, + })) +} diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs index 8d274baa10..74bc778a36 100644 --- a/proxy/src/lib.rs +++ b/proxy/src/lib.rs @@ -95,6 +95,7 @@ pub mod cache; pub mod cancellation; pub mod compute; pub mod config; +pub mod console_redirect_proxy; pub mod context; pub mod control_plane; pub mod error; diff --git a/proxy/src/proxy/mod.rs b/proxy/src/proxy/mod.rs index 3a43ccb74a..b2b5a7f43d 100644 --- a/proxy/src/proxy/mod.rs +++ b/proxy/src/proxy/mod.rs @@ -61,7 +61,7 @@ pub async fn run_until_cancelled( pub async fn task_main( config: &'static ProxyConfig, - auth_backend: &'static auth::Backend<'static, (), ()>, + auth_backend: &'static auth::Backend<'static, ()>, listener: tokio::net::TcpListener, cancellation_token: CancellationToken, cancellation_handler: Arc, 
@@ -248,7 +248,7 @@ impl ReportableError for ClientRequestError { #[allow(clippy::too_many_arguments)] pub(crate) async fn handle_client( config: &'static ProxyConfig, - auth_backend: &'static auth::Backend<'static, (), ()>, + auth_backend: &'static auth::Backend<'static, ()>, ctx: &RequestMonitoring, cancellation_handler: Arc, stream: S, @@ -356,7 +356,7 @@ pub(crate) async fn handle_client( /// Finish client connection initialization: confirm auth success, send params, etc. #[tracing::instrument(skip_all)] -async fn prepare_client_connection

( +pub(crate) async fn prepare_client_connection

( node: &compute::PostgresConnection, session: &cancellation::Session

, stream: &mut PqStream, diff --git a/proxy/src/proxy/tests/mod.rs b/proxy/src/proxy/tests/mod.rs index 3861ddc8ed..58fb36dba7 100644 --- a/proxy/src/proxy/tests/mod.rs +++ b/proxy/src/proxy/tests/mod.rs @@ -552,7 +552,7 @@ fn helper_create_cached_node_info(cache: &'static NodeInfoCache) -> CachedNodeIn fn helper_create_connect_info( mechanism: &TestConnectMechanism, -) -> auth::Backend<'static, ComputeCredentials, &()> { +) -> auth::Backend<'static, ComputeCredentials> { let user_info = auth::Backend::ControlPlane( MaybeOwned::Owned(ControlPlaneBackend::Test(Box::new(mechanism.clone()))), ComputeCredentials { diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 9e49478cf3..2b060af9e1 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -42,7 +42,7 @@ pub(crate) struct PoolingBackend { pub(crate) local_pool: Arc>, pub(crate) pool: Arc>, pub(crate) config: &'static ProxyConfig, - pub(crate) auth_backend: &'static crate::auth::Backend<'static, (), ()>, + pub(crate) auth_backend: &'static crate::auth::Backend<'static, ()>, pub(crate) endpoint_rate_limiter: Arc, } @@ -135,9 +135,6 @@ impl PoolingBackend { keys: crate::auth::backend::ComputeCredentialKeys::None, }) } - crate::auth::Backend::ConsoleRedirect(_, ()) => Err(AuthError::auth_failed( - "JWT login over web auth proxy is not supported", - )), crate::auth::Backend::Local(_) => { let keys = self .config @@ -264,7 +261,7 @@ impl PoolingBackend { info!(%conn_id, "local_pool: opening a new connection '{conn_info}'"); let mut node_info = match &self.auth_backend { - auth::Backend::ControlPlane(_, ()) | auth::Backend::ConsoleRedirect(_, ()) => { + auth::Backend::ControlPlane(_, ()) => { unreachable!("only local_proxy can connect to local postgres") } auth::Backend::Local(local) => local.node_info.clone(), diff --git a/proxy/src/serverless/mod.rs b/proxy/src/serverless/mod.rs index 95f64e972c..3131adada4 100644 --- a/proxy/src/serverless/mod.rs +++ 
b/proxy/src/serverless/mod.rs @@ -55,7 +55,7 @@ pub(crate) const SERVERLESS_DRIVER_SNI: &str = "api"; pub async fn task_main( config: &'static ProxyConfig, - auth_backend: &'static crate::auth::Backend<'static, (), ()>, + auth_backend: &'static crate::auth::Backend<'static, ()>, ws_listener: TcpListener, cancellation_token: CancellationToken, cancellation_handler: Arc, diff --git a/proxy/src/serverless/websocket.rs b/proxy/src/serverless/websocket.rs index fd0f0cac7f..f5a692cf40 100644 --- a/proxy/src/serverless/websocket.rs +++ b/proxy/src/serverless/websocket.rs @@ -129,7 +129,7 @@ impl AsyncBufRead for WebSocketRw { pub(crate) async fn serve_websocket( config: &'static ProxyConfig, - auth_backend: &'static crate::auth::Backend<'static, (), ()>, + auth_backend: &'static crate::auth::Backend<'static, ()>, ctx: RequestMonitoring, websocket: OnUpgrade, cancellation_handler: Arc, From d056ae9be5844b22378f961dd3ae730d96ef996e Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Mon, 14 Oct 2024 13:45:20 +0300 Subject: [PATCH 17/48] Ignore pg_dynshmem file when comparing directories (#9374) ## Problem On macOS, a `pg_dynshmem` file is created in PGDATADIR, which causes a mismatch in the directory comparison ## Summary of changes Add this file to the ignore list. ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section.
## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist Co-authored-by: Konstantin Knizhnik --- test_runner/fixtures/neon_fixtures.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 7789855fe4..059707c8ed 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -4280,6 +4280,7 @@ SKIP_FILES = frozenset( "postmaster.opts", "postmaster.pid", "pg_control", + "pg_dynshmem", ) ) From 31b7703fa87fa7fdc4d3a9f8b8f223cfddc0cd1a Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Mon, 14 Oct 2024 11:51:01 +0100 Subject: [PATCH 18/48] CI(build-build-tools): fix unexpected cancellations (#9357) ## Problem When `Dockerfile.build-tools` gets changed, several PRs catch up with it and some might get unexpectedly cancelled workflows because of GitHub's concurrency model for workflows. See the comment in the code for more details. 
It should be possible to revert it after https://github.com/orgs/community/discussions/41518 (I don't expect it anytime soon, but I subscribed) ## Summary of changes - Do not queue `build-build-tools-image` workflows in the concurrency group --- .github/workflows/build-build-tools-image.yml | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build-build-tools-image.yml b/.github/workflows/build-build-tools-image.yml index ca5ff573e1..130753833d 100644 --- a/.github/workflows/build-build-tools-image.yml +++ b/.github/workflows/build-build-tools-image.yml @@ -19,9 +19,16 @@ defaults: run: shell: bash -euo pipefail {0} -concurrency: - group: build-build-tools-image-${{ inputs.image-tag }} - cancel-in-progress: false +# The initial idea was to prevent the waste of resources by not re-building the `build-tools` image +# for the same tag in parallel workflow runs, and queue them to be skipped once we have +# the first image pushed to Docker registry, but GitHub's concurrency mechanism is not working as expected. +# GitHub can't have more than 1 job in a queue and removes the previous one, it causes failures if the dependent jobs. +# +# Ref https://github.com/orgs/community/discussions/41518 +# +# concurrency: +# group: build-build-tools-image-${{ inputs.image-tag }} +# cancel-in-progress: false # No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job. permissions: {} From d92ff578c4a738a52bdcb0a6f44af7691a64882c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Mon, 14 Oct 2024 14:34:57 +0200 Subject: [PATCH 19/48] Add test for fixed storage broker issue (#9311) Adds a test for the (now fixed) storage broker limit issue, see #9268 for the description and #9299 for the fix. Also fix a race condition with endpoint creation/starts running in parallel, leading to file not found errors. 
--- control_plane/src/endpoint.rs | 16 +++++++++++++- test_runner/regress/test_tenants.py | 34 ++++++++++++++++++++++++++++- 2 files changed, 48 insertions(+), 2 deletions(-) diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 7cdf621737..71514daa7c 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -97,7 +97,21 @@ impl ComputeControlPlane { for endpoint_dir in std::fs::read_dir(env.endpoints_path()) .with_context(|| format!("failed to list {}", env.endpoints_path().display()))? { - let ep = Endpoint::from_dir_entry(endpoint_dir?, &env)?; + let ep_res = Endpoint::from_dir_entry(endpoint_dir?, &env); + let ep = match ep_res { + Ok(ep) => ep, + Err(e) => match e.downcast::() { + Ok(e) => { + // A parallel task could delete an endpoint while we have just scanned the directory + if e.kind() == std::io::ErrorKind::NotFound { + continue; + } else { + Err(e)? + } + } + Err(e) => Err(e)?, + }, + }; endpoints.insert(ep.endpoint_id.clone(), Arc::new(ep)); } diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index 95dc0fec78..4a16535941 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -2,6 +2,7 @@ from __future__ import annotations import concurrent.futures import os +import threading import time from contextlib import closing from datetime import datetime @@ -10,7 +11,7 @@ from pathlib import Path import pytest import requests -from fixtures.common_types import Lsn, TenantId +from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.log_helper import log from fixtures.metrics import ( PAGESERVER_GLOBAL_METRICS, @@ -476,3 +477,34 @@ def test_pageserver_metrics_many_relations(neon_env_builder: NeonEnvBuilder): assert counts log.info(f"directory counts: {counts}") assert counts[2] > COUNT_AT_LEAST_EXPECTED + + +def test_timelines_parallel_endpoints(neon_simple_env: NeonEnv): + """ + (Relaxed) regression test for issue that 
led to https://github.com/neondatabase/neon/pull/9268 + Create many endpoints in parallel and then restart them + """ + env = neon_simple_env + + # This param needs to be 200+ to reproduce the limit issue + n_threads = 16 + barrier = threading.Barrier(n_threads) + + def test_timeline(branch_name: str, timeline_id: TimelineId): + endpoint = env.endpoints.create_start(branch_name) + endpoint.stop() + # Use a barrier to make sure we restart endpoints at the same time + barrier.wait() + endpoint.start() + + workers = [] + + for i in range(0, n_threads): + branch_name = f"branch_{i}" + timeline_id = env.create_branch(branch_name) + w = threading.Thread(target=test_timeline, args=[branch_name, timeline_id]) + workers.append(w) + w.start() + + for w in workers: + w.join() From f4f7ea247c05a56a90e4a7f99249133c58c8c443 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Mon, 14 Oct 2024 16:50:12 +0100 Subject: [PATCH 20/48] tests: make size comparisons more lenient (#9388) The empirically determined threshold doesn't hold for PG 17. Bump the limit to stabilise ci. --- test_runner/regress/test_tenant_size.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py index 9ea09d10d7..b41f1709bd 100644 --- a/test_runner/regress/test_tenant_size.py +++ b/test_runner/regress/test_tenant_size.py @@ -479,9 +479,9 @@ def assert_size_approx_equal(size_a, size_b): """ # Determined empirically from examples of equality failures: they differ - # by page multiples of 8272, and usually by 1-3 pages. Tolerate 4 to avoid + # by page multiples of 8272, and usually by 1-3 pages. Tolerate 6 to avoid # failing on outliers from that observed range. 
- threshold = 4 * 8272 + threshold = 6 * 8272 assert size_a == pytest.approx(size_b, abs=threshold) From f54e3e9147bf1dd341e22a0fc01cf5c5d71843e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Mon, 14 Oct 2024 17:54:03 +0200 Subject: [PATCH 21/48] Also consider offloaded timelines for obtaining retain_lsn (#9308) Also consider offloaded timelines for obtaining `retain_lsn`. This is required for correctness for all timelines that have not been flattened yet: otherwise we GC data that might still be required for reading. This somewhat counteracts the original purpose of timeline offloading of not having to iterate over offloaded timelines, but sadly it's required. In the future, we can improve the way the offloaded timelines are stored. We also make the `retain_lsn` optional so that in the future, when we implement flattening, we can make it None. This also applies to full timeline objects by the way, where it would probably make most sense to add a bool flag whether the timeline is successfully flattened, and if it is, one can exclude it from `retain_lsn` as well. Also, track whether a timeline was offloaded or not in `retain_lsn` so that the `retain_lsn` can be excluded from visibility and size calculation. 
Part of #8088 --- pageserver/src/tenant.rs | 56 ++++++++++++++++---- pageserver/src/tenant/size.rs | 8 +-- pageserver/src/tenant/timeline.rs | 21 +++++--- pageserver/src/tenant/timeline/compaction.rs | 9 ++-- 4 files changed, 71 insertions(+), 23 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index d2818d04dc..397778d4c8 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -493,6 +493,8 @@ pub struct OffloadedTimeline { pub tenant_shard_id: TenantShardId, pub timeline_id: TimelineId, pub ancestor_timeline_id: Option, + /// Whether to retain the branch lsn at the ancestor or not + pub ancestor_retain_lsn: Option, // TODO: once we persist offloaded state, make this lazily constructed pub remote_client: Arc, @@ -504,10 +506,14 @@ pub struct OffloadedTimeline { impl OffloadedTimeline { fn from_timeline(timeline: &Timeline) -> Self { + let ancestor_retain_lsn = timeline + .get_ancestor_timeline_id() + .map(|_timeline_id| timeline.get_ancestor_lsn()); Self { tenant_shard_id: timeline.tenant_shard_id, timeline_id: timeline.timeline_id, ancestor_timeline_id: timeline.get_ancestor_timeline_id(), + ancestor_retain_lsn, remote_client: timeline.remote_client.clone(), delete_progress: timeline.delete_progress.clone(), @@ -515,6 +521,12 @@ impl OffloadedTimeline { } } +#[derive(Copy, Clone, PartialEq, Eq, Hash, Debug)] +pub enum MaybeOffloaded { + Yes, + No, +} + #[derive(Clone)] pub enum TimelineOrOffloaded { Timeline(Arc), @@ -2253,12 +2265,13 @@ impl Tenant { if activating { let timelines_accessor = self.timelines.lock().unwrap(); + let timelines_offloaded_accessor = self.timelines_offloaded.lock().unwrap(); let timelines_to_activate = timelines_accessor .values() .filter(|timeline| !(timeline.is_broken() || timeline.is_stopping())); // Before activation, populate each Timeline's GcInfo with information about its children - self.initialize_gc_info(&timelines_accessor); + self.initialize_gc_info(&timelines_accessor, 
&timelines_offloaded_accessor); // Spawn gc and compaction loops. The loops will shut themselves // down when they notice that the tenant is inactive. @@ -3298,6 +3311,7 @@ impl Tenant { fn initialize_gc_info( &self, timelines: &std::sync::MutexGuard>>, + timelines_offloaded: &std::sync::MutexGuard>>, ) { // This function must be called before activation: after activation timeline create/delete operations // might happen, and this function is not safe to run concurrently with those. @@ -3305,20 +3319,37 @@ impl Tenant { // Scan all timelines. For each timeline, remember the timeline ID and // the branch point where it was created. - let mut all_branchpoints: BTreeMap> = BTreeMap::new(); + let mut all_branchpoints: BTreeMap> = + BTreeMap::new(); timelines.iter().for_each(|(timeline_id, timeline_entry)| { if let Some(ancestor_timeline_id) = &timeline_entry.get_ancestor_timeline_id() { let ancestor_children = all_branchpoints.entry(*ancestor_timeline_id).or_default(); - ancestor_children.push((timeline_entry.get_ancestor_lsn(), *timeline_id)); + ancestor_children.push(( + timeline_entry.get_ancestor_lsn(), + *timeline_id, + MaybeOffloaded::No, + )); } }); + timelines_offloaded + .iter() + .for_each(|(timeline_id, timeline_entry)| { + let Some(ancestor_timeline_id) = &timeline_entry.ancestor_timeline_id else { + return; + }; + let Some(retain_lsn) = timeline_entry.ancestor_retain_lsn else { + return; + }; + let ancestor_children = all_branchpoints.entry(*ancestor_timeline_id).or_default(); + ancestor_children.push((retain_lsn, *timeline_id, MaybeOffloaded::Yes)); + }); // The number of bytes we always keep, irrespective of PITR: this is a constant across timelines let horizon = self.get_gc_horizon(); // Populate each timeline's GcInfo with information about its child branches for timeline in timelines.values() { - let mut branchpoints: Vec<(Lsn, TimelineId)> = all_branchpoints + let mut branchpoints: Vec<(Lsn, TimelineId, MaybeOffloaded)> = all_branchpoints 
.remove(&timeline.timeline_id) .unwrap_or_default(); @@ -4878,7 +4909,10 @@ mod tests { { let branchpoints = &tline.gc_info.read().unwrap().retain_lsns; assert_eq!(branchpoints.len(), 1); - assert_eq!(branchpoints[0], (Lsn(0x40), NEW_TIMELINE_ID)); + assert_eq!( + branchpoints[0], + (Lsn(0x40), NEW_TIMELINE_ID, MaybeOffloaded::No) + ); } // You can read the key from the child branch even though the parent is @@ -8261,8 +8295,8 @@ mod tests { let mut guard = tline.gc_info.write().unwrap(); *guard = GcInfo { retain_lsns: vec![ - (Lsn(0x10), tline.timeline_id), - (Lsn(0x20), tline.timeline_id), + (Lsn(0x10), tline.timeline_id, MaybeOffloaded::No), + (Lsn(0x20), tline.timeline_id, MaybeOffloaded::No), ], cutoffs: GcCutoffs { time: Lsn(0x30), @@ -8489,8 +8523,8 @@ mod tests { let mut guard = tline.gc_info.write().unwrap(); *guard = GcInfo { retain_lsns: vec![ - (Lsn(0x10), tline.timeline_id), - (Lsn(0x20), tline.timeline_id), + (Lsn(0x10), tline.timeline_id, MaybeOffloaded::No), + (Lsn(0x20), tline.timeline_id, MaybeOffloaded::No), ], cutoffs: GcCutoffs { time: Lsn(0x30), @@ -8723,7 +8757,7 @@ mod tests { // Update GC info let mut guard = parent_tline.gc_info.write().unwrap(); *guard = GcInfo { - retain_lsns: vec![(Lsn(0x18), branch_tline.timeline_id)], + retain_lsns: vec![(Lsn(0x18), branch_tline.timeline_id, MaybeOffloaded::No)], cutoffs: GcCutoffs { time: Lsn(0x10), space: Lsn(0x10), @@ -8737,7 +8771,7 @@ mod tests { // Update GC info let mut guard = branch_tline.gc_info.write().unwrap(); *guard = GcInfo { - retain_lsns: vec![(Lsn(0x40), branch_tline.timeline_id)], + retain_lsns: vec![(Lsn(0x40), branch_tline.timeline_id, MaybeOffloaded::No)], cutoffs: GcCutoffs { time: Lsn(0x50), space: Lsn(0x50), diff --git a/pageserver/src/tenant/size.rs b/pageserver/src/tenant/size.rs index 41d558d3f6..4a4c698b56 100644 --- a/pageserver/src/tenant/size.rs +++ b/pageserver/src/tenant/size.rs @@ -12,7 +12,7 @@ use crate::context::RequestContext; use 
crate::pgdatadir_mapping::CalculateLogicalSizeError; use super::{GcError, LogicalSizeCalculationCause, Tenant}; -use crate::tenant::Timeline; +use crate::tenant::{MaybeOffloaded, Timeline}; use utils::id::TimelineId; use utils::lsn::Lsn; @@ -264,10 +264,12 @@ pub(super) async fn gather_inputs( let mut lsns: Vec<(Lsn, LsnKind)> = gc_info .retain_lsns .iter() - .filter(|(lsn, _child_id)| lsn > &ancestor_lsn) + .filter(|(lsn, _child_id, is_offloaded)| { + lsn > &ancestor_lsn && *is_offloaded == MaybeOffloaded::No + }) .copied() // this assumes there are no other retain_lsns than the branchpoints - .map(|(lsn, _child_id)| (lsn, LsnKind::BranchPoint)) + .map(|(lsn, _child_id, _is_offloaded)| (lsn, LsnKind::BranchPoint)) .collect::>(); lsns.extend(lease_points.iter().map(|&lsn| (lsn, LsnKind::LeasePoint))); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 2fd4e699cf..8f098d0e82 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -139,8 +139,10 @@ use self::logical_size::LogicalSize; use self::walreceiver::{WalReceiver, WalReceiverConf}; use super::{ - config::TenantConf, storage_layer::inmemory_layer, storage_layer::LayerVisibilityHint, + config::TenantConf, + storage_layer::{inmemory_layer, LayerVisibilityHint}, upload_queue::NotInitialized, + MaybeOffloaded, }; use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf}; use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe}; @@ -450,7 +452,7 @@ pub(crate) struct GcInfo { /// Currently, this includes all points where child branches have /// been forked off from. In the future, could also include /// explicit user-defined snapshot points. - pub(crate) retain_lsns: Vec<(Lsn, TimelineId)>, + pub(crate) retain_lsns: Vec<(Lsn, TimelineId, MaybeOffloaded)>, /// The cutoff coordinates, which are combined by selecting the minimum. 
pub(crate) cutoffs: GcCutoffs, @@ -467,8 +469,13 @@ impl GcInfo { self.cutoffs.select_min() } - pub(super) fn insert_child(&mut self, child_id: TimelineId, child_lsn: Lsn) { - self.retain_lsns.push((child_lsn, child_id)); + pub(super) fn insert_child( + &mut self, + child_id: TimelineId, + child_lsn: Lsn, + is_offloaded: MaybeOffloaded, + ) { + self.retain_lsns.push((child_lsn, child_id, is_offloaded)); self.retain_lsns.sort_by_key(|i| i.0); } @@ -2164,7 +2171,9 @@ impl Timeline { if let Some(ancestor) = &ancestor { let mut ancestor_gc_info = ancestor.gc_info.write().unwrap(); - ancestor_gc_info.insert_child(timeline_id, metadata.ancestor_lsn()); + // If we construct an explicit timeline object, it's obviously not offloaded + let is_offloaded = MaybeOffloaded::No; + ancestor_gc_info.insert_child(timeline_id, metadata.ancestor_lsn(), is_offloaded); } Arc::new_cyclic(|myself| { @@ -4875,7 +4884,7 @@ impl Timeline { let retain_lsns = gc_info .retain_lsns .iter() - .map(|(lsn, _child_id)| *lsn) + .map(|(lsn, _child_id, _is_offloaded)| *lsn) .collect(); // Gets the maximum LSN that holds the valid lease. 
diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 9f64471432..8b9ace1e5b 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -42,7 +42,7 @@ use crate::tenant::storage_layer::{ use crate::tenant::timeline::ImageLayerCreationOutcome; use crate::tenant::timeline::{drop_rlock, DeltaLayerWriter, ImageLayerWriter}; use crate::tenant::timeline::{Layer, ResidentLayer}; -use crate::tenant::DeltaLayer; +use crate::tenant::{DeltaLayer, MaybeOffloaded}; use crate::virtual_file::{MaybeFatalIo, VirtualFile}; use pageserver_api::config::tenant_conf_defaults::{ DEFAULT_CHECKPOINT_DISTANCE, DEFAULT_COMPACTION_THRESHOLD, @@ -639,7 +639,10 @@ impl Timeline { let children = self.gc_info.read().unwrap().retain_lsns.clone(); let mut readable_points = Vec::with_capacity(children.len() + 1); - for (child_lsn, _child_timeline_id) in &children { + for (child_lsn, _child_timeline_id, is_offloaded) in &children { + if *is_offloaded == MaybeOffloaded::Yes { + continue; + } readable_points.push(*child_lsn); } readable_points.push(head_lsn); @@ -1741,7 +1744,7 @@ impl Timeline { let gc_info = self.gc_info.read().unwrap(); let mut retain_lsns_below_horizon = Vec::new(); let gc_cutoff = gc_info.cutoffs.select_min(); - for (lsn, _timeline_id) in &gc_info.retain_lsns { + for (lsn, _timeline_id, _is_offloaded) in &gc_info.retain_lsns { if lsn < &gc_cutoff { retain_lsns_below_horizon.push(*lsn); } From dab96a6eb159ffa34ff98f8dfc3b2a6862441e02 Mon Sep 17 00:00:00 2001 From: Matthias van de Meent Date: Mon, 14 Oct 2024 20:30:21 +0200 Subject: [PATCH 22/48] Add more timing histogram and gauge metrics to the Neon extension (#9116) We now also track: - Number of PS IOs in-flight - Number of pages cached by smgr prefetch implementation - IO timing histograms for LFC reads and writes, per IO issued ## Problem There's little insight into the timing metrics of LFC, and what the prefetch 
state of each backend is. This changes that, by measuring (and subsequently exposing) these data points. ## Summary of changes - Extract IOHistogram as separate type, rather than a collection of fields on NeonMetrics - others, see items above. Part of https://github.com/neondatabase/neon/issues/8926 --- pgxn/neon/file_cache.c | 27 ++++- pgxn/neon/neon_perf_counters.c | 174 +++++++++++++++++++++------------ pgxn/neon/neon_perf_counters.h | 42 ++++++-- pgxn/neon/pagestore_smgr.c | 35 +++++++ 4 files changed, 205 insertions(+), 73 deletions(-) diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index d789526050..bbea5a8b0d 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -43,6 +43,7 @@ #include "hll.h" #include "bitmap.h" #include "neon.h" +#include "neon_perf_counters.h" #define CriticalAssert(cond) do if (!(cond)) elog(PANIC, "Assertion %s failed at %s:%d: ", #cond, __FILE__, __LINE__); while (0) @@ -114,7 +115,9 @@ typedef struct FileCacheControl uint32 limit; /* shared copy of lfc_size_limit */ uint64 hits; uint64 misses; - uint64 writes; + uint64 writes; /* number of writes issued */ + uint64 time_read; /* time spent reading (us) */ + uint64 time_write; /* time spent writing (us) */ dlist_head lru; /* double linked list for LRU replacement * algorithm */ dlist_head holes; /* double linked list of punched holes */ @@ -270,6 +273,8 @@ lfc_shmem_startup(void) lfc_ctl->hits = 0; lfc_ctl->misses = 0; lfc_ctl->writes = 0; + lfc_ctl->time_read = 0; + lfc_ctl->time_write = 0; dlist_init(&lfc_ctl->lru); dlist_init(&lfc_ctl->holes); @@ -701,6 +706,7 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, int blocks_in_chunk = Min(nblocks, BLOCKS_PER_CHUNK - (blkno % BLOCKS_PER_CHUNK)); int iteration_hits = 0; int iteration_misses = 0; + uint64 io_time_us = 0; Assert(blocks_in_chunk > 0); for (int i = 0; i < blocks_in_chunk; i++) @@ -795,6 +801,13 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber 
blkno, lfc_ctl->misses += iteration_misses; pgBufferUsage.file_cache.hits += iteration_hits; pgBufferUsage.file_cache.misses += iteration_misses; + + if (iteration_hits) + { + lfc_ctl->time_read += io_time_us; + inc_page_cache_read_wait(io_time_us); + } + CriticalAssert(entry->access_count > 0); if (--entry->access_count == 0) dlist_push_tail(&lfc_ctl->lru, &entry->list_node); @@ -859,6 +872,7 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, struct iovec iov[PG_IOV_MAX]; int chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1); int blocks_in_chunk = Min(nblocks, BLOCKS_PER_CHUNK - (blkno % BLOCKS_PER_CHUNK)); + instr_time io_start, io_end; Assert(blocks_in_chunk > 0); for (int i = 0; i < blocks_in_chunk; i++) @@ -947,12 +961,13 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, generation = lfc_ctl->generation; entry_offset = entry->offset; - lfc_ctl->writes += blocks_in_chunk; LWLockRelease(lfc_lock); pgstat_report_wait_start(WAIT_EVENT_NEON_LFC_WRITE); + INSTR_TIME_SET_CURRENT(io_start); rc = pwritev(lfc_desc, iov, blocks_in_chunk, ((off_t) entry_offset * BLOCKS_PER_CHUNK + chunk_offs) * BLCKSZ); + INSTR_TIME_SET_CURRENT(io_end); pgstat_report_wait_end(); if (rc != BLCKSZ * blocks_in_chunk) @@ -965,9 +980,17 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, if (lfc_ctl->generation == generation) { + uint64 time_spent_us; CriticalAssert(LFC_ENABLED()); /* Place entry to the head of LRU list */ CriticalAssert(entry->access_count > 0); + + lfc_ctl->writes += blocks_in_chunk; + INSTR_TIME_SUBTRACT(io_start, io_end); + time_spent_us = INSTR_TIME_GET_MICROSEC(io_start); + lfc_ctl->time_write += time_spent_us; + inc_page_cache_write_wait(time_spent_us); + if (--entry->access_count == 0) dlist_push_tail(&lfc_ctl->lru, &entry->list_node); diff --git a/pgxn/neon/neon_perf_counters.c b/pgxn/neon/neon_perf_counters.c index a497d387c8..05db187076 100644 --- a/pgxn/neon/neon_perf_counters.c +++ 
b/pgxn/neon/neon_perf_counters.c @@ -50,28 +50,52 @@ NeonPerfCountersShmemInit(void) } } -/* - * Count a GetPage wait operation. - */ -void -inc_getpage_wait(uint64 latency_us) +static inline void +inc_iohist(IOHistogram hist, uint64 latency_us) { int lo = 0; - int hi = NUM_GETPAGE_WAIT_BUCKETS - 1; + int hi = NUM_IO_WAIT_BUCKETS - 1; /* Find the right bucket with binary search */ while (lo < hi) { int mid = (lo + hi) / 2; - if (latency_us < getpage_wait_bucket_thresholds[mid]) + if (latency_us < io_wait_bucket_thresholds[mid]) hi = mid; else lo = mid + 1; } - MyNeonCounters->getpage_wait_us_bucket[lo]++; - MyNeonCounters->getpage_wait_us_sum += latency_us; - MyNeonCounters->getpage_wait_us_count++; + hist->wait_us_bucket[lo]++; + hist->wait_us_sum += latency_us; + hist->wait_us_count++; +} + +/* + * Count a GetPage wait operation. + */ +void +inc_getpage_wait(uint64 latency) +{ + inc_iohist(&MyNeonCounters->getpage_hist, latency); +} + +/* + * Count an LFC read wait operation. + */ +void +inc_page_cache_read_wait(uint64 latency) +{ + inc_iohist(&MyNeonCounters->file_cache_read_hist, latency); +} + +/* + * Count an LFC write wait operation. 
+ */ +void +inc_page_cache_write_wait(uint64 latency) +{ + inc_iohist(&MyNeonCounters->file_cache_write_hist, latency); } /* @@ -81,77 +105,91 @@ inc_getpage_wait(uint64 latency_us) typedef struct { - char *name; + const char *name; bool is_bucket; double bucket_le; double value; } metric_t; -static metric_t * -neon_perf_counters_to_metrics(neon_per_backend_counters *counters) +static int +histogram_to_metrics(IOHistogram histogram, + metric_t *metrics, + const char *count, + const char *sum, + const char *bucket) { -#define NUM_METRICS (2 + NUM_GETPAGE_WAIT_BUCKETS + 8) - metric_t *metrics = palloc((NUM_METRICS + 1) * sizeof(metric_t)); - uint64 bucket_accum; - int i = 0; + int i = 0; + uint64 bucket_accum = 0; - metrics[i].name = "getpage_wait_seconds_count"; + metrics[i].name = count; metrics[i].is_bucket = false; - metrics[i].value = (double) counters->getpage_wait_us_count; + metrics[i].value = (double) histogram->wait_us_count; i++; - metrics[i].name = "getpage_wait_seconds_sum"; + metrics[i].name = sum; metrics[i].is_bucket = false; - metrics[i].value = ((double) counters->getpage_wait_us_sum) / 1000000.0; + metrics[i].value = (double) histogram->wait_us_sum / 1000000.0; i++; - - bucket_accum = 0; - for (int bucketno = 0; bucketno < NUM_GETPAGE_WAIT_BUCKETS; bucketno++) + for (int bucketno = 0; bucketno < NUM_IO_WAIT_BUCKETS; bucketno++) { - uint64 threshold = getpage_wait_bucket_thresholds[bucketno]; + uint64 threshold = io_wait_bucket_thresholds[bucketno]; - bucket_accum += counters->getpage_wait_us_bucket[bucketno]; + bucket_accum += histogram->wait_us_bucket[bucketno]; - metrics[i].name = "getpage_wait_seconds_bucket"; + metrics[i].name = bucket; metrics[i].is_bucket = true; metrics[i].bucket_le = (threshold == UINT64_MAX) ? 
INFINITY : ((double) threshold) / 1000000.0; metrics[i].value = (double) bucket_accum; i++; } - metrics[i].name = "getpage_prefetch_requests_total"; - metrics[i].is_bucket = false; - metrics[i].value = (double) counters->getpage_prefetch_requests_total; - i++; - metrics[i].name = "getpage_sync_requests_total"; - metrics[i].is_bucket = false; - metrics[i].value = (double) counters->getpage_sync_requests_total; - i++; - metrics[i].name = "getpage_prefetch_misses_total"; - metrics[i].is_bucket = false; - metrics[i].value = (double) counters->getpage_prefetch_misses_total; - i++; - metrics[i].name = "getpage_prefetch_discards_total"; - metrics[i].is_bucket = false; - metrics[i].value = (double) counters->getpage_prefetch_discards_total; - i++; - metrics[i].name = "pageserver_requests_sent_total"; - metrics[i].is_bucket = false; - metrics[i].value = (double) counters->pageserver_requests_sent_total; - i++; - metrics[i].name = "pageserver_disconnects_total"; - metrics[i].is_bucket = false; - metrics[i].value = (double) counters->pageserver_disconnects_total; - i++; - metrics[i].name = "pageserver_send_flushes_total"; - metrics[i].is_bucket = false; - metrics[i].value = (double) counters->pageserver_send_flushes_total; - i++; - metrics[i].name = "file_cache_hits_total"; - metrics[i].is_bucket = false; - metrics[i].value = (double) counters->file_cache_hits_total; - i++; + + return i; +} + +static metric_t * +neon_perf_counters_to_metrics(neon_per_backend_counters *counters) +{ +#define NUM_METRICS ((2 + NUM_IO_WAIT_BUCKETS) * 3 + 10) + metric_t *metrics = palloc((NUM_METRICS + 1) * sizeof(metric_t)); + int i = 0; + +#define APPEND_METRIC(_name) do { \ + metrics[i].name = #_name; \ + metrics[i].is_bucket = false; \ + metrics[i].value = (double) counters->_name; \ + i++; \ + } while (false) + + i += histogram_to_metrics(&counters->getpage_hist, &metrics[i], + "getpage_wait_seconds_count", + "getpage_wait_seconds_sum", + "getpage_wait_seconds_bucket"); + + 
APPEND_METRIC(getpage_prefetch_requests_total); + APPEND_METRIC(getpage_sync_requests_total); + APPEND_METRIC(getpage_prefetch_misses_total); + APPEND_METRIC(getpage_prefetch_discards_total); + APPEND_METRIC(pageserver_requests_sent_total); + APPEND_METRIC(pageserver_disconnects_total); + APPEND_METRIC(pageserver_send_flushes_total); + APPEND_METRIC(pageserver_open_requests); + APPEND_METRIC(getpage_prefetches_buffered); + + APPEND_METRIC(file_cache_hits_total); + + i += histogram_to_metrics(&counters->file_cache_read_hist, &metrics[i], + "file_cache_read_wait_seconds_count", + "file_cache_read_wait_seconds_sum", + "file_cache_read_wait_seconds_bucket"); + i += histogram_to_metrics(&counters->file_cache_write_hist, &metrics[i], + "file_cache_write_wait_seconds_count", + "file_cache_write_wait_seconds_sum", + "file_cache_write_wait_seconds_bucket"); Assert(i == NUM_METRICS); +#undef APPEND_METRIC +#undef NUM_METRICS + /* NULL entry marks end of array */ metrics[i].name = NULL; metrics[i].value = 0; @@ -216,6 +254,15 @@ neon_get_backend_perf_counters(PG_FUNCTION_ARGS) return (Datum) 0; } +static inline void +histogram_merge_into(IOHistogram into, IOHistogram from) +{ + into->wait_us_count += from->wait_us_count; + into->wait_us_sum += from->wait_us_sum; + for (int bucketno = 0; bucketno < NUM_IO_WAIT_BUCKETS; bucketno++) + into->wait_us_bucket[bucketno] += from->wait_us_bucket[bucketno]; +} + PG_FUNCTION_INFO_V1(neon_get_perf_counters); Datum neon_get_perf_counters(PG_FUNCTION_ARGS) @@ -234,10 +281,7 @@ neon_get_perf_counters(PG_FUNCTION_ARGS) { neon_per_backend_counters *counters = &neon_per_backend_counters_shared[procno]; - totals.getpage_wait_us_count += counters->getpage_wait_us_count; - totals.getpage_wait_us_sum += counters->getpage_wait_us_sum; - for (int bucketno = 0; bucketno < NUM_GETPAGE_WAIT_BUCKETS; bucketno++) - totals.getpage_wait_us_bucket[bucketno] += counters->getpage_wait_us_bucket[bucketno]; + histogram_merge_into(&totals.getpage_hist, 
&counters->getpage_hist); totals.getpage_prefetch_requests_total += counters->getpage_prefetch_requests_total; totals.getpage_sync_requests_total += counters->getpage_sync_requests_total; totals.getpage_prefetch_misses_total += counters->getpage_prefetch_misses_total; @@ -245,7 +289,11 @@ neon_get_perf_counters(PG_FUNCTION_ARGS) totals.pageserver_requests_sent_total += counters->pageserver_requests_sent_total; totals.pageserver_disconnects_total += counters->pageserver_disconnects_total; totals.pageserver_send_flushes_total += counters->pageserver_send_flushes_total; + totals.pageserver_open_requests += counters->pageserver_open_requests; + totals.getpage_prefetches_buffered += counters->getpage_prefetches_buffered; totals.file_cache_hits_total += counters->file_cache_hits_total; + histogram_merge_into(&totals.file_cache_read_hist, &counters->file_cache_read_hist); + histogram_merge_into(&totals.file_cache_write_hist, &counters->file_cache_write_hist); } metrics = neon_perf_counters_to_metrics(&totals); diff --git a/pgxn/neon/neon_perf_counters.h b/pgxn/neon/neon_perf_counters.h index 49d477c4f8..8edc658a30 100644 --- a/pgxn/neon/neon_perf_counters.h +++ b/pgxn/neon/neon_perf_counters.h @@ -15,17 +15,26 @@ #include "storage/proc.h" #endif -static const uint64 getpage_wait_bucket_thresholds[] = { - 20, 30, 60, 100, /* 0 - 100 us */ +static const uint64 io_wait_bucket_thresholds[] = { + 2, 3, 6, 10, /* 0 us - 10 us */ + 20, 30, 60, 100, /* 10 us - 100 us */ 200, 300, 600, 1000, /* 100 us - 1 ms */ 2000, 3000, 6000, 10000, /* 1 ms - 10 ms */ 20000, 30000, 60000, 100000, /* 10 ms - 100 ms */ 200000, 300000, 600000, 1000000, /* 100 ms - 1 s */ 2000000, 3000000, 6000000, 10000000, /* 1 s - 10 s */ - 20000000, 30000000, 60000000, 100000000, /* 10 s - 100 s */ UINT64_MAX, }; -#define NUM_GETPAGE_WAIT_BUCKETS (lengthof(getpage_wait_bucket_thresholds)) +#define NUM_IO_WAIT_BUCKETS (lengthof(io_wait_bucket_thresholds)) + +typedef struct IOHistogramData +{ + uint64 
wait_us_count; + uint64 wait_us_sum; + uint64 wait_us_bucket[NUM_IO_WAIT_BUCKETS]; +} IOHistogramData; + +typedef IOHistogramData *IOHistogram; typedef struct { @@ -39,9 +48,7 @@ typedef struct * the backend, but the 'neon_backend_perf_counters' view will convert * them to seconds, to make them more idiomatic as prometheus metrics. */ - uint64 getpage_wait_us_count; - uint64 getpage_wait_us_sum; - uint64 getpage_wait_us_bucket[NUM_GETPAGE_WAIT_BUCKETS]; + IOHistogramData getpage_hist; /* * Total number of speculative prefetch Getpage requests and synchronous @@ -50,7 +57,11 @@ typedef struct uint64 getpage_prefetch_requests_total; uint64 getpage_sync_requests_total; - /* XXX: It's not clear to me when these misses happen. */ + /* + * Total number of readahead misses; consisting of either prefetches that + * don't satisfy the LSN bounds, or cases where no readahead was issued + * for the read. + */ uint64 getpage_prefetch_misses_total; /* @@ -80,6 +91,16 @@ typedef struct * this can be smaller than pageserver_requests_sent_total. */ uint64 pageserver_send_flushes_total; + + /* + * Number of open requests to PageServer. + */ + uint64 pageserver_open_requests; + + /* + * Number of unused prefetches currently cached in this backend. + */ + uint64 getpage_prefetches_buffered; /* * Number of requests satisfied from the LFC. 
@@ -91,6 +112,9 @@ typedef struct */ uint64 file_cache_hits_total; + /* LFC I/O time buckets */ + IOHistogramData file_cache_read_hist; + IOHistogramData file_cache_write_hist; } neon_per_backend_counters; /* Pointer to the shared memory array of neon_per_backend_counters structs */ @@ -111,6 +135,8 @@ extern neon_per_backend_counters *neon_per_backend_counters_shared; #endif extern void inc_getpage_wait(uint64 latency); +extern void inc_page_cache_read_wait(uint64 latency); +extern void inc_page_cache_write_wait(uint64 latency); extern Size NeonPerfCountersShmemSize(void); extern void NeonPerfCountersShmemInit(void); diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 3d9d9285df..f46df7f70a 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -488,6 +488,11 @@ readahead_buffer_resize(int newsize, void *extra) newPState->n_unused -= 1; } + MyNeonCounters->getpage_prefetches_buffered = + MyPState->n_responses_buffered; + MyNeonCounters->pageserver_open_requests = + MyPState->n_requests_inflight; + for (; end >= MyPState->ring_last && end != UINT64_MAX; end -= 1) { prefetch_set_unused(end); @@ -621,6 +626,8 @@ prefetch_read(PrefetchRequest *slot) MyPState->n_responses_buffered += 1; MyPState->n_requests_inflight -= 1; MyPState->ring_receive += 1; + MyNeonCounters->getpage_prefetches_buffered = + MyPState->n_responses_buffered; /* update slot state */ slot->status = PRFS_RECEIVED; @@ -674,6 +681,15 @@ prefetch_on_ps_disconnect(void) prefetch_set_unused(ring_index); } + + /* + * We can have gone into retry due to network error, so update stats with + * the latest available + */ + MyNeonCounters->pageserver_open_requests = + MyPState->n_requests_inflight; + MyNeonCounters->getpage_prefetches_buffered = + MyPState->n_responses_buffered; } /* @@ -706,6 +722,9 @@ prefetch_set_unused(uint64 ring_index) MyPState->n_responses_buffered -= 1; MyPState->n_unused += 1; + + MyNeonCounters->getpage_prefetches_buffered = + 
MyPState->n_responses_buffered; } else { @@ -820,6 +839,15 @@ prefetch_register_bufferv(BufferTag tag, neon_request_lsns *frlsns, hashkey.buftag = tag; Retry: + /* + * We can have gone into retry due to network error, so update stats with + * the latest available + */ + MyNeonCounters->pageserver_open_requests = + MyPState->ring_unused - MyPState->ring_receive; + MyNeonCounters->getpage_prefetches_buffered = + MyPState->n_responses_buffered; + min_ring_index = UINT64_MAX; for (int i = 0; i < nblocks; i++) { @@ -1001,6 +1029,9 @@ Retry: prefetch_do_request(slot, lsns); } + MyNeonCounters->pageserver_open_requests = + MyPState->ring_unused - MyPState->ring_receive; + Assert(any_hits); Assert(GetPrfSlot(min_ring_index)->status == PRFS_REQUESTED || @@ -1076,8 +1107,10 @@ page_server_request(void const *req) { /* do nothing */ } + MyNeonCounters->pageserver_open_requests++; consume_prefetch_responses(); resp = page_server->receive(shard_no); + MyNeonCounters->pageserver_open_requests--; } PG_CATCH(); { @@ -1086,6 +1119,8 @@ page_server_request(void const *req) * point, but this currently seems fine for now. */ page_server->disconnect(shard_no); + MyNeonCounters->pageserver_open_requests = 0; + PG_RE_THROW(); } PG_END_TRY(); From 0fc4ada3ca9b1eb264bff9c6407ad050722578ae Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Mon, 14 Oct 2024 21:12:43 +0100 Subject: [PATCH 23/48] Switch CI, Storage and Proxy to Debian 12 (Bookworm) (#9170) ## Problem This PR switches CI and Storage to Debian 12 (Bookworm) based images. 
## Summary of changes - Add Debian codename (`bookworm`/`bullseye`) to most of docker tags, create un-codenamed images to be used by default - `vm-compute-node-image`: create a separate spec for `bookworm` (we don't need to build cgroups in the future) - `neon-image`: Switch to `bookworm`-based `build-tools` image - Storage components and Proxy use it - CI: run lints and tests on `bookworm`-based `build-tools` image --- .../actions/allure-report-generate/action.yml | 2 +- .../actions/run-python-test-set/action.yml | 2 +- .github/workflows/_build-and-test-locally.yml | 8 +- .github/workflows/build-build-tools-image.yml | 31 ++-- .github/workflows/build_and_test.yml | 136 ++++++++++-------- .github/workflows/neon_extra_builds.yml | 2 +- .github/workflows/pg-clients.yml | 4 +- .github/workflows/pin-build-tools-image.yml | 23 ++- Dockerfile | 4 +- Dockerfile.build-tools | 19 +-- compute/Dockerfile.compute-node | 27 ++-- compute/vm-image-spec-bookworm.yaml | 126 ++++++++++++++++ ...-spec.yaml => vm-image-spec-bullseye.yaml} | 0 13 files changed, 280 insertions(+), 104 deletions(-) create mode 100644 compute/vm-image-spec-bookworm.yaml rename compute/{vm-image-spec.yaml => vm-image-spec-bullseye.yaml} (100%) diff --git a/.github/actions/allure-report-generate/action.yml b/.github/actions/allure-report-generate/action.yml index 11adc8df86..2bdb727719 100644 --- a/.github/actions/allure-report-generate/action.yml +++ b/.github/actions/allure-report-generate/action.yml @@ -183,7 +183,7 @@ runs: uses: actions/cache@v4 with: path: ~/.cache/pypoetry/virtualenvs - key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-${{ hashFiles('poetry.lock') }} + key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-bookworm-${{ hashFiles('poetry.lock') }} - name: Store Allure test stat in the DB (new) if: ${{ !cancelled() && inputs.store-test-results-into-db == 'true' }} diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index 
330e875d56..037b9aeb1e 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -88,7 +88,7 @@ runs: uses: actions/cache@v4 with: path: ~/.cache/pypoetry/virtualenvs - key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-${{ hashFiles('poetry.lock') }} + key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-bookworm-${{ hashFiles('poetry.lock') }} - name: Install Python deps shell: bash -euxo pipefail {0} diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml index 5fc6aa247a..3aa671fab1 100644 --- a/.github/workflows/_build-and-test-locally.yml +++ b/.github/workflows/_build-and-test-locally.yml @@ -124,28 +124,28 @@ jobs: uses: actions/cache@v4 with: path: pg_install/v14 - key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} - name: Cache postgres v15 build id: cache_pg_15 uses: actions/cache@v4 with: path: pg_install/v15 - key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} - name: Cache postgres v16 build id: cache_pg_16 uses: actions/cache@v4 with: path: pg_install/v16 - key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-bookworm-${{ 
hashFiles('Makefile', 'Dockerfile.build-tools') }} - name: Cache postgres v17 build id: cache_pg_17 uses: actions/cache@v4 with: path: pg_install/v17 - key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v17_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v17_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} - name: Build postgres v14 if: steps.cache_pg_14.outputs.cache-hit != 'true' diff --git a/.github/workflows/build-build-tools-image.yml b/.github/workflows/build-build-tools-image.yml index 130753833d..0f05276579 100644 --- a/.github/workflows/build-build-tools-image.yml +++ b/.github/workflows/build-build-tools-image.yml @@ -43,6 +43,7 @@ jobs: strategy: matrix: + debian-version: [ bullseye, bookworm ] arch: [ x64, arm64 ] runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }} @@ -81,22 +82,22 @@ jobs: - uses: docker/build-push-action@v6 with: + file: Dockerfile.build-tools context: . 
provenance: false push: true pull: true - file: Dockerfile.build-tools - cache-from: type=registry,ref=cache.neon.build/build-tools:cache-${{ matrix.arch }} - cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/build-tools:cache-{0},mode=max', matrix.arch) || '' }} - tags: neondatabase/build-tools:${{ inputs.image-tag }}-${{ matrix.arch }} + build-args: | + DEBIAN_VERSION=${{ matrix.debian-version }} + cache-from: type=registry,ref=cache.neon.build/build-tools:cache-${{ matrix.debian-version }}-${{ matrix.arch }} + cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/build-tools:cache-{0}-{1},mode=max', matrix.debian-version, matrix.arch) || '' }} + tags: | + neondatabase/build-tools:${{ inputs.image-tag }}-${{ matrix.debian-version }}-${{ matrix.arch }} merge-images: needs: [ build-image ] runs-on: ubuntu-22.04 - env: - IMAGE_TAG: ${{ inputs.image-tag }} - steps: - uses: docker/login-action@v3 with: @@ -104,7 +105,17 @@ jobs: password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - name: Create multi-arch image + env: + DEFAULT_DEBIAN_VERSION: bullseye + IMAGE_TAG: ${{ inputs.image-tag }} run: | - docker buildx imagetools create -t neondatabase/build-tools:${IMAGE_TAG} \ - neondatabase/build-tools:${IMAGE_TAG}-x64 \ - neondatabase/build-tools:${IMAGE_TAG}-arm64 + for debian_version in bullseye bookworm; do + tags=("-t" "neondatabase/build-tools:${IMAGE_TAG}-${debian_version}") + if [ "${debian_version}" == "${DEFAULT_DEBIAN_VERSION}" ]; then + tags+=("-t" "neondatabase/build-tools:${IMAGE_TAG}") + fi + + docker buildx imagetools create "${tags[@]}" \ + neondatabase/build-tools:${IMAGE_TAG}-${debian_version}-x64 \ + neondatabase/build-tools:${IMAGE_TAG}-${debian_version}-arm64 + done diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index e7193cfe19..51f6975e63 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -92,7 +92,7 
@@ jobs: needs: [ check-permissions, build-build-tools-image ] runs-on: [ self-hosted, small ] container: - image: ${{ needs.build-build-tools-image.outputs.image }} + image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm credentials: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} @@ -106,7 +106,7 @@ jobs: uses: actions/cache@v4 with: path: ~/.cache/pypoetry/virtualenvs - key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-${{ hashFiles('poetry.lock') }} + key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-bookworm-${{ hashFiles('poetry.lock') }} - name: Install Python deps run: ./scripts/pysync @@ -181,7 +181,7 @@ jobs: runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }} container: - image: ${{ needs.build-build-tools-image.outputs.image }} + image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm credentials: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} @@ -261,7 +261,7 @@ jobs: uses: ./.github/workflows/_build-and-test-locally.yml with: arch: ${{ matrix.arch }} - build-tools-image: ${{ needs.build-build-tools-image.outputs.image }} + build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm build-tag: ${{ needs.tag.outputs.build-tag }} build-type: ${{ matrix.build-type }} # Run tests on all Postgres versions in release builds and only on the latest version in debug builds @@ -276,7 +276,7 @@ jobs: needs: [ check-permissions, build-build-tools-image ] runs-on: [ self-hosted, small ] container: - image: ${{ needs.build-build-tools-image.outputs.image }} + image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm credentials: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} @@ -289,7 +289,7 @@ jobs: uses: actions/cache@v4 with: path: ~/.cache/pypoetry/virtualenvs - key: v1-${{ runner.os }}-${{ 
runner.arch }}-python-deps-${{ hashFiles('poetry.lock') }} + key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-bookworm-${{ hashFiles('poetry.lock') }} - name: Install Python deps run: ./scripts/pysync @@ -309,7 +309,7 @@ jobs: needs: [ check-permissions, build-and-test-locally, build-build-tools-image, get-benchmarks-durations ] runs-on: [ self-hosted, small ] container: - image: ${{ needs.build-build-tools-image.outputs.image }} + image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm credentials: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} @@ -367,7 +367,7 @@ jobs: runs-on: [ self-hosted, small ] container: - image: ${{ needs.build-build-tools-image.outputs.image }} + image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm credentials: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} @@ -415,7 +415,7 @@ jobs: needs: [ check-permissions, build-build-tools-image, build-and-test-locally ] runs-on: [ self-hosted, small ] container: - image: ${{ needs.build-build-tools-image.outputs.image }} + image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm credentials: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} @@ -559,15 +559,16 @@ jobs: ADDITIONAL_RUSTFLAGS=${{ matrix.arch == 'arm64' && '-Ctarget-feature=+lse -Ctarget-cpu=neoverse-n1' || '' }} GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} BUILD_TAG=${{ needs.tag.outputs.build-tag }} - TAG=${{ needs.build-build-tools-image.outputs.image-tag }} + TAG=${{ needs.build-build-tools-image.outputs.image-tag }}-bookworm + DEBIAN_VERSION=bookworm provenance: false push: true pull: true file: Dockerfile - cache-from: type=registry,ref=cache.neon.build/neon:cache-${{ matrix.arch }} - cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/neon:cache-{0},mode=max', matrix.arch) || '' }} + 
cache-from: type=registry,ref=cache.neon.build/neon:cache-bookworm-${{ matrix.arch }} + cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/neon:cache-{0}-{1},mode=max', 'bookworm', matrix.arch) || '' }} tags: | - neondatabase/neon:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }} + neondatabase/neon:${{ needs.tag.outputs.build-tag }}-bookworm-${{ matrix.arch }} neon-image: needs: [ neon-image-arch, tag ] @@ -582,8 +583,9 @@ jobs: - name: Create multi-arch image run: | docker buildx imagetools create -t neondatabase/neon:${{ needs.tag.outputs.build-tag }} \ - neondatabase/neon:${{ needs.tag.outputs.build-tag }}-x64 \ - neondatabase/neon:${{ needs.tag.outputs.build-tag }}-arm64 + -t neondatabase/neon:${{ needs.tag.outputs.build-tag }}-bookworm \ + neondatabase/neon:${{ needs.tag.outputs.build-tag }}-bookworm-x64 \ + neondatabase/neon:${{ needs.tag.outputs.build-tag }}-bookworm-arm64 - uses: docker/login-action@v3 with: @@ -604,17 +606,16 @@ jobs: version: # Much data was already generated on old PG versions with bullseye's # libraries, the locales of which can cause data incompatibilities. - # However, new PG versions should check if they can be built on newer - # images, as that reduces the support burden of old and ancient - # distros. + # However, new PG versions should be build on newer images, + # as that reduces the support burden of old and ancient distros. 
- pg: v14 - debian: bullseye-slim + debian: bullseye - pg: v15 - debian: bullseye-slim + debian: bullseye - pg: v16 - debian: bullseye-slim + debian: bullseye - pg: v17 - debian: bookworm-slim + debian: bookworm arch: [ x64, arm64 ] runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }} @@ -659,16 +660,16 @@ jobs: GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} PG_VERSION=${{ matrix.version.pg }} BUILD_TAG=${{ needs.tag.outputs.build-tag }} - TAG=${{ needs.build-build-tools-image.outputs.image-tag }} - DEBIAN_FLAVOR=${{ matrix.version.debian }} + TAG=${{ needs.build-build-tools-image.outputs.image-tag }}-${{ matrix.version.debian }} + DEBIAN_VERSION=${{ matrix.version.debian }} provenance: false push: true pull: true file: compute/Dockerfile.compute-node - cache-from: type=registry,ref=cache.neon.build/compute-node-${{ matrix.version.pg }}:cache-${{ matrix.arch }} - cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/compute-node-{0}:cache-{1},mode=max', matrix.version.pg, matrix.arch) || '' }} + cache-from: type=registry,ref=cache.neon.build/compute-node-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }} + cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/compute-node-{0}:cache-{1}-{2},mode=max', matrix.version.pg, matrix.version.debian, matrix.arch) || '' }} tags: | - neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }} + neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-${{ matrix.arch }} - name: Build neon extensions test image if: matrix.version.pg == 'v16' @@ -679,17 +680,17 @@ jobs: GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} PG_VERSION=${{ matrix.version.pg }} BUILD_TAG=${{ needs.tag.outputs.build-tag }} - TAG=${{ 
needs.build-build-tools-image.outputs.image-tag }} - DEBIAN_FLAVOR=${{ matrix.version.debian }} + TAG=${{ needs.build-build-tools-image.outputs.image-tag }}-${{ matrix.version.debian }} + DEBIAN_VERSION=${{ matrix.version.debian }} provenance: false push: true pull: true file: compute/Dockerfile.compute-node target: neon-pg-ext-test - cache-from: type=registry,ref=cache.neon.build/neon-test-extensions-${{ matrix.version.pg }}:cache-${{ matrix.arch }} - cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/neon-test-extensions-{0}:cache-{1},mode=max', matrix.version.pg, matrix.arch) || '' }} + cache-from: type=registry,ref=cache.neon.build/neon-test-extensions-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }} + cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/neon-test-extensions-{0}:cache-{1}-{2},mode=max', matrix.version.pg, matrix.version.debian, matrix.arch) || '' }} tags: | - neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{needs.tag.outputs.build-tag}}-${{ matrix.arch }} + neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{needs.tag.outputs.build-tag}}-${{ matrix.version.debian }}-${{ matrix.arch }} - name: Build compute-tools image # compute-tools are Postgres independent, so build it only once @@ -704,14 +705,16 @@ jobs: build-args: | GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} BUILD_TAG=${{ needs.tag.outputs.build-tag }} - TAG=${{ needs.build-build-tools-image.outputs.image-tag }} - DEBIAN_FLAVOR=${{ matrix.version.debian }} + TAG=${{ needs.build-build-tools-image.outputs.image-tag }}-${{ matrix.version.debian }} + DEBIAN_VERSION=${{ matrix.version.debian }} provenance: false push: true pull: true file: compute/Dockerfile.compute-node + cache-from: type=registry,ref=cache.neon.build/neon-test-extensions-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }} + cache-to: ${{ github.ref_name 
== 'main' && format('type=registry,ref=cache.neon.build/compute-tools-{0}:cache-{1}-{2},mode=max', matrix.version.pg, matrix.version.debian, matrix.arch) || '' }} tags: | - neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }} + neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-${{ matrix.arch }} compute-node-image: needs: [ compute-node-image-arch, tag ] @@ -719,7 +722,16 @@ jobs: strategy: matrix: - version: [ v14, v15, v16, v17 ] + version: + # see the comment for `compute-node-image-arch` job + - pg: v14 + debian: bullseye + - pg: v15 + debian: bullseye + - pg: v16 + debian: bullseye + - pg: v17 + debian: bookworm steps: - uses: docker/login-action@v3 @@ -729,23 +741,26 @@ jobs: - name: Create multi-arch compute-node image run: | - docker buildx imagetools create -t neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} \ - neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-x64 \ - neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-arm64 + docker buildx imagetools create -t neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} \ + -t neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }} \ + neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-x64 \ + neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-arm64 - name: Create multi-arch neon-test-extensions image - if: matrix.version == 'v16' + if: matrix.version.pg == 'v16' run: | - docker buildx imagetools create -t neondatabase/neon-test-extensions-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} \ - neondatabase/neon-test-extensions-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-x64 \ - 
neondatabase/neon-test-extensions-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-arm64 + docker buildx imagetools create -t neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} \ + -t neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }} \ + neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-x64 \ + neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-arm64 - name: Create multi-arch compute-tools image - if: matrix.version == 'v17' + if: matrix.version.pg == 'v16' run: | docker buildx imagetools create -t neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }} \ - neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-x64 \ - neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-arm64 + -t neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }} \ + neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-x64 \ + neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-arm64 - uses: docker/login-action@v3 with: @@ -753,13 +768,13 @@ jobs: username: ${{ secrets.AWS_ACCESS_KEY_DEV }} password: ${{ secrets.AWS_SECRET_KEY_DEV }} - - name: Push multi-arch compute-node-${{ matrix.version }} image to ECR + - name: Push multi-arch compute-node-${{ matrix.version.pg }} image to ECR run: | - docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} \ - neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} + docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} \ + 
neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} - name: Push multi-arch compute-tools image to ECR - if: matrix.version == 'v17' + if: matrix.version.pg == 'v16' run: | docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{ needs.tag.outputs.build-tag }} \ neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }} @@ -770,7 +785,16 @@ jobs: strategy: fail-fast: false matrix: - version: [ v14, v15, v16, v17 ] + version: + # see the comment for `compute-node-image-arch` job + - pg: v14 + debian: bullseye + - pg: v15 + debian: bullseye + - pg: v16 + debian: bullseye + - pg: v17 + debian: bookworm env: VM_BUILDER_VERSION: v0.35.0 @@ -792,18 +816,18 @@ jobs: # it won't have the proper authentication (written at v0.6.0) - name: Pulling compute-node image run: | - docker pull neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} + docker pull neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} - name: Build vm image run: | ./vm-builder \ - -spec=compute/vm-image-spec.yaml \ - -src=neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} \ - -dst=neondatabase/vm-compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} + -spec=compute/vm-image-spec-${{ matrix.version.debian }}.yaml \ + -src=neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} \ + -dst=neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} - name: Pushing vm-compute-node image run: | - docker push neondatabase/vm-compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} + docker push neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} test-images: needs: [ check-permissions, tag, neon-image, compute-node-image ] diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml 
index 140aac032a..287c9ea281 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -155,7 +155,7 @@ jobs: github.ref_name == 'main' runs-on: [ self-hosted, large ] container: - image: ${{ needs.build-build-tools-image.outputs.image }} + image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm credentials: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} diff --git a/.github/workflows/pg-clients.yml b/.github/workflows/pg-clients.yml index 23a2e3876c..df40b5beda 100644 --- a/.github/workflows/pg-clients.yml +++ b/.github/workflows/pg-clients.yml @@ -55,7 +55,7 @@ jobs: runs-on: ubuntu-22.04 container: - image: ${{ needs.build-build-tools-image.outputs.image }} + image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm credentials: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} @@ -150,7 +150,7 @@ jobs: runs-on: ubuntu-22.04 container: - image: ${{ needs.build-build-tools-image.outputs.image }} + image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm credentials: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} diff --git a/.github/workflows/pin-build-tools-image.yml b/.github/workflows/pin-build-tools-image.yml index 2e79498fc4..c196d07d3e 100644 --- a/.github/workflows/pin-build-tools-image.yml +++ b/.github/workflows/pin-build-tools-image.yml @@ -71,7 +71,6 @@ jobs: steps: - uses: docker/login-action@v3 - with: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} @@ -94,8 +93,22 @@ jobs: az acr login --name=neoneastus2 - name: Tag build-tools with `${{ env.TO_TAG }}` in Docker Hub, ECR, and ACR + env: + DEFAULT_DEBIAN_VERSION: bullseye run: | - docker buildx imagetools create -t 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${TO_TAG} \ - -t 
neoneastus2.azurecr.io/neondatabase/build-tools:${TO_TAG} \ - -t neondatabase/build-tools:${TO_TAG} \ - neondatabase/build-tools:${FROM_TAG} + for debian_version in bullseye bookworm; do + tags=() + + tags+=("-t" "neondatabase/build-tools:${TO_TAG}-${debian_version}") + tags+=("-t" "369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${TO_TAG}-${debian_version}") + tags+=("-t" "neoneastus2.azurecr.io/neondatabase/build-tools:${TO_TAG}-${debian_version}") + + if [ "${debian_version}" == "${DEFAULT_DEBIAN_VERSION}" ]; then + tags+=("-t" "neondatabase/build-tools:${TO_TAG}") + tags+=("-t" "369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${TO_TAG}") + tags+=("-t" "neoneastus2.azurecr.io/neondatabase/build-tools:${TO_TAG}") + fi + + docker buildx imagetools create "${tags[@]}" \ + neondatabase/build-tools:${FROM_TAG}-${debian_version} + done diff --git a/Dockerfile b/Dockerfile index bdb76a4f4f..785dd4598e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -7,6 +7,8 @@ ARG IMAGE=build-tools ARG TAG=pinned ARG DEFAULT_PG_VERSION=17 ARG STABLE_PG_VERSION=16 +ARG DEBIAN_VERSION=bullseye +ARG DEBIAN_FLAVOR=${DEBIAN_VERSION}-slim # Build Postgres FROM $REPOSITORY/$IMAGE:$TAG AS pg-build @@ -57,7 +59,7 @@ RUN set -e \ # Build final image # -FROM debian:bullseye-slim +FROM debian:${DEBIAN_FLAVOR} ARG DEFAULT_PG_VERSION WORKDIR /data diff --git a/Dockerfile.build-tools b/Dockerfile.build-tools index d8bcacf228..54e9134257 100644 --- a/Dockerfile.build-tools +++ b/Dockerfile.build-tools @@ -1,12 +1,7 @@ -FROM debian:bullseye-slim +ARG DEBIAN_VERSION=bullseye -# Use ARG as a build-time environment variable here to allow. -# It's not supposed to be set outside. -# Alternatively it can be obtained using the following command -# ``` -# . 
/etc/os-release && echo "${VERSION_CODENAME}" -# ``` -ARG DEBIAN_VERSION_CODENAME=bullseye +FROM debian:${DEBIAN_VERSION}-slim +ARG DEBIAN_VERSION # Add nonroot user RUN useradd -ms /bin/bash nonroot -b /home @@ -42,14 +37,14 @@ RUN set -e \ libseccomp-dev \ libsqlite3-dev \ libssl-dev \ - libstdc++-10-dev \ + $([[ "${DEBIAN_VERSION}" = "bullseye" ]] && libstdc++-10-dev || libstdc++-11-dev) \ libtool \ libxml2-dev \ libxmlsec1-dev \ libxxhash-dev \ lsof \ make \ - netcat \ + netcat-openbsd \ net-tools \ openssh-client \ parallel \ @@ -78,7 +73,7 @@ RUN curl -sL "https://github.com/peak/s5cmd/releases/download/v${S5CMD_VERSION}/ # LLVM ENV LLVM_VERSION=18 RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \ - && echo "deb http://apt.llvm.org/${DEBIAN_VERSION_CODENAME}/ llvm-toolchain-${DEBIAN_VERSION_CODENAME}-${LLVM_VERSION} main" > /etc/apt/sources.list.d/llvm.stable.list \ + && echo "deb http://apt.llvm.org/${DEBIAN_VERSION}/ llvm-toolchain-${DEBIAN_VERSION}-${LLVM_VERSION} main" > /etc/apt/sources.list.d/llvm.stable.list \ && apt update \ && apt install -y clang-${LLVM_VERSION} llvm-${LLVM_VERSION} \ && bash -c 'for f in /usr/bin/clang*-${LLVM_VERSION} /usr/bin/llvm*-${LLVM_VERSION}; do ln -s "${f}" "${f%-${LLVM_VERSION}}"; done' \ @@ -86,7 +81,7 @@ RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \ # Install docker RUN curl -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg \ - && echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/debian ${DEBIAN_VERSION_CODENAME} stable" > /etc/apt/sources.list.d/docker.list \ + && echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/debian ${DEBIAN_VERSION} stable" > /etc/apt/sources.list.d/docker.list \ && apt update \ && apt install -y 
docker-ce docker-ce-cli \ && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* diff --git a/compute/Dockerfile.compute-node b/compute/Dockerfile.compute-node index 15afb9897f..91528618da 100644 --- a/compute/Dockerfile.compute-node +++ b/compute/Dockerfile.compute-node @@ -3,7 +3,8 @@ ARG REPOSITORY=neondatabase ARG IMAGE=build-tools ARG TAG=pinned ARG BUILD_TAG -ARG DEBIAN_FLAVOR=bullseye-slim +ARG DEBIAN_VERSION=bullseye +ARG DEBIAN_FLAVOR=${DEBIAN_VERSION}-slim ######################################################################################### # @@ -11,20 +12,23 @@ ARG DEBIAN_FLAVOR=bullseye-slim # ######################################################################################### FROM debian:$DEBIAN_FLAVOR AS build-deps -ARG DEBIAN_FLAVOR +ARG DEBIAN_VERSION -RUN case $DEBIAN_FLAVOR in \ +RUN case $DEBIAN_VERSION in \ # Version-specific installs for Bullseye (PG14-PG16): # The h3_pg extension needs a cmake 3.20+, but Debian bullseye has 3.18. # Install newer version (3.25) from backports. 
- bullseye*) \ + bullseye) \ echo "deb http://deb.debian.org/debian bullseye-backports main" > /etc/apt/sources.list.d/bullseye-backports.list; \ VERSION_INSTALLS="cmake/bullseye-backports cmake-data/bullseye-backports"; \ ;; \ # Version-specific installs for Bookworm (PG17): - bookworm*) \ + bookworm) \ VERSION_INSTALLS="cmake"; \ ;; \ + *) \ + echo "Unknown Debian version ${DEBIAN_VERSION}" && exit 1 \ + ;; \ esac && \ apt update && \ apt install --no-install-recommends -y git autoconf automake libtool build-essential bison flex libreadline-dev \ @@ -1091,7 +1095,6 @@ RUN cd compute_tools && mold -run cargo build --locked --profile release-line-de ######################################################################################### FROM debian:$DEBIAN_FLAVOR AS compute-tools-image -ARG DEBIAN_FLAVOR COPY --from=compute-tools /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl @@ -1102,7 +1105,6 @@ COPY --from=compute-tools /home/nonroot/target/release-line-debug-size-lto/compu ######################################################################################### FROM debian:$DEBIAN_FLAVOR AS pgbouncer -ARG DEBIAN_FLAVOR RUN set -e \ && apt-get update \ && apt-get install --no-install-recommends -y \ @@ -1257,7 +1259,7 @@ ENV PGDATABASE=postgres # ######################################################################################### FROM debian:$DEBIAN_FLAVOR -ARG DEBIAN_FLAVOR +ARG DEBIAN_VERSION # Add user postgres RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \ echo "postgres:test_console_pass" | chpasswd && \ @@ -1305,19 +1307,22 @@ RUN mkdir /usr/local/download_extensions && chown -R postgres:postgres /usr/loca RUN apt update && \ - case $DEBIAN_FLAVOR in \ + case $DEBIAN_VERSION in \ # Version-specific installs for Bullseye (PG14-PG16): # libicu67, locales for collations (including ICU and plpgsql_check) # libgdal28, libproj19 for PostGIS - bullseye*) \ + bullseye) \ VERSION_INSTALLS="libicu67 
libgdal28 libproj19"; \ ;; \ # Version-specific installs for Bookworm (PG17): # libicu72, locales for collations (including ICU and plpgsql_check) # libgdal32, libproj25 for PostGIS - bookworm*) \ + bookworm) \ VERSION_INSTALLS="libicu72 libgdal32 libproj25"; \ ;; \ + *) \ + echo "Unknown Debian version ${DEBIAN_VERSION}" && exit 1 \ + ;; \ esac && \ apt install --no-install-recommends -y \ gdb \ diff --git a/compute/vm-image-spec-bookworm.yaml b/compute/vm-image-spec-bookworm.yaml new file mode 100644 index 0000000000..51a55b513f --- /dev/null +++ b/compute/vm-image-spec-bookworm.yaml @@ -0,0 +1,126 @@ +# Supplemental file for neondatabase/autoscaling's vm-builder, for producing the VM compute image. +--- +commands: + - name: cgconfigparser + user: root + sysvInitAction: sysinit + shell: 'cgconfigparser -l /etc/cgconfig.conf -s 1664' + # restrict permissions on /neonvm/bin/resize-swap, because we grant access to compute_ctl for + # running it as root. + - name: chmod-resize-swap + user: root + sysvInitAction: sysinit + shell: 'chmod 711 /neonvm/bin/resize-swap' + - name: chmod-set-disk-quota + user: root + sysvInitAction: sysinit + shell: 'chmod 711 /neonvm/bin/set-disk-quota' + - name: pgbouncer + user: postgres + sysvInitAction: respawn + shell: '/usr/local/bin/pgbouncer /etc/pgbouncer.ini' + - name: local_proxy + user: postgres + sysvInitAction: respawn + shell: '/usr/local/bin/local_proxy --config-path /etc/local_proxy/config.json --pid-path /etc/local_proxy/pid --http 0.0.0.0:10432' + - name: postgres-exporter + user: nobody + sysvInitAction: respawn + shell: 'DATA_SOURCE_NAME="user=cloud_admin sslmode=disable dbname=postgres application_name=postgres-exporter" /bin/postgres_exporter' + - name: sql-exporter + user: nobody + sysvInitAction: respawn + shell: '/bin/sql_exporter -config.file=/etc/sql_exporter.yml -web.listen-address=:9399' + - name: sql-exporter-autoscaling + user: nobody + sysvInitAction: respawn + shell: '/bin/sql_exporter 
-config.file=/etc/sql_exporter_autoscaling.yml -web.listen-address=:9499' +shutdownHook: | + su -p postgres --session-command '/usr/local/bin/pg_ctl stop -D /var/db/postgres/compute/pgdata -m fast --wait -t 10' +files: + - filename: compute_ctl-sudoers + content: | + # Allow postgres user (which is what compute_ctl runs as) to run /neonvm/bin/resize-swap + # and /neonvm/bin/set-disk-quota as root without requiring entering a password (NOPASSWD), + # regardless of hostname (ALL) + postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota + - filename: cgconfig.conf + content: | + # Configuration for cgroups in VM compute nodes + group neon-postgres { + perm { + admin { + uid = postgres; + } + task { + gid = users; + } + } + memory {} + } +build: | + # Build cgroup-tools + # + # At time of writing (2023-03-14), debian bullseye has a version of cgroup-tools (technically + # libcgroup) that doesn't support cgroup v2 (version 0.41-11). Unfortunately, the vm-monitor + # requires cgroup v2, so we'll build cgroup-tools ourselves. + # + # At time of migration to bookworm (2024-10-09), debian has a version of libcgroup/cgroup-tools 2.0.2, + # and it _probably_ can be used as-is. However, we'll build it ourselves to minimise the changeset + # for debian version migration. 
+ # + FROM debian:bookworm-slim as libcgroup-builder + ENV LIBCGROUP_VERSION=v2.0.3 + + RUN set -exu \ + && apt update \ + && apt install --no-install-recommends -y \ + git \ + ca-certificates \ + automake \ + cmake \ + make \ + gcc \ + byacc \ + flex \ + libtool \ + libpam0g-dev \ + && git clone --depth 1 -b $LIBCGROUP_VERSION https://github.com/libcgroup/libcgroup \ + && INSTALL_DIR="/libcgroup-install" \ + && mkdir -p "$INSTALL_DIR/bin" "$INSTALL_DIR/include" \ + && cd libcgroup \ + # extracted from bootstrap.sh, with modified flags: + && (test -d m4 || mkdir m4) \ + && autoreconf -fi \ + && rm -rf autom4te.cache \ + && CFLAGS="-O3" ./configure --prefix="$INSTALL_DIR" --sysconfdir=/etc --localstatedir=/var --enable-opaque-hierarchy="name=systemd" \ + # actually build the thing... + && make install +merge: | + # tweak nofile limits + RUN set -e \ + && echo 'fs.file-max = 1048576' >>/etc/sysctl.conf \ + && test ! -e /etc/security || ( \ + echo '* - nofile 1048576' >>/etc/security/limits.conf \ + && echo 'root - nofile 1048576' >>/etc/security/limits.conf \ + ) + + # Allow postgres user (compute_ctl) to run swap resizer. + # Need to install sudo in order to allow this. + # + # Also, remove the 'read' permission from group/other on /neonvm/bin/resize-swap, just to be safe. 
+ RUN set -e \ + && apt update \ + && apt install --no-install-recommends -y \ + sudo \ + && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* + COPY compute_ctl-sudoers /etc/sudoers.d/compute_ctl-sudoers + + COPY cgconfig.conf /etc/cgconfig.conf + + RUN set -e \ + && chmod 0644 /etc/cgconfig.conf + + COPY --from=libcgroup-builder /libcgroup-install/bin/* /usr/bin/ + COPY --from=libcgroup-builder /libcgroup-install/lib/* /usr/lib/ + COPY --from=libcgroup-builder /libcgroup-install/sbin/* /usr/sbin/ diff --git a/compute/vm-image-spec.yaml b/compute/vm-image-spec-bullseye.yaml similarity index 100% rename from compute/vm-image-spec.yaml rename to compute/vm-image-spec-bullseye.yaml From 73c6626b381bd013064d72332c3a0a372c555877 Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 15 Oct 2024 09:31:18 +0100 Subject: [PATCH 24/48] pageserver: stabilize & refine controller scale test (#8971) ## Problem We were seeing timeouts on migrations in this test. The test unfortunately tends to saturate local storage, which is shared between the pageservers and the control plane database, which makes the test kind of unrealistic. We will also want to increase the scale of this test, so it's worth fixing that. ## Summary of changes - Instead of randomly creating timelines at the same time as the other background operations, explicitly identify a subset of tenants which will have timelines, and create them at the start. This avoids pageservers putting a lot of load on the test node during the main body of the test. - Adjust the tenants created to create some number of 8 shard tenants and the rest 1 shard tenants, instead of just creating a lot of 2 shard tenants. - Use archival_config to exercise tenant-mutating operations, instead of using timeline creation for this. - Adjust reconcile_until_idle calls to avoid waiting 5 seconds between calls, which causes timeouts with large shard count tenants.
- Fix a pageserver bug where calls to archival_config during activation get 404 --- libs/utils/src/http/error.rs | 7 + pageserver/src/http/routes.rs | 2 + proxy/src/serverless/http_util.rs | 4 + storage_controller/src/service.rs | 5 + test_runner/fixtures/neon_fixtures.py | 6 +- .../test_storage_controller_scale.py | 225 ++++++++++++++---- 6 files changed, 204 insertions(+), 45 deletions(-) diff --git a/libs/utils/src/http/error.rs b/libs/utils/src/http/error.rs index 5e05e4e713..02fc9e3b99 100644 --- a/libs/utils/src/http/error.rs +++ b/libs/utils/src/http/error.rs @@ -28,6 +28,9 @@ pub enum ApiError { #[error("Resource temporarily unavailable: {0}")] ResourceUnavailable(Cow<'static, str>), + #[error("Too many requests: {0}")] + TooManyRequests(Cow<'static, str>), + #[error("Shutting down")] ShuttingDown, @@ -73,6 +76,10 @@ impl ApiError { err.to_string(), StatusCode::SERVICE_UNAVAILABLE, ), + ApiError::TooManyRequests(err) => HttpErrorBody::response_from_msg_and_status( + err.to_string(), + StatusCode::TOO_MANY_REQUESTS, + ), ApiError::Timeout(err) => HttpErrorBody::response_from_msg_and_status( err.to_string(), StatusCode::REQUEST_TIMEOUT, diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 2985ab1efb..1079d8df29 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -715,6 +715,8 @@ async fn timeline_archival_config_handler( .tenant_manager .get_attached_tenant_shard(tenant_shard_id)?; + tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; + tenant .apply_timeline_archival_config(timeline_id, request_data.state, ctx) .await?; diff --git a/proxy/src/serverless/http_util.rs b/proxy/src/serverless/http_util.rs index 87a72ec5f0..c1c5764d17 100644 --- a/proxy/src/serverless/http_util.rs +++ b/proxy/src/serverless/http_util.rs @@ -41,6 +41,10 @@ pub(crate) fn api_error_into_response(this: ApiError) -> Response HttpErrorBody::response_from_msg_and_status( + err.to_string(), + StatusCode::TOO_MANY_REQUESTS, 
+ ), ApiError::Timeout(err) => HttpErrorBody::response_from_msg_and_status( err.to_string(), StatusCode::REQUEST_TIMEOUT, diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index cc735dc27e..cedee54534 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -246,6 +246,11 @@ fn passthrough_api_error(node: &Node, e: mgmt_api::Error) -> ApiError { // storage controller's auth configuration. ApiError::InternalServerError(anyhow::anyhow!("{node} {status}: {msg}")) } + mgmt_api::Error::ApiError(status @ StatusCode::TOO_MANY_REQUESTS, msg) => { + // Pass through 429 errors: if pageserver is asking us to wait + retry, we in + // turn ask our clients to wait + retry + ApiError::Conflict(format!("{node} {status}: {status} {msg}")) + } mgmt_api::Error::ApiError(status, msg) => { // Presume general case of pageserver API errors is that we tried to do something // that can't be done right now. diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 059707c8ed..a313ac2ed3 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1986,11 +1986,11 @@ class NeonStorageController(MetricsGetter, LogUtils): log.info(f"reconcile_all waited for {n} shards") return n - def reconcile_until_idle(self, timeout_secs=30): + def reconcile_until_idle(self, timeout_secs=30, max_interval=5): start_at = time.time() n = 1 - delay_sec = 0.5 - delay_max = 5 + delay_sec = 0.1 + delay_max = max_interval while n > 0: n = self.reconcile_all() if n == 0: diff --git a/test_runner/performance/test_storage_controller_scale.py b/test_runner/performance/test_storage_controller_scale.py index 452a856714..d2eba751f8 100644 --- a/test_runner/performance/test_storage_controller_scale.py +++ b/test_runner/performance/test_storage_controller_scale.py @@ -4,9 +4,10 @@ import concurrent.futures import random import time from collections import defaultdict +from enum 
import Enum import pytest -from fixtures.common_types import TenantId, TenantShardId, TimelineId +from fixtures.common_types import TenantId, TenantShardId, TimelineArchivalState, TimelineId from fixtures.compute_reconfigure import ComputeReconfigure from fixtures.log_helper import log from fixtures.neon_fixtures import ( @@ -34,6 +35,7 @@ def get_consistent_node_shard_counts(env: NeonEnv, total_shards) -> defaultdict[ if tenant_placement[tid]["intent"]["attached"] == tenant_placement[tid]["observed"]["attached"] } + assert len(matching) == total_shards attached_per_node: defaultdict[str, int] = defaultdict(int) @@ -107,15 +109,48 @@ def test_storage_controller_many_tenants( ps.allowed_errors.append(".*request was dropped before completing.*") # Total tenants - tenant_count = 4000 + small_tenant_count = 7800 + large_tenant_count = 200 + tenant_count = small_tenant_count + large_tenant_count + large_tenant_shard_count = 8 + total_shards = small_tenant_count + large_tenant_count * large_tenant_shard_count - # Shards per tenant - shard_count = 2 - stripe_size = 1024 + # A small stripe size to encourage all shards to get some data + stripe_size = 1 - total_shards = tenant_count * shard_count + # We use a fixed seed to make the test somewhat reproducible: we want a randomly + # chosen order in the sense that it's arbitrary, but not in the sense that it should change every run. 
+ rng = random.Random(1234) - tenants = set(TenantId.generate() for _i in range(0, tenant_count)) + class Tenant: + def __init__(self): + # Tenants may optionally contain a timeline + self.timeline_id = None + + # Tenants may be marked as 'large' to get multiple shards during creation phase + self.large = False + + tenant_ids = list(TenantId.generate() for _i in range(0, tenant_count)) + tenants = dict((tid, Tenant()) for tid in tenant_ids) + + # We will create timelines in only a subset of tenants, because creating timelines + # does many megabytes of IO, and we want to densely simulate huge tenant counts on + # a single test node. + tenant_timelines_count = 100 + + # These lists are maintained for use with rng.choice + tenants_with_timelines = list(rng.sample(tenants.keys(), tenant_timelines_count)) + tenants_without_timelines = list( + tenant_id for tenant_id in tenants if tenant_id not in tenants_with_timelines + ) + + # For our sharded tenants, we will make half of them with timelines and half without + assert large_tenant_count >= tenant_timelines_count / 2 + for tenant_id in tenants_with_timelines[0 : large_tenant_count // 2]: + tenants[tenant_id].large = True + + for tenant_id in tenants_without_timelines[0 : large_tenant_count // 2]: + tenants[tenant_id].large = True virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True) @@ -125,23 +160,39 @@ def test_storage_controller_many_tenants( rss = env.storage_controller.get_metric_value("process_resident_memory_bytes") assert rss is not None - log.info(f"Resident memory: {rss} ({ rss / (shard_count * tenant_count)} per shard)") - assert rss < expect_memory_per_shard * shard_count * tenant_count - - # We use a fixed seed to make the test somewhat reproducible: we want a randomly - # chosen order in the sense that it's arbitrary, but not in the sense that it should change every run. 
- rng = random.Random(1234) + log.info(f"Resident memory: {rss} ({ rss / total_shards} per shard)") + assert rss < expect_memory_per_shard * total_shards # Issue more concurrent operations than the storage controller's reconciler concurrency semaphore # permits, to ensure that we are exercising stressing that. api_concurrency = 135 - # We will create tenants directly via API, not via neon_local, to avoid any false - # serialization of operations in neon_local (it e.g. loads/saves a config file on each call) - with concurrent.futures.ThreadPoolExecutor(max_workers=api_concurrency) as executor: - futs = [] + # A different concurrency limit for bulk tenant+timeline creations: these do I/O and will + # start timing out on test nodes if we aren't a bit careful. + create_concurrency = 16 + + class Operation(str, Enum): + TIMELINE_OPS = "timeline_ops" + SHARD_MIGRATE = "shard_migrate" + TENANT_PASSTHROUGH = "tenant_passthrough" + + run_ops = api_concurrency * 4 + assert run_ops < len(tenants) + + # Creation phase: make a lot of tenants, and create timelines in a subset of them + # This executor has concurrency set modestly, to avoid overloading pageservers with timeline creations. + with concurrent.futures.ThreadPoolExecutor(max_workers=create_concurrency) as executor: + tenant_create_futs = [] t1 = time.time() - for tenant_id in tenants: + + for tenant_id, tenant in tenants.items(): + if tenant.large: + shard_count = large_tenant_shard_count + else: + shard_count = 1 + + # We will create tenants directly via API, not via neon_local, to avoid any false + # serialization of operations in neon_local (it e.g. 
loads/saves a config file on each call) f = executor.submit( env.storage_controller.tenant_create, tenant_id, @@ -152,44 +203,106 @@ def test_storage_controller_many_tenants( tenant_config={"heatmap_period": "10s"}, placement_policy={"Attached": 1}, ) - futs.append(f) + tenant_create_futs.append(f) - # Wait for creations to finish - for f in futs: + # Wait for tenant creations to finish + for f in tenant_create_futs: f.result() log.info( f"Created {len(tenants)} tenants in {time.time() - t1}, {len(tenants) / (time.time() - t1)}/s" ) - run_ops = api_concurrency * 4 - assert run_ops < len(tenants) - op_tenants = list(tenants)[0:run_ops] + # Waiting for optimizer to stabilize, if it disagrees with scheduling (the correct behavior + # would be for original scheduling decisions to always match optimizer's preference) + # (workaround for https://github.com/neondatabase/neon/issues/8969) + env.storage_controller.reconcile_until_idle(max_interval=0.1, timeout_secs=120) + + # Create timelines in those tenants which are going to get one + t1 = time.time() + timeline_create_futs = [] + for tenant_id in tenants_with_timelines: + timeline_id = TimelineId.generate() + tenants[tenant_id].timeline_id = timeline_id + f = executor.submit( + env.storage_controller.pageserver_api().timeline_create, + PgVersion.NOT_SET, + tenant_id, + timeline_id, + ) + timeline_create_futs.append(f) + + for f in timeline_create_futs: + f.result() + log.info( + f"Created {len(tenants_with_timelines)} timelines in {time.time() - t1}, {len(tenants_with_timelines) / (time.time() - t1)}/s" + ) + + # Plan operations: ensure each tenant with a timeline gets at least + # one of each operation type. Then add other tenants to make up the + # numbers. 
+ ops_plan = [] + for tenant_id in tenants_with_timelines: + ops_plan.append((tenant_id, Operation.TIMELINE_OPS)) + ops_plan.append((tenant_id, Operation.SHARD_MIGRATE)) + ops_plan.append((tenant_id, Operation.TENANT_PASSTHROUGH)) + + # Fill up remaining run_ops with migrations of tenants without timelines + other_migrate_tenants = rng.sample(tenants_without_timelines, run_ops - len(ops_plan)) + + for tenant_id in other_migrate_tenants: + ops_plan.append( + ( + tenant_id, + rng.choice([Operation.SHARD_MIGRATE, Operation.TENANT_PASSTHROUGH]), + ) + ) + + # Exercise phase: pick pseudo-random operations to do on the tenants + timelines + # This executor has concurrency high enough to stress the storage controller API. + with concurrent.futures.ThreadPoolExecutor(max_workers=api_concurrency) as executor: + + def exercise_timeline_ops(tenant_id, timeline_id): + # A read operation: this requires looking up shard zero and routing there + detail = virtual_ps_http.timeline_detail(tenant_id, timeline_id) + assert detail["timeline_id"] == str(timeline_id) + + # A fan-out write operation to all shards in a tenant. + # - We use a metadata operation rather than something like a timeline create, because + # timeline creations are I/O intensive and this test isn't meant to be a stress test for + # doing lots of concurrent timeline creations. 
+ archival_state = rng.choice( + [TimelineArchivalState.ARCHIVED, TimelineArchivalState.UNARCHIVED] + ) + virtual_ps_http.timeline_archival_config(tenant_id, timeline_id, archival_state) # Generate a mixture of operations and dispatch them all concurrently futs = [] - for tenant_id in op_tenants: - op = rng.choice([0, 1, 2]) - if op == 0: - # A fan-out write operation to all shards in a tenant (timeline creation) + for tenant_id, op in ops_plan: + if op == Operation.TIMELINE_OPS: + op_timeline_id = tenants[tenant_id].timeline_id + assert op_timeline_id is not None + + # Exercise operations that modify tenant scheduling state but require traversing + # the fan-out-to-all-shards functionality. f = executor.submit( - virtual_ps_http.timeline_create, - PgVersion.NOT_SET, + exercise_timeline_ops, tenant_id, - TimelineId.generate(), + op_timeline_id, ) - elif op == 1: + elif op == Operation.SHARD_MIGRATE: # A reconciler operation: migrate a shard. - shard_number = rng.randint(0, shard_count - 1) - tenant_shard_id = TenantShardId(tenant_id, shard_number, shard_count) + desc = env.storage_controller.tenant_describe(tenant_id) + + shard_number = rng.randint(0, len(desc["shards"]) - 1) + tenant_shard_id = TenantShardId(tenant_id, shard_number, len(desc["shards"])) # Migrate it to its secondary location - desc = env.storage_controller.tenant_describe(tenant_id) dest_ps_id = desc["shards"][shard_number]["node_secondary"][0] f = executor.submit( env.storage_controller.tenant_shard_migrate, tenant_shard_id, dest_ps_id ) - elif op == 2: + elif op == Operation.TENANT_PASSTHROUGH: # A passthrough read to shard zero f = executor.submit(virtual_ps_http.tenant_status, tenant_id) @@ -199,10 +312,18 @@ def test_storage_controller_many_tenants( for f in futs: f.result() + log.info("Completed mixed operations phase") + # Some of the operations above (notably migrations) might leave the controller in a state where it has # some work to do, for example optimizing shard placement after we do 
a random migration. Wait for the system # to reach a quiescent state before doing following checks. - env.storage_controller.reconcile_until_idle() + # + # - Set max_interval low because we probably have a significant number of optimizations to complete and would like + # the test to run quickly. + # - Set timeout high because we might be waiting for optimizations that require a secondary + # to warm up, and if we just started a secondary in the previous step, it might wait some time + # before downloading its heatmap + env.storage_controller.reconcile_until_idle(max_interval=0.1, timeout_secs=120) env.storage_controller.consistency_check() check_memory() @@ -213,6 +334,7 @@ def test_storage_controller_many_tenants( # # We do not require that the system is quiescent already here, although at present in this point in the test # that may be the case. + log.info("Reconciling all & timing") while True: t1 = time.time() reconcilers = env.storage_controller.reconcile_all() @@ -225,6 +347,7 @@ def test_storage_controller_many_tenants( break # Restart the storage controller + log.info("Restarting controller") env.storage_controller.stop() env.storage_controller.start() @@ -246,7 +369,16 @@ def test_storage_controller_many_tenants( # Restart pageservers gracefully: this exercises the /re-attach pageserver API # and the storage controller drain and fill API + log.info("Restarting pageservers...") + + # Parameters for how long we expect it to take to migrate all of the tenants from/to + # a node during a drain/fill operation + DRAIN_FILL_TIMEOUT = 240 + DRAIN_FILL_BACKOFF = 5 + for ps in env.pageservers: + log.info(f"Draining pageserver {ps.id}") + t1 = time.time() env.storage_controller.retryable_node_operation( lambda ps_id: env.storage_controller.node_drain(ps_id), ps.id, max_attempts=3, backoff=2 ) @@ -255,9 +387,10 @@ def test_storage_controller_many_tenants( ps.id, PageserverAvailability.ACTIVE, PageserverSchedulingPolicy.PAUSE_FOR_RESTART, - max_attempts=24, - 
backoff=5, + max_attempts=DRAIN_FILL_TIMEOUT // DRAIN_FILL_BACKOFF, + backoff=DRAIN_FILL_BACKOFF, ) + log.info(f"Drained pageserver {ps.id} in {time.time() - t1}s") shard_counts = get_consistent_node_shard_counts(env, total_shards) log.info(f"Shard counts after draining node {ps.id}: {shard_counts}") @@ -275,6 +408,7 @@ def test_storage_controller_many_tenants( backoff=1, ) + log.info(f"Filling pageserver {ps.id}") env.storage_controller.retryable_node_operation( lambda ps_id: env.storage_controller.node_fill(ps_id), ps.id, max_attempts=3, backoff=2 ) @@ -282,16 +416,23 @@ def test_storage_controller_many_tenants( ps.id, PageserverAvailability.ACTIVE, PageserverSchedulingPolicy.ACTIVE, - max_attempts=24, - backoff=5, + max_attempts=DRAIN_FILL_TIMEOUT // DRAIN_FILL_BACKOFF, + backoff=DRAIN_FILL_BACKOFF, ) + log.info(f"Filled pageserver {ps.id} in {time.time() - t1}s") + + # Waiting for optimizer to stabilize, if it disagrees with scheduling (the correct behavior + # would be for original scheduling decisions to always match optimizer's preference) + # (workaround for https://github.com/neondatabase/neon/issues/8969) + env.storage_controller.reconcile_until_idle(max_interval=0.1, timeout_secs=120) + shard_counts = get_consistent_node_shard_counts(env, total_shards) log.info(f"Shard counts after filling node {ps.id}: {shard_counts}") assert_consistent_balanced_attachments(env, total_shards) - env.storage_controller.reconcile_until_idle() + env.storage_controller.reconcile_until_idle(max_interval=0.1, timeout_secs=120) env.storage_controller.consistency_check() # Consistency check is safe here: restarting pageservers should not have caused any Reconcilers to spawn, From ec4cc30de9bc1140761a7f8b7e4a5886c4d3b4c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Tue, 15 Oct 2024 11:46:51 +0200 Subject: [PATCH 25/48] Shut down timelines during offload and add offload tests (#9289) Add a test for timeline offloading, and subsequent unoffloading. 
Also adds a manual endpoint, and issues a proper timeline shutdown during offloading which prevents a pageserver hang at shutdown. Part of #8088. --- pageserver/src/http/routes.rs | 49 ++++++++++++ pageserver/src/tenant.rs | 29 +++++++ pageserver/src/tenant/timeline/offload.rs | 3 + test_runner/fixtures/pageserver/http.py | 16 ++++ test_runner/regress/test_timeline_archive.py | 84 ++++++++++++++++++++ 5 files changed, 181 insertions(+) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 1079d8df29..dd403c1cef 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -77,6 +77,7 @@ use crate::tenant::secondary::SecondaryController; use crate::tenant::size::ModelInputs; use crate::tenant::storage_layer::LayerAccessStatsReset; use crate::tenant::storage_layer::LayerName; +use crate::tenant::timeline::offload::offload_timeline; use crate::tenant::timeline::CompactFlags; use crate::tenant::timeline::CompactionError; use crate::tenant::timeline::Timeline; @@ -325,6 +326,7 @@ impl From for ApiError { match value { NotFound => ApiError::NotFound(anyhow::anyhow!("timeline not found").into()), Timeout => ApiError::Timeout("hit pageserver internal timeout".into()), + Cancelled => ApiError::ShuttingDown, e @ HasArchivedParent(_) => { ApiError::PreconditionFailed(e.to_string().into_boxed_str()) } @@ -1785,6 +1787,49 @@ async fn timeline_compact_handler( .await } +// Run offload immediately on given timeline. 
+async fn timeline_offload_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; + + let state = get_state(&request); + + async { + let tenant = state + .tenant_manager + .get_attached_tenant_shard(tenant_shard_id)?; + + if tenant.get_offloaded_timeline(timeline_id).is_ok() { + return json_response(StatusCode::OK, ()); + } + let timeline = + active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) + .await?; + + if !tenant.timeline_has_no_attached_children(timeline_id) { + return Err(ApiError::PreconditionFailed( + "timeline has attached children".into(), + )); + } + if !timeline.can_offload() { + return Err(ApiError::PreconditionFailed( + "Timeline::can_offload() returned false".into(), + )); + } + offload_timeline(&tenant, &timeline) + .await + .map_err(ApiError::InternalServerError)?; + + json_response(StatusCode::OK, ()) + } + .instrument(info_span!("manual_timeline_offload", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id)) + .await +} + // Run checkpoint immediately on given timeline. 
async fn timeline_checkpoint_handler( request: Request, @@ -3008,6 +3053,10 @@ pub fn make_router( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/compact", |r| api_handler(r, timeline_compact_handler), ) + .put( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/offload", + |r| testing_api_handler("attempt timeline offload", r, timeline_offload_handler), + ) .put( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/checkpoint", |r| testing_api_handler("run timeline checkpoint", r, timeline_checkpoint_handler), diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 397778d4c8..44d1bb74ca 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -619,6 +619,9 @@ pub enum TimelineArchivalError { #[error("Timeout")] Timeout, + #[error("Cancelled")] + Cancelled, + #[error("ancestor is archived: {}", .0)] HasArchivedParent(TimelineId), @@ -637,6 +640,7 @@ impl Debug for TimelineArchivalError { match self { Self::NotFound => write!(f, "NotFound"), Self::Timeout => write!(f, "Timeout"), + Self::Cancelled => write!(f, "Cancelled"), Self::HasArchivedParent(p) => f.debug_tuple("HasArchivedParent").field(p).finish(), Self::HasUnarchivedChildren(c) => { f.debug_tuple("HasUnarchivedChildren").field(c).finish() @@ -1552,6 +1556,7 @@ impl Tenant { timeline_id: TimelineId, ctx: RequestContext, ) -> Result, TimelineArchivalError> { + info!("unoffloading timeline"); let cancel = self.cancel.clone(); let timeline_preload = self .load_timeline_metadata(timeline_id, self.remote_storage.clone(), cancel) @@ -1566,6 +1571,7 @@ impl Tenant { error!(%timeline_id, "index_part not found on remote"); return Err(TimelineArchivalError::NotFound); } + Err(DownloadError::Cancelled) => return Err(TimelineArchivalError::Cancelled), Err(e) => { // Some (possibly ephemeral) error happened during index_part download. warn!(%timeline_id, "Failed to load index_part from remote storage, failed creation? 
({e})"); @@ -1603,6 +1609,7 @@ impl Tenant { if offloaded_timelines.remove(&timeline_id).is_none() { warn!("timeline already removed from offloaded timelines"); } + info!("timeline unoffloading complete"); Ok(Arc::clone(timeline)) } else { warn!("timeline not available directly after attach"); @@ -1683,6 +1690,21 @@ impl Tenant { Ok(()) } + pub fn get_offloaded_timeline( + &self, + timeline_id: TimelineId, + ) -> Result, GetTimelineError> { + self.timelines_offloaded + .lock() + .unwrap() + .get(&timeline_id) + .map(Arc::clone) + .ok_or(GetTimelineError::NotFound { + tenant_id: self.tenant_shard_id, + timeline_id, + }) + } + pub(crate) fn tenant_shard_id(&self) -> TenantShardId { self.tenant_shard_id } @@ -2218,6 +2240,13 @@ impl Tenant { } } + pub fn timeline_has_no_attached_children(&self, timeline_id: TimelineId) -> bool { + let timelines = self.timelines.lock().unwrap(); + !timelines + .iter() + .any(|(_id, tl)| tl.get_ancestor_timeline_id() == Some(timeline_id)) + } + pub fn current_state(&self) -> TenantState { self.state.borrow().clone() } diff --git a/pageserver/src/tenant/timeline/offload.rs b/pageserver/src/tenant/timeline/offload.rs index fb906d906b..7e6084baaf 100644 --- a/pageserver/src/tenant/timeline/offload.rs +++ b/pageserver/src/tenant/timeline/offload.rs @@ -19,6 +19,9 @@ pub(crate) async fn offload_timeline( return Ok(()); }; + // Now that the Timeline is in Stopping state, request all the related tasks to shut down. 
+ timeline.shutdown(super::ShutdownMode::Hard).await; + // TODO extend guard mechanism above with method // to make deletions possible while offloading is in progress diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index aa4435af4e..18d65cb7de 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -583,6 +583,22 @@ class PageserverHttpClient(requests.Session, MetricsGetter): log.info(f"Got GC request response code: {res.status_code}") self.verbose_error(res) + def timeline_offload( + self, + tenant_id: Union[TenantId, TenantShardId], + timeline_id: TimelineId, + ): + self.is_testing_enabled_or_skip() + + log.info(f"Requesting offload: tenant {tenant_id}, timeline {timeline_id}") + res = self.put( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/offload", + ) + log.info(f"Got offload request response code: {res.status_code}") + self.verbose_error(res) + res_json = res.json() + assert res_json is None + def timeline_compact( self, tenant_id: Union[TenantId, TenantShardId], diff --git a/test_runner/regress/test_timeline_archive.py b/test_runner/regress/test_timeline_archive.py index 841707d32e..971cc57a1c 100644 --- a/test_runner/regress/test_timeline_archive.py +++ b/test_runner/regress/test_timeline_archive.py @@ -6,6 +6,7 @@ from fixtures.neon_fixtures import ( NeonEnvBuilder, ) from fixtures.pageserver.http import PageserverApiException +from fixtures.utils import wait_until @pytest.mark.parametrize("shard_count", [0, 4]) @@ -114,3 +115,86 @@ def test_timeline_archive(neon_env_builder: NeonEnvBuilder, shard_count: int): leaf_timeline_id, state=TimelineArchivalState.UNARCHIVED, ) + + +@pytest.mark.parametrize("manual_offload", [False, True]) +def test_timeline_offloading(neon_env_builder: NeonEnvBuilder, manual_offload: bool): + env = neon_env_builder.init_start() + ps_http = env.pageserver.http_client() + + # Turn off gc and compaction loops: 
we want to issue them manually for better reliability + tenant_id, initial_timeline_id = env.create_tenant( + conf={ + "gc_period": "0s", + "compaction_period": "0s" if manual_offload else "1s", + } + ) + + # Create two branches and archive them + parent_timeline_id = env.create_branch("test_ancestor_branch_archive_parent", tenant_id) + leaf_timeline_id = env.create_branch( + "test_ancestor_branch_archive_branch1", tenant_id, "test_ancestor_branch_archive_parent" + ) + + ps_http.timeline_archival_config( + tenant_id, + leaf_timeline_id, + state=TimelineArchivalState.ARCHIVED, + ) + leaf_detail = ps_http.timeline_detail( + tenant_id, + leaf_timeline_id, + ) + assert leaf_detail["is_archived"] is True + + ps_http.timeline_archival_config( + tenant_id, + parent_timeline_id, + state=TimelineArchivalState.ARCHIVED, + ) + + def timeline_offloaded(timeline_id: TimelineId) -> bool: + return ( + env.pageserver.log_contains(f".*{timeline_id}.* offloading archived timeline.*") + is not None + ) + + if manual_offload: + with pytest.raises( + PageserverApiException, + match="timeline has attached children", + ): + # This only tests the (made for testing only) http handler, + # but still demonstrates the constraints we have. 
+ ps_http.timeline_offload(tenant_id=tenant_id, timeline_id=parent_timeline_id) + + def parent_offloaded(): + if manual_offload: + ps_http.timeline_offload(tenant_id=tenant_id, timeline_id=parent_timeline_id) + assert timeline_offloaded(parent_timeline_id) + + def leaf_offloaded(): + if manual_offload: + ps_http.timeline_offload(tenant_id=tenant_id, timeline_id=leaf_timeline_id) + assert timeline_offloaded(leaf_timeline_id) + + wait_until(30, 1, leaf_offloaded) + wait_until(30, 1, parent_offloaded) + + ps_http.timeline_archival_config( + tenant_id, + parent_timeline_id, + state=TimelineArchivalState.UNARCHIVED, + ) + ps_http.timeline_archival_config( + tenant_id, + leaf_timeline_id, + state=TimelineArchivalState.UNARCHIVED, + ) + leaf_detail = ps_http.timeline_detail( + tenant_id, + leaf_timeline_id, + ) + assert leaf_detail["is_archived"] is False + + assert not timeline_offloaded(initial_timeline_id) From d92d36a315f955cd39bc6f6b0948bae25ed195ad Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Tue, 15 Oct 2024 13:13:57 +0100 Subject: [PATCH 26/48] [local_proxy] update api for pg_session_jwt (#9359) pg_session_jwt now: 1. Sets the JWK in a PGC_BACKEND session GUC, no longer in the init() function. 2. JWK no longer needs the kid. 
--- Cargo.lock | 7 +- Cargo.toml | 1 + compute/Dockerfile.compute-node | 4 +- proxy/Cargo.toml | 3 +- proxy/src/serverless/backend.rs | 49 ++++---- proxy/src/serverless/local_conn_pool.rs | 143 ++++++++++++++++-------- workspace_hack/Cargo.toml | 6 +- 7 files changed, 139 insertions(+), 74 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5edf5cf7b4..7e772814ec 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2695,6 +2695,7 @@ checksum = "ad227c3af19d4914570ad36d30409928b75967c298feb9ea1969db3a610bb14e" dependencies = [ "equivalent", "hashbrown 0.14.5", + "serde", ] [[package]] @@ -2794,9 +2795,9 @@ dependencies = [ [[package]] name = "itoa" -version = "1.0.6" +version = "1.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "453ad9f582a441959e5f0d088b02ce04cfe8d51a8eaf077f12ac6d3e94164ca6" +checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b" [[package]] name = "jobserver" @@ -4296,6 +4297,7 @@ dependencies = [ "indexmap 2.0.1", "ipnet", "itertools 0.10.5", + "itoa", "jose-jwa", "jose-jwk", "lasso", @@ -7307,6 +7309,7 @@ dependencies = [ "hyper 1.4.1", "hyper-util", "indexmap 1.9.3", + "indexmap 2.0.1", "itertools 0.12.1", "lazy_static", "libc", diff --git a/Cargo.toml b/Cargo.toml index dde80f5020..a1a974b33b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -107,6 +107,7 @@ indexmap = "2" indoc = "2" ipnet = "2.9.0" itertools = "0.10" +itoa = "1.0.11" jsonwebtoken = "9" lasso = "0.7" libc = "0.2" diff --git a/compute/Dockerfile.compute-node b/compute/Dockerfile.compute-node index 91528618da..412c64eda4 100644 --- a/compute/Dockerfile.compute-node +++ b/compute/Dockerfile.compute-node @@ -929,8 +929,8 @@ ARG PG_VERSION RUN case "${PG_VERSION}" in "v17") \ echo "pg_session_jwt does not yet have a release that supports pg17" && exit 0;; \ esac && \ - wget https://github.com/neondatabase/pg_session_jwt/archive/ff0a72440e8ff584dab24b3f9b7c00c56c660b8e.tar.gz -O pg_session_jwt.tar.gz && \ - echo 
"1fbb2b5a339263bcf6daa847fad8bccbc0b451cea6a62e6d3bf232b0087f05cb pg_session_jwt.tar.gz" | sha256sum --check && \ + wget https://github.com/neondatabase/pg_session_jwt/archive/5aee2625af38213650e1a07ae038fdc427250ee4.tar.gz -O pg_session_jwt.tar.gz && \ + echo "5d91b10bc1347d36cffc456cb87bec25047935d6503dc652ca046f04760828e7 pg_session_jwt.tar.gz" | sha256sum --check && \ mkdir pg_session_jwt-src && cd pg_session_jwt-src && tar xzf ../pg_session_jwt.tar.gz --strip-components=1 -C . && \ sed -i 's/pgrx = "=0.11.3"/pgrx = { version = "=0.11.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ cargo pgrx install --release diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 963fb94a7d..e25d2fcbab 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -42,9 +42,10 @@ hyper0.workspace = true hyper = { workspace = true, features = ["server", "http1", "http2"] } hyper-util = { version = "0.1", features = ["server", "http1", "http2", "tokio"] } http-body-util = { version = "0.1" } -indexmap.workspace = true +indexmap = { workspace = true, features = ["serde"] } ipnet.workspace = true itertools.workspace = true +itoa.workspace = true lasso = { workspace = true, features = ["multi-threaded"] } measured = { workspace = true, features = ["lasso"] } metrics.workspace = true diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 2b060af9e1..927854897f 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -2,8 +2,9 @@ use std::{io, sync::Arc, time::Duration}; use async_trait::async_trait; use hyper_util::rt::{TokioExecutor, TokioIo, TokioTimer}; +use p256::{ecdsa::SigningKey, elliptic_curve::JwkEcKey}; +use rand::rngs::OsRng; use tokio::net::{lookup_host, TcpStream}; -use tokio_postgres::types::ToSql; use tracing::{debug, field::display, info}; use crate::{ @@ -267,50 +268,58 @@ impl PoolingBackend { auth::Backend::Local(local) => local.node_info.clone(), }; + let (key, jwk) = create_random_jwk(); + let config 
= node_info .config .user(&conn_info.user_info.user) - .dbname(&conn_info.dbname); + .dbname(&conn_info.dbname) + .options(&format!( + "-c pg_session_jwt.jwk={}", + serde_json::to_string(&jwk).expect("serializing jwk to json should not fail") + )); let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); let (client, connection) = config.connect(tokio_postgres::NoTls).await?; drop(pause); - tracing::Span::current().record("pid", tracing::field::display(client.get_process_id())); + let pid = client.get_process_id(); + tracing::Span::current().record("pid", pid); - let handle = local_conn_pool::poll_client( + let mut handle = local_conn_pool::poll_client( self.local_pool.clone(), ctx, conn_info, client, connection, + key, conn_id, node_info.aux.clone(), ); - let kid = handle.get_client().get_process_id() as i64; - let jwk = p256::PublicKey::from(handle.key().verifying_key()).to_jwk(); + { + let (client, mut discard) = handle.inner(); + debug!("setting up backend session state"); - debug!(kid, ?jwk, "setting up backend session state"); + // initiates the auth session + if let Err(e) = client.query("select auth.init()", &[]).await { + discard.discard(); + return Err(e.into()); + } - // initiates the auth session - handle - .get_client() - .query( - "select auth.init($1, $2);", - &[ - &kid as &(dyn ToSql + Sync), - &tokio_postgres::types::Json(jwk), - ], - ) - .await?; - - info!(?kid, "backend session state init"); + info!("backend session state initialized"); + } Ok(handle) } } +fn create_random_jwk() -> (SigningKey, JwkEcKey) { + let key = SigningKey::random(&mut OsRng); + let jwk = p256::PublicKey::from(key.verifying_key()).to_jwk(); + (key, jwk) +} + #[derive(Debug, thiserror::Error)] pub(crate) enum HttpConnError { #[error("pooled connection closed at inconsistent state")] diff --git a/proxy/src/serverless/local_conn_pool.rs b/proxy/src/serverless/local_conn_pool.rs index 1dde5952e1..4ab14ad35f 100644 --- a/proxy/src/serverless/local_conn_pool.rs +++ 
b/proxy/src/serverless/local_conn_pool.rs @@ -1,9 +1,9 @@ use futures::{future::poll_fn, Future}; +use indexmap::IndexMap; use jose_jwk::jose_b64::base64ct::{Base64UrlUnpadded, Encoding}; use p256::ecdsa::{Signature, SigningKey}; use parking_lot::RwLock; -use rand::rngs::OsRng; -use serde_json::Value; +use serde_json::value::RawValue; use signature::Signer; use std::task::{ready, Poll}; use std::{collections::HashMap, pin::pin, sync::Arc, sync::Weak, time::Duration}; @@ -12,14 +12,13 @@ use tokio_postgres::tls::NoTlsStream; use tokio_postgres::types::ToSql; use tokio_postgres::{AsyncMessage, ReadyForQueryStatus, Socket}; use tokio_util::sync::CancellationToken; -use typed_json::json; use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo}; use crate::metrics::Metrics; use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS}; use crate::{context::RequestMonitoring, DbName, RoleName}; -use tracing::{debug, error, warn, Span}; +use tracing::{error, warn, Span}; use tracing::{info, info_span, Instrument}; use super::backend::HttpConnError; @@ -245,12 +244,14 @@ impl LocalConnPool { } } +#[allow(clippy::too_many_arguments)] pub(crate) fn poll_client( global_pool: Arc>, ctx: &RequestMonitoring, conn_info: ConnInfo, client: tokio_postgres::Client, mut connection: tokio_postgres::Connection, + key: SigningKey, conn_id: uuid::Uuid, aux: MetricsAuxInfo, ) -> LocalClient { @@ -346,8 +347,6 @@ pub(crate) fn poll_client( } .instrument(span)); - let key = SigningKey::random(&mut OsRng); - let inner = ClientInner { inner: client, session: tx, @@ -430,13 +429,6 @@ impl LocalClient { let inner = inner.as_mut().expect("client inner should not be removed"); (&mut inner.inner, Discard { conn_info, pool }) } - pub(crate) fn key(&self) -> &SigningKey { - let inner = &self - .inner - .as_ref() - .expect("client inner should not be removed"); - &inner.key - } } impl LocalClient { @@ -445,25 +437,9 @@ impl LocalClient { .inner .as_mut() .expect("client inner should not be 
removed"); + inner.jti += 1; - - let kid = inner.inner.get_process_id(); - let header = json!({"kid":kid}).to_string(); - - let mut payload = serde_json::from_slice::>(payload) - .map_err(HttpConnError::JwtPayloadError)?; - payload.insert("jti".to_string(), Value::Number(inner.jti.into())); - let payload = Value::Object(payload).to_string(); - - debug!( - kid, - jti = inner.jti, - ?header, - ?payload, - "signing new ephemeral JWT" - ); - - let token = sign_jwt(&inner.key, header, payload); + let token = resign_jwt(&inner.key, payload, inner.jti)?; // initiates the auth session inner.inner.simple_query("discard all").await?; @@ -475,20 +451,74 @@ impl LocalClient { ) .await?; - info!(kid, jti = inner.jti, "user session state init"); + let pid = inner.inner.get_process_id(); + info!(pid, jti = inner.jti, "user session state init"); Ok(()) } } -fn sign_jwt(sk: &SigningKey, header: String, payload: String) -> String { - let header = Base64UrlUnpadded::encode_string(header.as_bytes()); - let payload = Base64UrlUnpadded::encode_string(payload.as_bytes()); +/// implements relatively efficient in-place json object key upserting +/// +/// only supports top-level keys +fn upsert_json_object( + payload: &[u8], + key: &str, + value: &RawValue, +) -> Result { + let mut payload = serde_json::from_slice::>(payload)?; + payload.insert(key, value); + serde_json::to_string(&payload) +} - let message = format!("{header}.{payload}"); - let sig: Signature = sk.sign(message.as_bytes()); - let base64_sig = Base64UrlUnpadded::encode_string(&sig.to_bytes()); - format!("{message}.{base64_sig}") +fn resign_jwt(sk: &SigningKey, payload: &[u8], jti: u64) -> Result { + let mut buffer = itoa::Buffer::new(); + + // encode the jti integer to a json rawvalue + let jti = serde_json::from_str::<&RawValue>(buffer.format(jti)).unwrap(); + + // update the jti in-place + let payload = + upsert_json_object(payload, "jti", jti).map_err(HttpConnError::JwtPayloadError)?; + + // sign the jwt + let token = 
sign_jwt(sk, payload.as_bytes()); + + Ok(token) +} + +fn sign_jwt(sk: &SigningKey, payload: &[u8]) -> String { + let header_len = 20; + let payload_len = Base64UrlUnpadded::encoded_len(payload); + let signature_len = Base64UrlUnpadded::encoded_len(&[0; 64]); + let total_len = header_len + payload_len + signature_len + 2; + + let mut jwt = String::with_capacity(total_len); + let cap = jwt.capacity(); + + // we only need an empty header with the alg specified. + // base64url(r#"{"alg":"ES256"}"#) == "eyJhbGciOiJFUzI1NiJ9" + jwt.push_str("eyJhbGciOiJFUzI1NiJ9."); + + // encode the jwt payload in-place + base64::encode_config_buf(payload, base64::URL_SAFE_NO_PAD, &mut jwt); + + // create the signature from the encoded header || payload + let sig: Signature = sk.sign(jwt.as_bytes()); + + jwt.push('.'); + + // encode the jwt signature in-place + base64::encode_config_buf(sig.to_bytes(), base64::URL_SAFE_NO_PAD, &mut jwt); + + debug_assert_eq!( + jwt.len(), + total_len, + "the jwt len should match our expected len" + ); + debug_assert_eq!(jwt.capacity(), cap, "the jwt capacity should not change"); + + jwt } impl Discard<'_, C> { @@ -509,14 +539,6 @@ impl Discard<'_, C> { } impl LocalClient { - pub fn get_client(&self) -> &C { - &self - .inner - .as_ref() - .expect("client inner should not be removed") - .inner - } - fn do_drop(&mut self) -> Option { let conn_info = self.conn_info.clone(); let client = self @@ -542,3 +564,30 @@ impl Drop for LocalClient { } } } + +#[cfg(test)] +mod tests { + use p256::ecdsa::SigningKey; + use typed_json::json; + + use super::resign_jwt; + + #[test] + fn jwt_token_snapshot() { + let key = SigningKey::from_bytes(&[1; 32].into()).unwrap(); + let data = + json!({"foo":"bar","jti":"foo\nbar","nested":{"jti":"tricky nesting"}}).to_string(); + + let jwt = resign_jwt(&key, data.as_bytes(), 2).unwrap(); + + // To validate the JWT, copy the JWT string and paste it into https://jwt.io/. 
+ // In the public-key box, paste the following jwk public key + // `{"kty":"EC","crv":"P-256","x":"b_A7lJJBzh2t1DUZ5pYOCoW0GmmgXDKBA6orzhWUyhY","y":"PE91OlW_AdxT9sCwx-7ni0DG_30lqW4igrmJzvccFEo"}` + + // let pub_key = p256::ecdsa::VerifyingKey::from(&key); + // let pub_key = p256::PublicKey::from(pub_key); + // println!("{}", pub_key.to_jwk_string()); + + assert_eq!(jwt, "eyJhbGciOiJFUzI1NiJ9.eyJmb28iOiJiYXIiLCJqdGkiOjIsIm5lc3RlZCI6eyJqdGkiOiJ0cmlja3kgbmVzdGluZyJ9fQ.pYf0LxoJ8sDgpmsYOgrbNecOSipnPBEGwnZzB-JhW2cONrKlqRsgXwK8_cOsyolGy-hTTe8GXbWTl_UdpF5RyA"); + } +} diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 0a90b6b6f7..1347d6ddff 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -46,7 +46,8 @@ hmac = { version = "0.12", default-features = false, features = ["reset"] } hyper-582f2526e08bb6a0 = { package = "hyper", version = "0.14", features = ["full"] } hyper-dff4ba8e3ae991db = { package = "hyper", version = "1", features = ["full"] } hyper-util = { version = "0.1", features = ["client-legacy", "server-auto", "service"] } -indexmap = { version = "1", default-features = false, features = ["std"] } +indexmap-dff4ba8e3ae991db = { package = "indexmap", version = "1", default-features = false, features = ["std"] } +indexmap-f595c2ba2a3f28df = { package = "indexmap", version = "2", features = ["serde"] } itertools = { version = "0.12" } lazy_static = { version = "1", default-features = false, features = ["spin_no_std"] } libc = { version = "0.2", features = ["extra_traits", "use_std"] } @@ -101,7 +102,8 @@ either = { version = "1" } getrandom = { version = "0.2", default-features = false, features = ["std"] } half = { version = "2", default-features = false, features = ["num-traits"] } hashbrown = { version = "0.14", features = ["raw"] } -indexmap = { version = "1", default-features = false, features = ["std"] } +indexmap-dff4ba8e3ae991db = { package = "indexmap", version = "1", default-features = false, features = ["std"] 
} +indexmap-f595c2ba2a3f28df = { package = "indexmap", version = "2", features = ["serde"] } itertools = { version = "0.12" } libc = { version = "0.2", features = ["extra_traits", "use_std"] } log = { version = "0.4", default-features = false, features = ["std"] } From fb74c21e8cae23831b7728232772315297463e63 Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Tue, 15 Oct 2024 15:24:56 +0200 Subject: [PATCH 27/48] proxy: Migrate jwt module away from anyhow (#9361) --- proxy/src/auth/backend/jwt.rs | 188 +++++++++++++++++------ proxy/src/auth/backend/local.rs | 6 +- proxy/src/auth/backend/mod.rs | 3 +- proxy/src/control_plane/provider/mock.rs | 10 +- proxy/src/control_plane/provider/mod.rs | 43 +++++- proxy/src/control_plane/provider/neon.rs | 27 ++-- proxy/src/proxy/tests/mod.rs | 42 ++--- proxy/src/proxy/wake_compute.rs | 2 +- 8 files changed, 228 insertions(+), 93 deletions(-) diff --git a/proxy/src/auth/backend/jwt.rs b/proxy/src/auth/backend/jwt.rs index 17ab7eda22..402e59fdb3 100644 --- a/proxy/src/auth/backend/jwt.rs +++ b/proxy/src/auth/backend/jwt.rs @@ -4,21 +4,20 @@ use std::{ time::{Duration, SystemTime}, }; -use anyhow::{bail, ensure, Context}; use arc_swap::ArcSwapOption; use dashmap::DashMap; use jose_jwk::crypto::KeyInfo; use serde::{de::Visitor, Deserialize, Deserializer}; use signature::Verifier; +use thiserror::Error; use tokio::time::Instant; use crate::{ - context::RequestMonitoring, http::parse_json_body_with_limit, intern::RoleNameInt, EndpointId, - RoleName, + auth::backend::ComputeCredentialKeys, context::RequestMonitoring, + control_plane::errors::GetEndpointJwksError, http::parse_json_body_with_limit, + intern::RoleNameInt, EndpointId, RoleName, }; -use super::ComputeCredentialKeys; - // TODO(conrad): make these configurable. 
const CLOCK_SKEW_LEEWAY: Duration = Duration::from_secs(30); const MIN_RENEW: Duration = Duration::from_secs(30); @@ -32,7 +31,16 @@ pub(crate) trait FetchAuthRules: Clone + Send + Sync + 'static { &self, ctx: &RequestMonitoring, endpoint: EndpointId, - ) -> impl Future>> + Send; + ) -> impl Future, FetchAuthRulesError>> + Send; +} + +#[derive(Error, Debug)] +pub(crate) enum FetchAuthRulesError { + #[error(transparent)] + GetEndpointJwks(#[from] GetEndpointJwksError), + + #[error("JWKs settings for this role were not configured")] + RoleJwksNotConfigured, } pub(crate) struct AuthRule { @@ -122,7 +130,7 @@ impl JwkCacheEntryLock { client: &reqwest::Client, endpoint: EndpointId, auth_rules: &F, - ) -> anyhow::Result> { + ) -> Result, JwtError> { // double check that no one beat us to updating the cache. let now = Instant::now(); let guard = self.cached.load_full(); @@ -188,7 +196,7 @@ impl JwkCacheEntryLock { client: &reqwest::Client, endpoint: EndpointId, fetch: &F, - ) -> Result, anyhow::Error> { + ) -> Result, JwtError> { let now = Instant::now(); let guard = self.cached.load_full(); @@ -243,27 +251,24 @@ impl JwkCacheEntryLock { endpoint: EndpointId, role_name: &RoleName, fetch: &F, - ) -> Result { + ) -> Result { // JWT compact form is defined to be // || . || || . || // where Signature = alg( || . 
|| ); let (header_payload, signature) = jwt .rsplit_once('.') - .context("Provided authentication token is not a valid JWT encoding")?; + .ok_or(JwtEncodingError::InvalidCompactForm)?; let (header, payload) = header_payload .split_once('.') - .context("Provided authentication token is not a valid JWT encoding")?; + .ok_or(JwtEncodingError::InvalidCompactForm)?; - let header = base64::decode_config(header, base64::URL_SAFE_NO_PAD) - .context("Provided authentication token is not a valid JWT encoding")?; - let header = serde_json::from_slice::>(&header) - .context("Provided authentication token is not a valid JWT encoding")?; + let header = base64::decode_config(header, base64::URL_SAFE_NO_PAD)?; + let header = serde_json::from_slice::>(&header)?; - let sig = base64::decode_config(signature, base64::URL_SAFE_NO_PAD) - .context("Provided authentication token is not a valid JWT encoding")?; + let sig = base64::decode_config(signature, base64::URL_SAFE_NO_PAD)?; - let kid = header.key_id.context("missing key id")?; + let kid = header.key_id.ok_or(JwtError::MissingKeyId)?; let mut guard = self .get_or_update_jwk_cache(ctx, client, endpoint.clone(), fetch) @@ -281,16 +286,13 @@ impl JwkCacheEntryLock { .renew_jwks(permit, ctx, client, endpoint.clone(), fetch) .await?; } - _ => { - bail!("jwk not found"); - } + _ => return Err(JwtError::JwkNotFound), } }; - ensure!( - jwk.is_supported(&header.algorithm), - "signature algorithm not supported" - ); + if !jwk.is_supported(&header.algorithm) { + return Err(JwtError::SignatureAlgorithmNotSupported); + } match &jwk.key { jose_jwk::Key::Ec(key) => { @@ -299,34 +301,32 @@ impl JwkCacheEntryLock { jose_jwk::Key::Rsa(key) => { verify_rsa_signature(header_payload.as_bytes(), &sig, key, &header.algorithm)?; } - key => bail!("unsupported key type {key:?}"), + key => return Err(JwtError::UnsupportedKeyType(key.into())), }; - let payloadb = base64::decode_config(payload, base64::URL_SAFE_NO_PAD) - .context("Provided authentication token 
is not a valid JWT encoding")?; - let payload = serde_json::from_slice::>(&payloadb) - .context("Provided authentication token is not a valid JWT encoding")?; + let payloadb = base64::decode_config(payload, base64::URL_SAFE_NO_PAD)?; + let payload = serde_json::from_slice::>(&payloadb)?; tracing::debug!(?payload, "JWT signature valid with claims"); if let Some(aud) = expected_audience { - ensure!( - payload.audience.0.iter().any(|s| s == aud), - "invalid JWT token audience" - ); + if payload.audience.0.iter().all(|s| s != aud) { + return Err(JwtError::InvalidJwtTokenAudience); + } } let now = SystemTime::now(); if let Some(exp) = payload.expiration { - ensure!(now < exp + CLOCK_SKEW_LEEWAY, "JWT token has expired"); + if now >= exp + CLOCK_SKEW_LEEWAY { + return Err(JwtError::JwtTokenHasExpired); + } } if let Some(nbf) = payload.not_before { - ensure!( - nbf < now + CLOCK_SKEW_LEEWAY, - "JWT token is not yet ready to use" - ); + if nbf >= now + CLOCK_SKEW_LEEWAY { + return Err(JwtError::JwtTokenNotYetReadyToUse); + } } Ok(ComputeCredentialKeys::JwtPayload(payloadb)) @@ -341,7 +341,7 @@ impl JwkCache { role_name: &RoleName, fetch: &F, jwt: &str, - ) -> Result { + ) -> Result { // try with just a read lock first let key = (endpoint.clone(), role_name.clone()); let entry = self.map.get(&key).as_deref().map(Arc::clone); @@ -357,19 +357,18 @@ impl JwkCache { } } -fn verify_ec_signature(data: &[u8], sig: &[u8], key: &jose_jwk::Ec) -> anyhow::Result<()> { +fn verify_ec_signature(data: &[u8], sig: &[u8], key: &jose_jwk::Ec) -> Result<(), JwtError> { use ecdsa::Signature; use signature::Verifier; match key.crv { jose_jwk::EcCurves::P256 => { - let pk = - p256::PublicKey::try_from(key).map_err(|_| anyhow::anyhow!("invalid P256 key"))?; + let pk = p256::PublicKey::try_from(key).map_err(JwtError::InvalidP256Key)?; let key = p256::ecdsa::VerifyingKey::from(&pk); let sig = Signature::from_slice(sig)?; key.verify(data, &sig)?; } - key => bail!("unsupported ec key type {key:?}"), 
+ key => return Err(JwtError::UnsupportedEcKeyType(key)), } Ok(()) @@ -380,14 +379,14 @@ fn verify_rsa_signature( sig: &[u8], key: &jose_jwk::Rsa, alg: &jose_jwa::Algorithm, -) -> anyhow::Result<()> { +) -> Result<(), JwtError> { use jose_jwa::{Algorithm, Signing}; use rsa::{ pkcs1v15::{Signature, VerifyingKey}, RsaPublicKey, }; - let key = RsaPublicKey::try_from(key).map_err(|_| anyhow::anyhow!("invalid RSA key"))?; + let key = RsaPublicKey::try_from(key).map_err(JwtError::InvalidRsaKey)?; match alg { Algorithm::Signing(Signing::Rs256) => { @@ -395,7 +394,7 @@ fn verify_rsa_signature( let sig = Signature::try_from(sig)?; key.verify(data, &sig)?; } - _ => bail!("invalid RSA signing algorithm"), + _ => return Err(JwtError::InvalidRsaSigningAlgorithm), }; Ok(()) @@ -561,6 +560,99 @@ impl Drop for JwkRenewalPermit<'_> { } } +#[derive(Error, Debug)] +#[non_exhaustive] +pub(crate) enum JwtError { + #[error("jwk not found")] + JwkNotFound, + + #[error("missing key id")] + MissingKeyId, + + #[error("Provided authentication token is not a valid JWT encoding")] + JwtEncoding(#[from] JwtEncodingError), + + #[error("invalid JWT token audience")] + InvalidJwtTokenAudience, + + #[error("JWT token has expired")] + JwtTokenHasExpired, + + #[error("JWT token is not yet ready to use")] + JwtTokenNotYetReadyToUse, + + #[error("invalid P256 key")] + InvalidP256Key(jose_jwk::crypto::Error), + + #[error("invalid RSA key")] + InvalidRsaKey(jose_jwk::crypto::Error), + + #[error("invalid RSA signing algorithm")] + InvalidRsaSigningAlgorithm, + + #[error("unsupported EC key type {0:?}")] + UnsupportedEcKeyType(jose_jwk::EcCurves), + + #[error("unsupported key type {0:?}")] + UnsupportedKeyType(KeyType), + + #[error("signature algorithm not supported")] + SignatureAlgorithmNotSupported, + + #[error("signature error: {0}")] + Signature(#[from] signature::Error), + + #[error("failed to fetch auth rules: {0}")] + FetchAuthRules(#[from] FetchAuthRulesError), +} + +impl From for JwtError { + fn 
from(err: base64::DecodeError) -> Self { + JwtEncodingError::Base64Decode(err).into() + } +} + +impl From for JwtError { + fn from(err: serde_json::Error) -> Self { + JwtEncodingError::SerdeJson(err).into() + } +} + +#[derive(Error, Debug)] +#[non_exhaustive] +pub enum JwtEncodingError { + #[error(transparent)] + Base64Decode(#[from] base64::DecodeError), + + #[error(transparent)] + SerdeJson(#[from] serde_json::Error), + + #[error("invalid compact form")] + InvalidCompactForm, +} + +#[allow(dead_code, reason = "Debug use only")] +#[derive(Debug)] +pub(crate) enum KeyType { + Ec(jose_jwk::EcCurves), + Rsa, + Oct, + Okp(jose_jwk::OkpCurves), + Unknown, +} + +impl From<&jose_jwk::Key> for KeyType { + fn from(key: &jose_jwk::Key) -> Self { + match key { + jose_jwk::Key::Ec(ec) => Self::Ec(ec.crv), + jose_jwk::Key::Rsa(_rsa) => Self::Rsa, + jose_jwk::Key::Oct(_oct) => Self::Oct, + jose_jwk::Key::Okp(okp) => Self::Okp(okp.crv), + _ => Self::Unknown, + } + } +} + #[cfg(test)] mod tests { use crate::RoleName; @@ -758,7 +850,7 @@ X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL &self, _ctx: &RequestMonitoring, _endpoint: EndpointId, - ) -> anyhow::Result> { + ) -> Result, FetchAuthRulesError> { Ok(vec![ AuthRule { id: "foo".to_owned(), diff --git a/proxy/src/auth/backend/local.rs b/proxy/src/auth/backend/local.rs index 12451847b1..1dea4d2d73 100644 --- a/proxy/src/auth/backend/local.rs +++ b/proxy/src/auth/backend/local.rs @@ -1,9 +1,9 @@ use std::net::SocketAddr; -use anyhow::Context; use arc_swap::ArcSwapOption; use crate::{ + auth::backend::jwt::FetchAuthRulesError, compute::ConnCfg, context::RequestMonitoring, control_plane::{ @@ -53,11 +53,11 @@ impl FetchAuthRules for StaticAuthRules { &self, _ctx: &RequestMonitoring, _endpoint: EndpointId, - ) -> anyhow::Result> { + ) -> Result, FetchAuthRulesError> { let mappings = JWKS_ROLE_MAP.load(); let role_mappings = mappings .as_deref() - .context("JWKs settings for this role were not configured")?; + 
.ok_or(FetchAuthRulesError::RoleJwksNotConfigured)?; let mut rules = vec![]; for setting in &role_mappings.jwks { rules.push(AuthRule { diff --git a/proxy/src/auth/backend/mod.rs b/proxy/src/auth/backend/mod.rs index 96e1a787ed..7cf158bcd9 100644 --- a/proxy/src/auth/backend/mod.rs +++ b/proxy/src/auth/backend/mod.rs @@ -561,7 +561,8 @@ mod tests { &self, _ctx: &RequestMonitoring, _endpoint: crate::EndpointId, - ) -> anyhow::Result> { + ) -> Result, control_plane::errors::GetEndpointJwksError> + { unimplemented!() } diff --git a/proxy/src/control_plane/provider/mock.rs b/proxy/src/control_plane/provider/mock.rs index ea2eb79e2a..51cddec672 100644 --- a/proxy/src/control_plane/provider/mock.rs +++ b/proxy/src/control_plane/provider/mock.rs @@ -5,7 +5,8 @@ use super::{ AuthInfo, AuthSecret, CachedNodeInfo, NodeInfo, }; use crate::{ - auth::backend::jwt::AuthRule, context::RequestMonitoring, intern::RoleNameInt, RoleName, + auth::backend::jwt::AuthRule, context::RequestMonitoring, + control_plane::errors::GetEndpointJwksError, intern::RoleNameInt, RoleName, }; use crate::{auth::backend::ComputeUserInfo, compute, error::io_error, scram, url::ApiUrl}; use crate::{auth::IpPattern, cache::Cached}; @@ -120,7 +121,10 @@ impl Api { }) } - async fn do_get_endpoint_jwks(&self, endpoint: EndpointId) -> anyhow::Result> { + async fn do_get_endpoint_jwks( + &self, + endpoint: EndpointId, + ) -> Result, GetEndpointJwksError> { let (client, connection) = tokio_postgres::connect(self.endpoint.as_str(), tokio_postgres::NoTls).await?; @@ -224,7 +228,7 @@ impl super::Api for Api { &self, _ctx: &RequestMonitoring, endpoint: EndpointId, - ) -> anyhow::Result> { + ) -> Result, GetEndpointJwksError> { self.do_get_endpoint_jwks(endpoint).await } diff --git a/proxy/src/control_plane/provider/mod.rs b/proxy/src/control_plane/provider/mod.rs index 6cc525a324..0a196fe2a3 100644 --- a/proxy/src/control_plane/provider/mod.rs +++ b/proxy/src/control_plane/provider/mod.rs @@ -6,7 +6,7 @@ use 
super::messages::{ControlPlaneError, MetricsAuxInfo}; use crate::{ auth::{ backend::{ - jwt::{AuthRule, FetchAuthRules}, + jwt::{AuthRule, FetchAuthRules, FetchAuthRulesError}, ComputeCredentialKeys, ComputeUserInfo, }, IpPattern, @@ -44,7 +44,7 @@ pub(crate) mod errors { pub(crate) enum ApiError { /// Error returned by the console itself. #[error("{REQUEST_FAILED} with {0}")] - ControlPlane(ControlPlaneError), + ControlPlane(Box), /// Various IO errors like broken pipe or malformed payload. #[error("{REQUEST_FAILED}: {0}")] @@ -90,7 +90,7 @@ pub(crate) mod errors { Reason::ConcurrencyLimitReached => ErrorKind::ControlPlane, Reason::LockAlreadyTaken => ErrorKind::ControlPlane, Reason::RunningOperations => ErrorKind::ControlPlane, - Reason::Unknown => match &e { + Reason::Unknown => match &**e { ControlPlaneError { http_status_code: http::StatusCode::NOT_FOUND | http::StatusCode::NOT_ACCEPTABLE, @@ -246,6 +246,33 @@ pub(crate) mod errors { } } } + + #[derive(Debug, Error)] + pub enum GetEndpointJwksError { + #[error("endpoint not found")] + EndpointNotFound, + + #[error("failed to build control plane request: {0}")] + RequestBuild(#[source] reqwest::Error), + + #[error("failed to send control plane request: {0}")] + RequestExecute(#[source] reqwest_middleware::Error), + + #[error(transparent)] + ControlPlane(#[from] ApiError), + + #[cfg(any(test, feature = "testing"))] + #[error(transparent)] + TokioPostgres(#[from] tokio_postgres::Error), + + #[cfg(any(test, feature = "testing"))] + #[error(transparent)] + ParseUrl(#[from] url::ParseError), + + #[cfg(any(test, feature = "testing"))] + #[error(transparent)] + TaskJoin(#[from] tokio::task::JoinError), + } } /// Auth secret which is managed by the cloud. @@ -342,7 +369,7 @@ pub(crate) trait Api { &self, ctx: &RequestMonitoring, endpoint: EndpointId, - ) -> anyhow::Result>; + ) -> Result, errors::GetEndpointJwksError>; /// Wake up the compute node and return the corresponding connection info. 
async fn wake_compute( @@ -401,7 +428,7 @@ impl Api for ControlPlaneBackend { &self, ctx: &RequestMonitoring, endpoint: EndpointId, - ) -> anyhow::Result> { + ) -> Result, errors::GetEndpointJwksError> { match self { Self::Management(api) => api.get_endpoint_jwks(ctx, endpoint).await, #[cfg(any(test, feature = "testing"))] @@ -583,7 +610,9 @@ impl FetchAuthRules for ControlPlaneBackend { &self, ctx: &RequestMonitoring, endpoint: EndpointId, - ) -> anyhow::Result> { - self.get_endpoint_jwks(ctx, endpoint).await + ) -> Result, FetchAuthRulesError> { + self.get_endpoint_jwks(ctx, endpoint) + .await + .map_err(FetchAuthRulesError::GetEndpointJwks) } } diff --git a/proxy/src/control_plane/provider/neon.rs b/proxy/src/control_plane/provider/neon.rs index d01878741c..2487ce0e3f 100644 --- a/proxy/src/control_plane/provider/neon.rs +++ b/proxy/src/control_plane/provider/neon.rs @@ -9,7 +9,10 @@ use super::{ use crate::{ auth::backend::{jwt::AuthRule, ComputeUserInfo}, compute, - control_plane::messages::{ColdStartInfo, EndpointJwksResponse, Reason}, + control_plane::{ + errors::GetEndpointJwksError, + messages::{ColdStartInfo, EndpointJwksResponse, Reason}, + }, http, metrics::{CacheOutcome, Metrics}, rate_limiter::WakeComputeRateLimiter, @@ -17,7 +20,6 @@ use crate::{ }; use crate::{cache::Cached, context::RequestMonitoring}; use ::http::{header::AUTHORIZATION, HeaderName}; -use anyhow::bail; use futures::TryFutureExt; use std::{sync::Arc, time::Duration}; use tokio::time::Instant; @@ -137,14 +139,14 @@ impl Api { &self, ctx: &RequestMonitoring, endpoint: EndpointId, - ) -> anyhow::Result> { + ) -> Result, GetEndpointJwksError> { if !self .caches .endpoints_cache .is_valid(ctx, &endpoint.normalize()) .await { - bail!("endpoint not found"); + return Err(GetEndpointJwksError::EndpointNotFound); } let request_id = ctx.session_id().to_string(); async { @@ -159,12 +161,17 @@ impl Api { .header(X_REQUEST_ID, &request_id) .header(AUTHORIZATION, format!("Bearer {}", &self.jwt)) 
.query(&[("session_id", ctx.session_id())]) - .build()?; + .build() + .map_err(GetEndpointJwksError::RequestBuild)?; info!(url = request.url().as_str(), "sending http request"); let start = Instant::now(); let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Cplane); - let response = self.endpoint.execute(request).await?; + let response = self + .endpoint + .execute(request) + .await + .map_err(GetEndpointJwksError::RequestExecute)?; drop(pause); info!(duration = ?start.elapsed(), "received http response"); @@ -330,7 +337,7 @@ impl super::Api for Api { &self, ctx: &RequestMonitoring, endpoint: EndpointId, - ) -> anyhow::Result> { + ) -> Result, GetEndpointJwksError> { self.do_get_endpoint_jwks(ctx, endpoint).await } @@ -348,7 +355,7 @@ impl super::Api for Api { let (cached, info) = cached.take_value(); let info = info.map_err(|c| { info!(key = &*key, "found cached wake_compute error"); - WakeComputeError::ApiError(ApiError::ControlPlane(*c)) + WakeComputeError::ApiError(ApiError::ControlPlane(Box::new(*c))) })?; debug!(key = &*key, "found cached compute node info"); @@ -418,7 +425,7 @@ impl super::Api for Api { self.caches.node_info.insert_ttl( key, - Err(Box::new(err.clone())), + Err(err.clone()), Duration::from_secs(30), ); @@ -457,7 +464,7 @@ async fn parse_body serde::Deserialize<'a>>( body.http_status_code = status; warn!("console responded with an error ({status}): {body:?}"); - Err(ApiError::ControlPlane(body)) + Err(ApiError::ControlPlane(Box::new(body))) } fn parse_host_port(input: &str) -> Option<(&str, u16)> { diff --git a/proxy/src/proxy/tests/mod.rs b/proxy/src/proxy/tests/mod.rs index 58fb36dba7..deb4d4a63f 100644 --- a/proxy/src/proxy/tests/mod.rs +++ b/proxy/src/proxy/tests/mod.rs @@ -492,30 +492,32 @@ impl TestBackend for TestConnectMechanism { match action { ConnectAction::Wake => Ok(helper_create_cached_node_info(self.cache)), ConnectAction::WakeFail => { - let err = control_plane::errors::ApiError::ControlPlane(ControlPlaneError { - 
http_status_code: StatusCode::BAD_REQUEST, - error: "TEST".into(), - status: None, - }); + let err = + control_plane::errors::ApiError::ControlPlane(Box::new(ControlPlaneError { + http_status_code: StatusCode::BAD_REQUEST, + error: "TEST".into(), + status: None, + })); assert!(!err.could_retry()); Err(control_plane::errors::WakeComputeError::ApiError(err)) } ConnectAction::WakeRetry => { - let err = control_plane::errors::ApiError::ControlPlane(ControlPlaneError { - http_status_code: StatusCode::BAD_REQUEST, - error: "TEST".into(), - status: Some(Status { - code: "error".into(), - message: "error".into(), - details: Details { - error_info: None, - retry_info: Some(control_plane::messages::RetryInfo { - retry_delay_ms: 1, - }), - user_facing_message: None, - }, - }), - }); + let err = + control_plane::errors::ApiError::ControlPlane(Box::new(ControlPlaneError { + http_status_code: StatusCode::BAD_REQUEST, + error: "TEST".into(), + status: Some(Status { + code: "error".into(), + message: "error".into(), + details: Details { + error_info: None, + retry_info: Some(control_plane::messages::RetryInfo { + retry_delay_ms: 1, + }), + user_facing_message: None, + }, + }), + })); assert!(err.could_retry()); Err(control_plane::errors::WakeComputeError::ApiError(err)) } diff --git a/proxy/src/proxy/wake_compute.rs b/proxy/src/proxy/wake_compute.rs index ba674f5d0d..0d1527a2c1 100644 --- a/proxy/src/proxy/wake_compute.rs +++ b/proxy/src/proxy/wake_compute.rs @@ -79,7 +79,7 @@ fn report_error(e: &WakeComputeError, retry: bool) { Reason::ConcurrencyLimitReached => WakeupFailureKind::ApiConsoleLocked, Reason::LockAlreadyTaken => WakeupFailureKind::ApiConsoleLocked, Reason::RunningOperations => WakeupFailureKind::ApiConsoleLocked, - Reason::Unknown => match e { + Reason::Unknown => match **e { ControlPlaneError { http_status_code: StatusCode::LOCKED, ref error, From 614c3aef72ed595190801e8d77fe188e3cb13605 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Tue, 15 Oct 2024 
17:18:52 +0300 Subject: [PATCH 28/48] Remove redundant code (#9373) ## Problem There is a double update of the relation size cache in `put_rel_truncation`. Also, `page_server_request` contains a check that the fork is MAIN_FORKNUM, which 1. is incorrect (because VM/FSM pages are sharded in the same way as MAIN fork pages) and 2. is redundant because `page_server_request` is never called for `get page` requests, so the first part of the OR condition is always true. ## Summary of changes Remove redundant code ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --------- Co-authored-by: Konstantin Knizhnik --- pageserver/src/pgdatadir_mapping.rs | 3 --- pgxn/neon/pagestore_smgr.c | 3 +-- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 7aa313f031..900da5beab 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -1545,9 +1545,6 @@ impl<'a> DatadirModification<'a> { // Update relation size cache self.tline.set_cached_rel_size(rel, self.lsn, nblocks); - // Update relation size cache - self.tline.set_cached_rel_size(rel, self.lsn, nblocks); - // Update logical database size. self.pending_nblocks -= old_size as i64 - nblocks as i64; } diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index f46df7f70a..cbb0e2ae6d 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -1092,8 +1092,7 @@ page_server_request(void const *req) * Current sharding model assumes that all metadata is present only at shard 0.
* We still need to call get_shard_no() to check if shard map is up-to-date. */ - if (((NeonRequest *) req)->tag != T_NeonGetPageRequest || - ((NeonGetPageRequest *) req)->forknum != MAIN_FORKNUM) + if (((NeonRequest *) req)->tag != T_NeonGetPageRequest) { shard_no = 0; } From cf7a596a151487c1b3afafbe1eb2efab895326ea Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Tue, 15 Oct 2024 11:18:38 -0500 Subject: [PATCH 29/48] Generate sql_exporter config files with Jsonnet There are quite a few benefits to this approach: - Reduce config duplication - The two sql_exporter configs were super similar with just a few differences - Pull SQL queries into standalone files - That means we could run a SQL formatter on the file in the future - It also means access to syntax highlighting - In the future, run different queries for different PG versions - This is relevant because right now, we have queries that are failing on PG 17 due to catalog updates Signed-off-by: Tristan Partin --- .github/workflows/build_and_test.yml | 19 + Dockerfile.build-tools | 1 + Makefile | 1 + compute/.gitignore | 5 + compute/Dockerfile.compute-node | 22 +- compute/Makefile | 35 ++ compute/etc/README.md | 17 + compute/etc/neon_collector.jsonnet | 43 +++ compute/etc/neon_collector.yml | 331 ------------------ .../etc/neon_collector_autoscaling.jsonnet | 11 + compute/etc/neon_collector_autoscaling.yml | 55 --- compute/etc/sql_exporter.jsonnet | 40 +++ compute/etc/sql_exporter.yml | 33 -- .../sql_exporter/checkpoints_req.libsonnet | 10 + compute/etc/sql_exporter/checkpoints_req.sql | 1 + .../sql_exporter/checkpoints_timed.libsonnet | 10 + .../etc/sql_exporter/checkpoints_timed.sql | 1 + .../compute_current_lsn.libsonnet | 10 + .../etc/sql_exporter/compute_current_lsn.sql | 4 + .../compute_logical_snapshot_files.libsonnet | 12 + .../compute_logical_snapshot_files.sql | 7 + .../compute_receive_lsn.libsonnet | 10 + .../etc/sql_exporter/compute_receive_lsn.sql | 4 + .../compute_subscriptions_count.libsonnet | 
12 + .../compute_subscriptions_count.sql | 1 + .../sql_exporter/connection_counts.libsonnet | 13 + .../etc/sql_exporter/connection_counts.sql | 1 + .../etc/sql_exporter/db_total_size.libsonnet | 10 + compute/etc/sql_exporter/db_total_size.sql | 1 + .../getpage_prefetch_discards_total.libsonnet | 9 + .../getpage_prefetch_misses_total.libsonnet | 9 + .../getpage_prefetch_requests_total.libsonnet | 9 + .../getpage_sync_requests_total.libsonnet | 9 + .../getpage_wait_seconds_bucket.libsonnet | 12 + .../getpage_wait_seconds_bucket.sql | 1 + .../getpage_wait_seconds_count.libsonnet | 9 + .../getpage_wait_seconds_sum.libsonnet | 9 + ...lfc_approximate_working_set_size.libsonnet | 12 + .../lfc_approximate_working_set_size.sql | 1 + ...ing_set_size_windows.autoscaling.libsonnet | 12 + ...e_working_set_size_windows.autoscaling.sql | 8 + ...oximate_working_set_size_windows.libsonnet | 12 + ...c_approximate_working_set_size_windows.sql | 8 + .../lfc_cache_size_limit.libsonnet | 10 + .../etc/sql_exporter/lfc_cache_size_limit.sql | 1 + compute/etc/sql_exporter/lfc_hits.libsonnet | 10 + compute/etc/sql_exporter/lfc_hits.sql | 1 + compute/etc/sql_exporter/lfc_misses.libsonnet | 10 + compute/etc/sql_exporter/lfc_misses.sql | 1 + compute/etc/sql_exporter/lfc_used.libsonnet | 10 + compute/etc/sql_exporter/lfc_used.sql | 1 + compute/etc/sql_exporter/lfc_writes.libsonnet | 10 + compute/etc/sql_exporter/lfc_writes.sql | 1 + .../logical_slot_restart_lsn.libsonnet | 15 + .../sql_exporter/logical_slot_restart_lsn.sql | 3 + .../sql_exporter/max_cluster_size.libsonnet | 10 + compute/etc/sql_exporter/max_cluster_size.sql | 1 + .../etc/sql_exporter/neon_perf_counters.sql | 13 + .../pageserver_disconnects_total.libsonnet | 9 + .../pageserver_requests_sent_total.libsonnet | 9 + .../pageserver_send_flushes_total.libsonnet | 9 + .../sql_exporter/pg_stats_userdb.libsonnet | 18 + compute/etc/sql_exporter/pg_stats_userdb.sql | 10 + .../replication_delay_bytes.libsonnet | 10 + 
.../sql_exporter/replication_delay_bytes.sql | 6 + .../replication_delay_seconds.libsonnet | 10 + .../replication_delay_seconds.sql | 5 + .../etc/sql_exporter/retained_wal.libsonnet | 12 + compute/etc/sql_exporter/retained_wal.sql | 5 + .../etc/sql_exporter/wal_is_lost.libsonnet | 12 + compute/etc/sql_exporter/wal_is_lost.sql | 7 + compute/etc/sql_exporter_autoscaling.yml | 33 -- 72 files changed, 635 insertions(+), 457 deletions(-) create mode 100644 compute/.gitignore create mode 100644 compute/Makefile create mode 100644 compute/etc/README.md create mode 100644 compute/etc/neon_collector.jsonnet delete mode 100644 compute/etc/neon_collector.yml create mode 100644 compute/etc/neon_collector_autoscaling.jsonnet delete mode 100644 compute/etc/neon_collector_autoscaling.yml create mode 100644 compute/etc/sql_exporter.jsonnet delete mode 100644 compute/etc/sql_exporter.yml create mode 100644 compute/etc/sql_exporter/checkpoints_req.libsonnet create mode 100644 compute/etc/sql_exporter/checkpoints_req.sql create mode 100644 compute/etc/sql_exporter/checkpoints_timed.libsonnet create mode 100644 compute/etc/sql_exporter/checkpoints_timed.sql create mode 100644 compute/etc/sql_exporter/compute_current_lsn.libsonnet create mode 100644 compute/etc/sql_exporter/compute_current_lsn.sql create mode 100644 compute/etc/sql_exporter/compute_logical_snapshot_files.libsonnet create mode 100644 compute/etc/sql_exporter/compute_logical_snapshot_files.sql create mode 100644 compute/etc/sql_exporter/compute_receive_lsn.libsonnet create mode 100644 compute/etc/sql_exporter/compute_receive_lsn.sql create mode 100644 compute/etc/sql_exporter/compute_subscriptions_count.libsonnet create mode 100644 compute/etc/sql_exporter/compute_subscriptions_count.sql create mode 100644 compute/etc/sql_exporter/connection_counts.libsonnet create mode 100644 compute/etc/sql_exporter/connection_counts.sql create mode 100644 compute/etc/sql_exporter/db_total_size.libsonnet create mode 100644 
compute/etc/sql_exporter/db_total_size.sql create mode 100644 compute/etc/sql_exporter/getpage_prefetch_discards_total.libsonnet create mode 100644 compute/etc/sql_exporter/getpage_prefetch_misses_total.libsonnet create mode 100644 compute/etc/sql_exporter/getpage_prefetch_requests_total.libsonnet create mode 100644 compute/etc/sql_exporter/getpage_sync_requests_total.libsonnet create mode 100644 compute/etc/sql_exporter/getpage_wait_seconds_bucket.libsonnet create mode 100644 compute/etc/sql_exporter/getpage_wait_seconds_bucket.sql create mode 100644 compute/etc/sql_exporter/getpage_wait_seconds_count.libsonnet create mode 100644 compute/etc/sql_exporter/getpage_wait_seconds_sum.libsonnet create mode 100644 compute/etc/sql_exporter/lfc_approximate_working_set_size.libsonnet create mode 100644 compute/etc/sql_exporter/lfc_approximate_working_set_size.sql create mode 100644 compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.autoscaling.libsonnet create mode 100644 compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.autoscaling.sql create mode 100644 compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.libsonnet create mode 100644 compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.sql create mode 100644 compute/etc/sql_exporter/lfc_cache_size_limit.libsonnet create mode 100644 compute/etc/sql_exporter/lfc_cache_size_limit.sql create mode 100644 compute/etc/sql_exporter/lfc_hits.libsonnet create mode 100644 compute/etc/sql_exporter/lfc_hits.sql create mode 100644 compute/etc/sql_exporter/lfc_misses.libsonnet create mode 100644 compute/etc/sql_exporter/lfc_misses.sql create mode 100644 compute/etc/sql_exporter/lfc_used.libsonnet create mode 100644 compute/etc/sql_exporter/lfc_used.sql create mode 100644 compute/etc/sql_exporter/lfc_writes.libsonnet create mode 100644 compute/etc/sql_exporter/lfc_writes.sql create mode 100644 compute/etc/sql_exporter/logical_slot_restart_lsn.libsonnet create mode 100644 
compute/etc/sql_exporter/logical_slot_restart_lsn.sql create mode 100644 compute/etc/sql_exporter/max_cluster_size.libsonnet create mode 100644 compute/etc/sql_exporter/max_cluster_size.sql create mode 100644 compute/etc/sql_exporter/neon_perf_counters.sql create mode 100644 compute/etc/sql_exporter/pageserver_disconnects_total.libsonnet create mode 100644 compute/etc/sql_exporter/pageserver_requests_sent_total.libsonnet create mode 100644 compute/etc/sql_exporter/pageserver_send_flushes_total.libsonnet create mode 100644 compute/etc/sql_exporter/pg_stats_userdb.libsonnet create mode 100644 compute/etc/sql_exporter/pg_stats_userdb.sql create mode 100644 compute/etc/sql_exporter/replication_delay_bytes.libsonnet create mode 100644 compute/etc/sql_exporter/replication_delay_bytes.sql create mode 100644 compute/etc/sql_exporter/replication_delay_seconds.libsonnet create mode 100644 compute/etc/sql_exporter/replication_delay_seconds.sql create mode 100644 compute/etc/sql_exporter/retained_wal.libsonnet create mode 100644 compute/etc/sql_exporter/retained_wal.sql create mode 100644 compute/etc/sql_exporter/wal_is_lost.libsonnet create mode 100644 compute/etc/sql_exporter/wal_is_lost.sql delete mode 100644 compute/etc/sql_exporter_autoscaling.yml diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 51f6975e63..c9a447626f 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -120,6 +120,25 @@ jobs: - name: Run mypy to check types run: poetry run mypy . 
+ check-codestyle-jsonnet: + needs: [ check-permissions, build-build-tools-image ] + runs-on: [ self-hosted, small ] + container: + image: ${{ needs.build-build-tools-image.outputs.image }} + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + options: --init + + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Check Jsonnet code formatting + run: | + jsonnetfmt --test \ + $(find . -type f \( -name '*.jsonnet' -o -name '*.libsonnet' \)) + # Check that the vendor/postgres-* submodules point to the # corresponding REL_*_STABLE_neon branches. check-submodules: diff --git a/Dockerfile.build-tools b/Dockerfile.build-tools index 54e9134257..7cba1c8635 100644 --- a/Dockerfile.build-tools +++ b/Dockerfile.build-tools @@ -27,6 +27,7 @@ RUN set -e \ gnupg \ gzip \ jq \ + jsonnet \ libcurl4-openssl-dev \ libbz2-dev \ libffi-dev \ diff --git a/Makefile b/Makefile index 5e227ed3f5..33cfda2661 100644 --- a/Makefile +++ b/Makefile @@ -291,6 +291,7 @@ postgres-check: \ # This doesn't remove the effects of 'configure'.
.PHONY: clean clean: postgres-clean neon-pg-clean-ext + $(MAKE) -C compute clean $(CARGO_CMD_PREFIX) cargo clean # This removes everything diff --git a/compute/.gitignore b/compute/.gitignore new file mode 100644 index 0000000000..70980d335a --- /dev/null +++ b/compute/.gitignore @@ -0,0 +1,5 @@ +# sql_exporter config files generated from Jsonnet +etc/neon_collector.yml +etc/neon_collector_autoscaling.yml +etc/sql_exporter.yml +etc/sql_exporter_autoscaling.yml diff --git a/compute/Dockerfile.compute-node b/compute/Dockerfile.compute-node index 412c64eda4..13381b2901 100644 --- a/compute/Dockerfile.compute-node +++ b/compute/Dockerfile.compute-node @@ -349,7 +349,7 @@ ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # not version-specific -# doesn't use releases, last commit f3d82fd - Mar 2, 2023 +# doesn't use releases, last commit f3d82fd - Mar 2, 2023 RUN wget https://github.com/michelp/pgjwt/archive/f3d82fd30151e754e19ce5d6a06c71c20689ce3d.tar.gz -O pgjwt.tar.gz && \ echo "dae8ed99eebb7593b43013f6532d772b12dfecd55548d2673f2dfd0163f6d2b9 pgjwt.tar.gz" | sha256sum --check && \ mkdir pgjwt-src && cd pgjwt-src && tar xzf ../pgjwt.tar.gz --strip-components=1 -C . && \ @@ -1169,6 +1169,18 @@ RUN rm -r /usr/local/pgsql/include # if they were to be used by other libraries. 
RUN rm /usr/local/pgsql/lib/lib*.a +######################################################################################### +# +# Preprocess the sql_exporter configuration files +# +######################################################################################### +FROM $REPOSITORY/$IMAGE:$TAG AS sql_exporter_preprocessor + +USER nonroot + +COPY --chown=nonroot compute compute + +RUN make -C compute ######################################################################################### # @@ -1287,10 +1299,10 @@ RUN mkdir -p /etc/local_proxy && chown postgres:postgres /etc/local_proxy COPY --from=postgres-exporter /bin/postgres_exporter /bin/postgres_exporter COPY --from=sql-exporter /bin/sql_exporter /bin/sql_exporter -COPY --chmod=0644 compute/etc/sql_exporter.yml /etc/sql_exporter.yml -COPY --chmod=0644 compute/etc/neon_collector.yml /etc/neon_collector.yml -COPY --chmod=0644 compute/etc/sql_exporter_autoscaling.yml /etc/sql_exporter_autoscaling.yml -COPY --chmod=0644 compute/etc/neon_collector_autoscaling.yml /etc/neon_collector_autoscaling.yml +COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/sql_exporter.yml /etc/sql_exporter.yml +COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/neon_collector.yml /etc/neon_collector.yml +COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/sql_exporter_autoscaling.yml /etc/sql_exporter_autoscaling.yml +COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/neon_collector_autoscaling.yml /etc/neon_collector_autoscaling.yml # Create remote extension download directory RUN mkdir /usr/local/download_extensions && chown -R postgres:postgres /usr/local/download_extensions diff --git a/compute/Makefile b/compute/Makefile new file mode 100644 index 0000000000..45fbfa6d5e --- /dev/null +++ b/compute/Makefile @@ -0,0 +1,35 @@ +jsonnet_files = $(wildcard etc/*.jsonnet etc/*.libsonnet) + +.PHONY: all +all: neon_collector.yml 
neon_collector_autoscaling.yml sql_exporter.yml sql_exporter_autoscaling.yml + +neon_collector.yml: $(jsonnet_files) + JSONNET_PATH=etc jsonnet \ + --output-file etc/$@ \ + etc/neon_collector.jsonnet + +neon_collector_autoscaling.yml: $(jsonnet_files) + JSONNET_PATH=etc jsonnet \ + --output-file etc/$@ \ + etc/neon_collector_autoscaling.jsonnet + +sql_exporter.yml: $(jsonnet_files) + JSONNET_PATH=etc jsonnet \ + --output-file etc/$@ \ + --tla-str collector_file=neon_collector.yml \ + etc/sql_exporter.jsonnet + +sql_exporter_autoscaling.yml: $(jsonnet_files) + JSONNET_PATH=etc jsonnet \ + --output-file etc/$@ \ + --tla-str collector_file=neon_collector_autoscaling.yml \ + --tla-str application_name=sql_exporter_autoscaling \ + etc/sql_exporter.jsonnet + +.PHONY: clean +clean: + rm --force \ + etc/neon_collector.yml \ + etc/neon_collector_autoscaling.yml \ + etc/sql_exporter.yml \ + etc/sql_exporter_autoscaling.yml diff --git a/compute/etc/README.md b/compute/etc/README.md new file mode 100644 index 0000000000..70b108146c --- /dev/null +++ b/compute/etc/README.md @@ -0,0 +1,17 @@ +# Compute Configuration + +These files are the configuration files for various other pieces of software +that will be running in the compute alongside Postgres. + +## `sql_exporter` + +### Adding a `sql_exporter` Metric + +We use `sql_exporter` to export various metrics from Postgres. In order to add +a metric, you will need to create two files: a `libsonnet` and a `sql` file. You +will then import the `libsonnet` file in one of the collector files, and the +`sql` file will be imported in the `libsonnet` file. + +In the event your statistic is an LSN, you may want to cast it to a `float8` +because Prometheus only supports floats. It's probably fine because `float8` can +store integers from `-2^53` to `+2^53` exactly. 
diff --git a/compute/etc/neon_collector.jsonnet b/compute/etc/neon_collector.jsonnet new file mode 100644 index 0000000000..2031eb8c85 --- /dev/null +++ b/compute/etc/neon_collector.jsonnet @@ -0,0 +1,43 @@ +{ + collector_name: 'neon_collector', + metrics: [ + import 'sql_exporter/checkpoints_req.libsonnet', + import 'sql_exporter/checkpoints_timed.libsonnet', + import 'sql_exporter/compute_current_lsn.libsonnet', + import 'sql_exporter/compute_logical_snapshot_files.libsonnet', + import 'sql_exporter/compute_receive_lsn.libsonnet', + import 'sql_exporter/compute_subscriptions_count.libsonnet', + import 'sql_exporter/connection_counts.libsonnet', + import 'sql_exporter/db_total_size.libsonnet', + import 'sql_exporter/getpage_prefetch_discards_total.libsonnet', + import 'sql_exporter/getpage_prefetch_misses_total.libsonnet', + import 'sql_exporter/getpage_prefetch_requests_total.libsonnet', + import 'sql_exporter/getpage_sync_requests_total.libsonnet', + import 'sql_exporter/getpage_wait_seconds_bucket.libsonnet', + import 'sql_exporter/getpage_wait_seconds_count.libsonnet', + import 'sql_exporter/getpage_wait_seconds_sum.libsonnet', + import 'sql_exporter/lfc_approximate_working_set_size.libsonnet', + import 'sql_exporter/lfc_approximate_working_set_size_windows.libsonnet', + import 'sql_exporter/lfc_cache_size_limit.libsonnet', + import 'sql_exporter/lfc_hits.libsonnet', + import 'sql_exporter/lfc_misses.libsonnet', + import 'sql_exporter/lfc_used.libsonnet', + import 'sql_exporter/lfc_writes.libsonnet', + import 'sql_exporter/logical_slot_restart_lsn.libsonnet', + import 'sql_exporter/max_cluster_size.libsonnet', + import 'sql_exporter/pageserver_disconnects_total.libsonnet', + import 'sql_exporter/pageserver_requests_sent_total.libsonnet', + import 'sql_exporter/pageserver_send_flushes_total.libsonnet', + import 'sql_exporter/pg_stats_userdb.libsonnet', + import 'sql_exporter/replication_delay_bytes.libsonnet', + import 
'sql_exporter/replication_delay_seconds.libsonnet', + import 'sql_exporter/retained_wal.libsonnet', + import 'sql_exporter/wal_is_lost.libsonnet', + ], + queries: [ + { + query_name: 'neon_perf_counters', + query: importstr 'sql_exporter/neon_perf_counters.sql', + }, + ], +} diff --git a/compute/etc/neon_collector.yml b/compute/etc/neon_collector.yml deleted file mode 100644 index 92da0cdbdd..0000000000 --- a/compute/etc/neon_collector.yml +++ /dev/null @@ -1,331 +0,0 @@ -collector_name: neon_collector -metrics: -- metric_name: lfc_misses - type: gauge - help: 'lfc_misses' - key_labels: - values: [lfc_misses] - query: | - select lfc_value as lfc_misses from neon.neon_lfc_stats where lfc_key='file_cache_misses'; - -- metric_name: lfc_used - type: gauge - help: 'LFC chunks used (chunk = 1MB)' - key_labels: - values: [lfc_used] - query: | - select lfc_value as lfc_used from neon.neon_lfc_stats where lfc_key='file_cache_used'; - -- metric_name: lfc_hits - type: gauge - help: 'lfc_hits' - key_labels: - values: [lfc_hits] - query: | - select lfc_value as lfc_hits from neon.neon_lfc_stats where lfc_key='file_cache_hits'; - -- metric_name: lfc_writes - type: gauge - help: 'lfc_writes' - key_labels: - values: [lfc_writes] - query: | - select lfc_value as lfc_writes from neon.neon_lfc_stats where lfc_key='file_cache_writes'; - -- metric_name: lfc_cache_size_limit - type: gauge - help: 'LFC cache size limit in bytes' - key_labels: - values: [lfc_cache_size_limit] - query: | - select pg_size_bytes(current_setting('neon.file_cache_size_limit')) as lfc_cache_size_limit; - -- metric_name: connection_counts - type: gauge - help: 'Connection counts' - key_labels: - - datname - - state - values: [count] - query: | - select datname, state, count(*) as count from pg_stat_activity where state <> '' group by datname, state; - -- metric_name: pg_stats_userdb - type: gauge - help: 'Stats for several oldest non-system dbs' - key_labels: - - datname - value_label: kind - values: - - db_size 
- - deadlocks - # Rows - - inserted - - updated - - deleted - # We export stats for 10 non-system database. Without this limit - # it is too easy to abuse the system by creating lots of databases. - query: | - select pg_database_size(datname) as db_size, deadlocks, - tup_inserted as inserted, tup_updated as updated, tup_deleted as deleted, - datname - from pg_stat_database - where datname IN ( - select datname - from pg_database - where datname <> 'postgres' and not datistemplate - order by oid - limit 10 - ); - -- metric_name: max_cluster_size - type: gauge - help: 'neon.max_cluster_size setting' - key_labels: - values: [max_cluster_size] - query: | - select setting::int as max_cluster_size from pg_settings where name = 'neon.max_cluster_size'; - -- metric_name: db_total_size - type: gauge - help: 'Size of all databases' - key_labels: - values: [total] - query: | - select sum(pg_database_size(datname)) as total from pg_database; - -- metric_name: getpage_wait_seconds_count - type: counter - help: 'Number of getpage requests' - values: [getpage_wait_seconds_count] - query_ref: neon_perf_counters - -- metric_name: getpage_wait_seconds_sum - type: counter - help: 'Time spent in getpage requests' - values: [getpage_wait_seconds_sum] - query_ref: neon_perf_counters - -- metric_name: getpage_prefetch_requests_total - type: counter - help: 'Number of getpage issued for prefetching' - values: [getpage_prefetch_requests_total] - query_ref: neon_perf_counters - -- metric_name: getpage_sync_requests_total - type: counter - help: 'Number of synchronous getpage issued' - values: [getpage_sync_requests_total] - query_ref: neon_perf_counters - -- metric_name: getpage_prefetch_misses_total - type: counter - help: 'Total number of readahead misses; consisting of either prefetches that don''t satisfy the LSN bounds once the prefetch got read by the backend, or cases where somehow no readahead was issued for the read' - values: [getpage_prefetch_misses_total] - query_ref: 
neon_perf_counters - -- metric_name: getpage_prefetch_discards_total - type: counter - help: 'Number of prefetch responses issued but not used' - values: [getpage_prefetch_discards_total] - query_ref: neon_perf_counters - -- metric_name: pageserver_requests_sent_total - type: counter - help: 'Number of all requests sent to the pageserver (not just GetPage requests)' - values: [pageserver_requests_sent_total] - query_ref: neon_perf_counters - -- metric_name: pageserver_disconnects_total - type: counter - help: 'Number of times that the connection to the pageserver was lost' - values: [pageserver_disconnects_total] - query_ref: neon_perf_counters - -- metric_name: pageserver_send_flushes_total - type: counter - help: 'Number of flushes to the pageserver connection' - values: [pageserver_send_flushes_total] - query_ref: neon_perf_counters - -- metric_name: getpage_wait_seconds_bucket - type: counter - help: 'Histogram buckets of getpage request latency' - key_labels: - - bucket_le - values: [value] - query_ref: getpage_wait_seconds_buckets - -# DEPRECATED -- metric_name: lfc_approximate_working_set_size - type: gauge - help: 'Approximate working set size in pages of 8192 bytes' - key_labels: - values: [approximate_working_set_size] - query: | - select neon.approximate_working_set_size(false) as approximate_working_set_size; - -- metric_name: lfc_approximate_working_set_size_windows - type: gauge - help: 'Approximate working set size in pages of 8192 bytes' - key_labels: [duration] - values: [size] - # NOTE: This is the "public" / "human-readable" version. Here, we supply a small selection - # of durations in a pretty-printed form. 
- query: | - select - x as duration, - neon.approximate_working_set_size_seconds(extract('epoch' from x::interval)::int) as size - from - (values ('5m'),('15m'),('1h')) as t (x); - -- metric_name: compute_current_lsn - type: gauge - help: 'Current LSN of the database' - key_labels: - values: [lsn] - query: | - select - case - when pg_catalog.pg_is_in_recovery() - then (pg_last_wal_replay_lsn() - '0/0')::FLOAT8 - else (pg_current_wal_lsn() - '0/0')::FLOAT8 - end as lsn; - -- metric_name: compute_receive_lsn - type: gauge - help: 'Returns the last write-ahead log location that has been received and synced to disk by streaming replication' - key_labels: - values: [lsn] - query: | - SELECT - CASE - WHEN pg_catalog.pg_is_in_recovery() - THEN (pg_last_wal_receive_lsn() - '0/0')::FLOAT8 - ELSE 0 - END AS lsn; - -- metric_name: replication_delay_bytes - type: gauge - help: 'Bytes between received and replayed LSN' - key_labels: - values: [replication_delay_bytes] - # We use a GREATEST call here because this calculation can be negative. - # The calculation is not atomic, meaning after we've gotten the receive - # LSN, the replay LSN may have advanced past the receive LSN we - # are using for the calculation. 
- query: | - SELECT GREATEST(0, pg_wal_lsn_diff(pg_last_wal_receive_lsn(), pg_last_wal_replay_lsn())) AS replication_delay_bytes; - -- metric_name: replication_delay_seconds - type: gauge - help: 'Time since last LSN was replayed' - key_labels: - values: [replication_delay_seconds] - query: | - SELECT - CASE - WHEN pg_last_wal_receive_lsn() = pg_last_wal_replay_lsn() THEN 0 - ELSE GREATEST (0, EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp())) - END AS replication_delay_seconds; - -- metric_name: checkpoints_req - type: gauge - help: 'Number of requested checkpoints' - key_labels: - values: [checkpoints_req] - query: | - SELECT checkpoints_req FROM pg_stat_bgwriter; - -- metric_name: checkpoints_timed - type: gauge - help: 'Number of scheduled checkpoints' - key_labels: - values: [checkpoints_timed] - query: | - SELECT checkpoints_timed FROM pg_stat_bgwriter; - -- metric_name: compute_logical_snapshot_files - type: gauge - help: 'Number of snapshot files in pg_logical/snapshot' - key_labels: - - timeline_id - values: [num_logical_snapshot_files] - query: | - SELECT - (SELECT setting FROM pg_settings WHERE name = 'neon.timeline_id') AS timeline_id, - -- Postgres creates temporary snapshot files of the form %X-%X.snap.%d.tmp. These - -- temporary snapshot files are renamed to the actual snapshot files after they are - -- completely built. We only WAL-log the completely built snapshot files. - (SELECT COUNT(*) FROM pg_ls_dir('pg_logical/snapshots') AS name WHERE name LIKE '%.snap') AS num_logical_snapshot_files; - -# In all the below metrics, we cast LSNs to floats because Prometheus only supports floats. -# It's probably fine because float64 can store integers from -2^53 to +2^53 exactly. - -# Number of slots is limited by max_replication_slots, so collecting position for all of them shouldn't be bad. 
-- metric_name: logical_slot_restart_lsn - type: gauge - help: 'restart_lsn of logical slots' - key_labels: - - slot_name - values: [restart_lsn] - query: | - select slot_name, (restart_lsn - '0/0')::FLOAT8 as restart_lsn - from pg_replication_slots - where slot_type = 'logical'; - -- metric_name: compute_subscriptions_count - type: gauge - help: 'Number of logical replication subscriptions grouped by enabled/disabled' - key_labels: - - enabled - values: [subscriptions_count] - query: | - select subenabled::text as enabled, count(*) as subscriptions_count - from pg_subscription - group by subenabled; - -- metric_name: retained_wal - type: gauge - help: 'Retained WAL in inactive replication slots' - key_labels: - - slot_name - values: [retained_wal] - query: | - SELECT slot_name, pg_wal_lsn_diff(pg_current_wal_lsn(), restart_lsn)::FLOAT8 AS retained_wal - FROM pg_replication_slots - WHERE active = false; - -- metric_name: wal_is_lost - type: gauge - help: 'Whether or not the replication slot wal_status is lost' - key_labels: - - slot_name - values: [wal_is_lost] - query: | - SELECT slot_name, - CASE WHEN wal_status = 'lost' THEN 1 ELSE 0 END AS wal_is_lost - FROM pg_replication_slots; - -queries: - - query_name: neon_perf_counters - query: | - WITH c AS ( - SELECT pg_catalog.jsonb_object_agg(metric, value) jb FROM neon.neon_perf_counters - ) - SELECT d.* - FROM pg_catalog.jsonb_to_record((select jb from c)) as d( - getpage_wait_seconds_count numeric, - getpage_wait_seconds_sum numeric, - getpage_prefetch_requests_total numeric, - getpage_sync_requests_total numeric, - getpage_prefetch_misses_total numeric, - getpage_prefetch_discards_total numeric, - pageserver_requests_sent_total numeric, - pageserver_disconnects_total numeric, - pageserver_send_flushes_total numeric - ); - - - query_name: getpage_wait_seconds_buckets - query: | - SELECT bucket_le, value FROM neon.neon_perf_counters WHERE metric = 'getpage_wait_seconds_bucket'; diff --git 
a/compute/etc/neon_collector_autoscaling.jsonnet b/compute/etc/neon_collector_autoscaling.jsonnet new file mode 100644 index 0000000000..e248172a3d --- /dev/null +++ b/compute/etc/neon_collector_autoscaling.jsonnet @@ -0,0 +1,11 @@ +{ + collector_name: 'neon_collector_autoscaling', + metrics: [ + import 'sql_exporter/lfc_approximate_working_set_size_windows.autoscaling.libsonnet', + import 'sql_exporter/lfc_cache_size_limit.libsonnet', + import 'sql_exporter/lfc_hits.libsonnet', + import 'sql_exporter/lfc_misses.libsonnet', + import 'sql_exporter/lfc_used.libsonnet', + import 'sql_exporter/lfc_writes.libsonnet', + ], +} diff --git a/compute/etc/neon_collector_autoscaling.yml b/compute/etc/neon_collector_autoscaling.yml deleted file mode 100644 index 5616264eba..0000000000 --- a/compute/etc/neon_collector_autoscaling.yml +++ /dev/null @@ -1,55 +0,0 @@ -collector_name: neon_collector_autoscaling -metrics: -- metric_name: lfc_misses - type: gauge - help: 'lfc_misses' - key_labels: - values: [lfc_misses] - query: | - select lfc_value as lfc_misses from neon.neon_lfc_stats where lfc_key='file_cache_misses'; - -- metric_name: lfc_used - type: gauge - help: 'LFC chunks used (chunk = 1MB)' - key_labels: - values: [lfc_used] - query: | - select lfc_value as lfc_used from neon.neon_lfc_stats where lfc_key='file_cache_used'; - -- metric_name: lfc_hits - type: gauge - help: 'lfc_hits' - key_labels: - values: [lfc_hits] - query: | - select lfc_value as lfc_hits from neon.neon_lfc_stats where lfc_key='file_cache_hits'; - -- metric_name: lfc_writes - type: gauge - help: 'lfc_writes' - key_labels: - values: [lfc_writes] - query: | - select lfc_value as lfc_writes from neon.neon_lfc_stats where lfc_key='file_cache_writes'; - -- metric_name: lfc_cache_size_limit - type: gauge - help: 'LFC cache size limit in bytes' - key_labels: - values: [lfc_cache_size_limit] - query: | - select pg_size_bytes(current_setting('neon.file_cache_size_limit')) as lfc_cache_size_limit; - -- metric_name: 
lfc_approximate_working_set_size_windows - type: gauge - help: 'Approximate working set size in pages of 8192 bytes' - key_labels: [duration_seconds] - values: [size] - # NOTE: This is the "internal" / "machine-readable" version. This outputs the working set - # size looking back 1..60 minutes, labeled with the number of minutes. - query: | - select - x::text as duration_seconds, - neon.approximate_working_set_size_seconds(x) as size - from - (select generate_series * 60 as x from generate_series(1, 60)) as t (x); diff --git a/compute/etc/sql_exporter.jsonnet b/compute/etc/sql_exporter.jsonnet new file mode 100644 index 0000000000..1e3665ac47 --- /dev/null +++ b/compute/etc/sql_exporter.jsonnet @@ -0,0 +1,40 @@ +function(collector_file, application_name='sql_exporter') { + // Configuration for sql_exporter (template shared by the regular and autoscaling exporters) + // Global defaults. + global: { + // If scrape_timeout <= 0, no timeout is set unless Prometheus provides one. The default is 10s. + scrape_timeout: '10s', + // Subtracted from Prometheus' scrape_timeout to give us some headroom and prevent Prometheus from timing out first. + scrape_timeout_offset: '500ms', + // Minimum interval between collector runs: by default (0s) collectors are executed on every scrape. + min_interval: '0s', + // Maximum number of open connections to any one target. Metric queries will run concurrently on multiple connections, + // as will concurrent scrapes. + max_connections: 1, + // Maximum number of idle connections to any one target. Unless you use very long collection intervals, this should + // always be the same as max_connections. + max_idle_connections: 1, + // Maximum amount of time a connection may be reused. Expired connections may be closed lazily before reuse. + // If 0, connections are not closed due to a connection's age. + max_connection_lifetime: '5m', + }, + + // The target to monitor and the collectors to execute on it.
+ target: { + // Data source name always has a URI schema that matches the driver name. In some cases (e.g. MySQL) + // the schema gets dropped or replaced to match the driver expected DSN format. + data_source_name: std.format('postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable&application_name=%s', [application_name]), + + // Collectors (referenced by name) to execute on the target; derived from collector_file (neon_collector.yml -> neon_collector). + // Glob patterns are supported (see for syntax). + collectors: [ + std.strReplace(collector_file, '.yml', ''), + ], + }, + + // Collector files specifies a list of globs. One collector definition is read from each matching file. + // Glob patterns are supported (see for syntax). + collector_files: [ + collector_file, + ], +} diff --git a/compute/etc/sql_exporter.yml b/compute/etc/sql_exporter.yml deleted file mode 100644 index 139d04468a..0000000000 --- a/compute/etc/sql_exporter.yml +++ /dev/null @@ -1,33 +0,0 @@ -# Configuration for sql_exporter -# Global defaults. -global: - # If scrape_timeout <= 0, no timeout is set unless Prometheus provides one. The default is 10s. - scrape_timeout: 10s - # Subtracted from Prometheus' scrape_timeout to give us some headroom and prevent Prometheus from timing out first. - scrape_timeout_offset: 500ms - # Minimum interval between collector runs: by default (0s) collectors are executed on every scrape. - min_interval: 0s - # Maximum number of open connections to any one target. Metric queries will run concurrently on multiple connections, - # as will concurrent scrapes. - max_connections: 1 - # Maximum number of idle connections to any one target. Unless you use very long collection intervals, this should - # always be the same as max_connections. - max_idle_connections: 1 - # Maximum number of maximum amount of time a connection may be reused. Expired connections may be closed lazily before reuse. - # If 0, connections are not closed due to a connection's age.
- max_connection_lifetime: 5m - -# The target to monitor and the collectors to execute on it. -target: - # Data source name always has a URI schema that matches the driver name. In some cases (e.g. MySQL) - # the schema gets dropped or replaced to match the driver expected DSN format. - data_source_name: 'postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable&application_name=sql_exporter' - - # Collectors (referenced by name) to execute on the target. - # Glob patterns are supported (see for syntax). - collectors: [neon_collector] - -# Collector files specifies a list of globs. One collector definition is read from each matching file. -# Glob patterns are supported (see for syntax). -collector_files: - - "neon_collector.yml" diff --git a/compute/etc/sql_exporter/checkpoints_req.libsonnet b/compute/etc/sql_exporter/checkpoints_req.libsonnet new file mode 100644 index 0000000000..8697f8af3b --- /dev/null +++ b/compute/etc/sql_exporter/checkpoints_req.libsonnet @@ -0,0 +1,10 @@ +{ + metric_name: 'checkpoints_req', + type: 'gauge', + help: 'Number of requested checkpoints', + key_labels: null, + values: [ + 'checkpoints_req', + ], + query: importstr 'sql_exporter/checkpoints_req.sql', +} diff --git a/compute/etc/sql_exporter/checkpoints_req.sql b/compute/etc/sql_exporter/checkpoints_req.sql new file mode 100644 index 0000000000..eb8427c883 --- /dev/null +++ b/compute/etc/sql_exporter/checkpoints_req.sql @@ -0,0 +1 @@ +SELECT checkpoints_req FROM pg_stat_bgwriter; diff --git a/compute/etc/sql_exporter/checkpoints_timed.libsonnet b/compute/etc/sql_exporter/checkpoints_timed.libsonnet new file mode 100644 index 0000000000..9f0b742400 --- /dev/null +++ b/compute/etc/sql_exporter/checkpoints_timed.libsonnet @@ -0,0 +1,10 @@ +{ + metric_name: 'checkpoints_timed', + type: 'gauge', + help: 'Number of scheduled checkpoints', + key_labels: null, + values: [ + 'checkpoints_timed', + ], + query: importstr 'sql_exporter/checkpoints_timed.sql', +} diff --git 
a/compute/etc/sql_exporter/checkpoints_timed.sql b/compute/etc/sql_exporter/checkpoints_timed.sql new file mode 100644 index 0000000000..c50853134c --- /dev/null +++ b/compute/etc/sql_exporter/checkpoints_timed.sql @@ -0,0 +1 @@ +SELECT checkpoints_timed FROM pg_stat_bgwriter; diff --git a/compute/etc/sql_exporter/compute_current_lsn.libsonnet b/compute/etc/sql_exporter/compute_current_lsn.libsonnet new file mode 100644 index 0000000000..ccff161358 --- /dev/null +++ b/compute/etc/sql_exporter/compute_current_lsn.libsonnet @@ -0,0 +1,10 @@ +{ + metric_name: 'compute_current_lsn', + type: 'gauge', + help: 'Current LSN of the database', + key_labels: null, + values: [ + 'lsn', + ], + query: importstr 'sql_exporter/compute_current_lsn.sql', +} diff --git a/compute/etc/sql_exporter/compute_current_lsn.sql b/compute/etc/sql_exporter/compute_current_lsn.sql new file mode 100644 index 0000000000..be02b8a094 --- /dev/null +++ b/compute/etc/sql_exporter/compute_current_lsn.sql @@ -0,0 +1,4 @@ +SELECT CASE + WHEN pg_catalog.pg_is_in_recovery() THEN (pg_last_wal_replay_lsn() - '0/0')::FLOAT8 + ELSE (pg_current_wal_lsn() - '0/0')::FLOAT8 +END AS lsn; diff --git a/compute/etc/sql_exporter/compute_logical_snapshot_files.libsonnet b/compute/etc/sql_exporter/compute_logical_snapshot_files.libsonnet new file mode 100644 index 0000000000..212f079ccf --- /dev/null +++ b/compute/etc/sql_exporter/compute_logical_snapshot_files.libsonnet @@ -0,0 +1,12 @@ +{ + metric_name: 'compute_logical_snapshot_files', + type: 'gauge', + help: 'Number of snapshot files in pg_logical/snapshot', + key_labels: [ + 'timeline_id', + ], + values: [ + 'num_logical_snapshot_files', + ], + query: importstr 'sql_exporter/compute_logical_snapshot_files.sql', +} diff --git a/compute/etc/sql_exporter/compute_logical_snapshot_files.sql b/compute/etc/sql_exporter/compute_logical_snapshot_files.sql new file mode 100644 index 0000000000..f2454235b7 --- /dev/null +++ 
b/compute/etc/sql_exporter/compute_logical_snapshot_files.sql @@ -0,0 +1,7 @@ +SELECT + (SELECT setting FROM pg_settings WHERE name = 'neon.timeline_id') AS timeline_id, + -- Postgres creates temporary snapshot files of the form %X-%X.snap.%d.tmp. + -- These temporary snapshot files are renamed to the actual snapshot files + -- after they are completely built. We only WAL-log the completely built + -- snapshot files + (SELECT COUNT(*) FROM pg_ls_dir('pg_logical/snapshots') AS name WHERE name LIKE '%.snap') AS num_logical_snapshot_files; diff --git a/compute/etc/sql_exporter/compute_receive_lsn.libsonnet b/compute/etc/sql_exporter/compute_receive_lsn.libsonnet new file mode 100644 index 0000000000..eb68a77ec2 --- /dev/null +++ b/compute/etc/sql_exporter/compute_receive_lsn.libsonnet @@ -0,0 +1,10 @@ +{ + metric_name: 'compute_receive_lsn', + type: 'gauge', + help: 'Returns the last write-ahead log location that has been received and synced to disk by streaming replication', + key_labels: null, + values: [ + 'lsn', + ], + query: importstr 'sql_exporter/compute_receive_lsn.sql', +} diff --git a/compute/etc/sql_exporter/compute_receive_lsn.sql b/compute/etc/sql_exporter/compute_receive_lsn.sql new file mode 100644 index 0000000000..318b31ab41 --- /dev/null +++ b/compute/etc/sql_exporter/compute_receive_lsn.sql @@ -0,0 +1,4 @@ +SELECT CASE + WHEN pg_catalog.pg_is_in_recovery() THEN (pg_last_wal_receive_lsn() - '0/0')::FLOAT8 + ELSE 0 +END AS lsn; diff --git a/compute/etc/sql_exporter/compute_subscriptions_count.libsonnet b/compute/etc/sql_exporter/compute_subscriptions_count.libsonnet new file mode 100644 index 0000000000..e1575da397 --- /dev/null +++ b/compute/etc/sql_exporter/compute_subscriptions_count.libsonnet @@ -0,0 +1,12 @@ +{ + metric_name: 'compute_subscriptions_count', + type: 'gauge', + help: 'Number of logical replication subscriptions grouped by enabled/disabled', + key_labels: [ + 'enabled', + ], + values: [ + 'subscriptions_count', + ], + query: 
importstr 'sql_exporter/compute_subscriptions_count.sql', +} diff --git a/compute/etc/sql_exporter/compute_subscriptions_count.sql b/compute/etc/sql_exporter/compute_subscriptions_count.sql new file mode 100644 index 0000000000..50740cb5df --- /dev/null +++ b/compute/etc/sql_exporter/compute_subscriptions_count.sql @@ -0,0 +1 @@ +SELECT subenabled::text AS enabled, count(*) AS subscriptions_count FROM pg_subscription GROUP BY subenabled; diff --git a/compute/etc/sql_exporter/connection_counts.libsonnet b/compute/etc/sql_exporter/connection_counts.libsonnet new file mode 100644 index 0000000000..9f94db67a9 --- /dev/null +++ b/compute/etc/sql_exporter/connection_counts.libsonnet @@ -0,0 +1,13 @@ +{ + metric_name: 'connection_counts', + type: 'gauge', + help: 'Connection counts', + key_labels: [ + 'datname', + 'state', + ], + values: [ + 'count', + ], + query: importstr 'sql_exporter/connection_counts.sql', +} diff --git a/compute/etc/sql_exporter/connection_counts.sql b/compute/etc/sql_exporter/connection_counts.sql new file mode 100644 index 0000000000..6824480fdb --- /dev/null +++ b/compute/etc/sql_exporter/connection_counts.sql @@ -0,0 +1 @@ +SELECT datname, state, count(*) AS count FROM pg_stat_activity WHERE state <> '' GROUP BY datname, state; diff --git a/compute/etc/sql_exporter/db_total_size.libsonnet b/compute/etc/sql_exporter/db_total_size.libsonnet new file mode 100644 index 0000000000..6e08d5fb87 --- /dev/null +++ b/compute/etc/sql_exporter/db_total_size.libsonnet @@ -0,0 +1,10 @@ +{ + metric_name: 'db_total_size', + type: 'gauge', + help: 'Size of all databases', + key_labels: null, + values: [ + 'total', + ], + query: importstr 'sql_exporter/db_total_size.sql', +} diff --git a/compute/etc/sql_exporter/db_total_size.sql b/compute/etc/sql_exporter/db_total_size.sql new file mode 100644 index 0000000000..9cbbdfd8a3 --- /dev/null +++ b/compute/etc/sql_exporter/db_total_size.sql @@ -0,0 +1 @@ +SELECT sum(pg_database_size(datname)) AS total FROM pg_database; 
diff --git a/compute/etc/sql_exporter/getpage_prefetch_discards_total.libsonnet b/compute/etc/sql_exporter/getpage_prefetch_discards_total.libsonnet new file mode 100644 index 0000000000..935e35d2e4 --- /dev/null +++ b/compute/etc/sql_exporter/getpage_prefetch_discards_total.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'getpage_prefetch_discards_total', + type: 'counter', + help: 'Number of prefetch responses issued but not used', + values: [ + 'getpage_prefetch_discards_total', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/getpage_prefetch_misses_total.libsonnet b/compute/etc/sql_exporter/getpage_prefetch_misses_total.libsonnet new file mode 100644 index 0000000000..b9a9632105 --- /dev/null +++ b/compute/etc/sql_exporter/getpage_prefetch_misses_total.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'getpage_prefetch_misses_total', + type: 'counter', + help: "Total number of readahead misses; consisting of either prefetches that don't satisfy the LSN bounds once the prefetch got read by the backend, or cases where somehow no readahead was issued for the read", + values: [ + 'getpage_prefetch_misses_total', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/getpage_prefetch_requests_total.libsonnet b/compute/etc/sql_exporter/getpage_prefetch_requests_total.libsonnet new file mode 100644 index 0000000000..75fdb6717b --- /dev/null +++ b/compute/etc/sql_exporter/getpage_prefetch_requests_total.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'getpage_prefetch_requests_total', + type: 'counter', + help: 'Number of getpage issued for prefetching', + values: [ + 'getpage_prefetch_requests_total', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/getpage_sync_requests_total.libsonnet b/compute/etc/sql_exporter/getpage_sync_requests_total.libsonnet new file mode 100644 index 0000000000..f3a1e6b339 --- /dev/null +++ b/compute/etc/sql_exporter/getpage_sync_requests_total.libsonnet @@ -0,0 
+1,9 @@ +{ + metric_name: 'getpage_sync_requests_total', + type: 'counter', + help: 'Number of synchronous getpage issued', + values: [ + 'getpage_sync_requests_total', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/getpage_wait_seconds_bucket.libsonnet b/compute/etc/sql_exporter/getpage_wait_seconds_bucket.libsonnet new file mode 100644 index 0000000000..2adda2ad03 --- /dev/null +++ b/compute/etc/sql_exporter/getpage_wait_seconds_bucket.libsonnet @@ -0,0 +1,12 @@ +{ + metric_name: 'getpage_wait_seconds_bucket', + type: 'counter', + help: 'Histogram buckets of getpage request latency', + key_labels: [ + 'bucket_le', + ], + values: [ + 'value', + ], + query: importstr 'sql_exporter/getpage_wait_seconds_bucket.sql', +} diff --git a/compute/etc/sql_exporter/getpage_wait_seconds_bucket.sql b/compute/etc/sql_exporter/getpage_wait_seconds_bucket.sql new file mode 100644 index 0000000000..b4a6bc1560 --- /dev/null +++ b/compute/etc/sql_exporter/getpage_wait_seconds_bucket.sql @@ -0,0 +1 @@ +SELECT bucket_le, value FROM neon.neon_perf_counters WHERE metric = 'getpage_wait_seconds_bucket'; diff --git a/compute/etc/sql_exporter/getpage_wait_seconds_count.libsonnet b/compute/etc/sql_exporter/getpage_wait_seconds_count.libsonnet new file mode 100644 index 0000000000..d2326974fc --- /dev/null +++ b/compute/etc/sql_exporter/getpage_wait_seconds_count.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'getpage_wait_seconds_count', + type: 'counter', + help: 'Number of getpage requests', + values: [ + 'getpage_wait_seconds_count', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/getpage_wait_seconds_sum.libsonnet b/compute/etc/sql_exporter/getpage_wait_seconds_sum.libsonnet new file mode 100644 index 0000000000..844c8419ff --- /dev/null +++ b/compute/etc/sql_exporter/getpage_wait_seconds_sum.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'getpage_wait_seconds_sum', + type: 'counter', + help: 'Time spent in getpage requests', + 
values: [ + 'getpage_wait_seconds_sum', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/lfc_approximate_working_set_size.libsonnet b/compute/etc/sql_exporter/lfc_approximate_working_set_size.libsonnet new file mode 100644 index 0000000000..78859ce60d --- /dev/null +++ b/compute/etc/sql_exporter/lfc_approximate_working_set_size.libsonnet @@ -0,0 +1,12 @@ +// DEPRECATED + +{ + metric_name: 'lfc_approximate_working_set_size', + type: 'gauge', + help: 'Approximate working set size in pages of 8192 bytes', + key_labels: null, + values: [ + 'approximate_working_set_size', + ], + query: importstr 'sql_exporter/lfc_approximate_working_set_size.sql', +} diff --git a/compute/etc/sql_exporter/lfc_approximate_working_set_size.sql b/compute/etc/sql_exporter/lfc_approximate_working_set_size.sql new file mode 100644 index 0000000000..de509ebb47 --- /dev/null +++ b/compute/etc/sql_exporter/lfc_approximate_working_set_size.sql @@ -0,0 +1 @@ +SELECT neon.approximate_working_set_size(false) AS approximate_working_set_size; diff --git a/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.autoscaling.libsonnet b/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.autoscaling.libsonnet new file mode 100644 index 0000000000..a54deca467 --- /dev/null +++ b/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.autoscaling.libsonnet @@ -0,0 +1,12 @@ +{ + metric_name: 'lfc_approximate_working_set_size_windows', + type: 'gauge', + help: 'Approximate working set size in pages of 8192 bytes', + key_labels: [ + 'duration_seconds', + ], + values: [ + 'size', + ], + query: importstr 'sql_exporter/lfc_approximate_working_set_size_windows.autoscaling.sql', +} diff --git a/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.autoscaling.sql b/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.autoscaling.sql new file mode 100644 index 0000000000..35fa42c34c --- /dev/null +++ 
b/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.autoscaling.sql @@ -0,0 +1,8 @@ +-- NOTE: This is the "internal" / "machine-readable" version. This outputs the +-- working set size looking back 1..60 minutes, labeled with each window's +-- length in seconds. + +SELECT + x::text as duration_seconds, + neon.approximate_working_set_size_seconds(x) AS size +FROM (SELECT generate_series * 60 AS x FROM generate_series(1, 60)) AS t (x); diff --git a/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.libsonnet b/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.libsonnet new file mode 100644 index 0000000000..4970bd2c7f --- /dev/null +++ b/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.libsonnet @@ -0,0 +1,12 @@ +{ + metric_name: 'lfc_approximate_working_set_size_windows', + type: 'gauge', + help: 'Approximate working set size in pages of 8192 bytes', + key_labels: [ + 'duration', + ], + values: [ + 'size', + ], + query: importstr 'sql_exporter/lfc_approximate_working_set_size_windows.sql', +} diff --git a/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.sql b/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.sql new file mode 100644 index 0000000000..46c7d1610c --- /dev/null +++ b/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.sql @@ -0,0 +1,8 @@ +-- NOTE: This is the "public" / "human-readable" version. Here, we supply a +-- small selection of durations in a pretty-printed form.
+ +SELECT + x AS duration, + neon.approximate_working_set_size_seconds(extract('epoch' FROM x::interval)::int) AS size FROM ( + VALUES ('5m'), ('15m'), ('1h') + ) AS t (x); diff --git a/compute/etc/sql_exporter/lfc_cache_size_limit.libsonnet b/compute/etc/sql_exporter/lfc_cache_size_limit.libsonnet new file mode 100644 index 0000000000..4cbbd76621 --- /dev/null +++ b/compute/etc/sql_exporter/lfc_cache_size_limit.libsonnet @@ -0,0 +1,10 @@ +{ + metric_name: 'lfc_cache_size_limit', + type: 'gauge', + help: 'LFC cache size limit in bytes', + key_labels: null, + values: [ + 'lfc_cache_size_limit', + ], + query: importstr 'sql_exporter/lfc_cache_size_limit.sql', +} diff --git a/compute/etc/sql_exporter/lfc_cache_size_limit.sql b/compute/etc/sql_exporter/lfc_cache_size_limit.sql new file mode 100644 index 0000000000..378904c1fe --- /dev/null +++ b/compute/etc/sql_exporter/lfc_cache_size_limit.sql @@ -0,0 +1 @@ +SELECT pg_size_bytes(current_setting('neon.file_cache_size_limit')) AS lfc_cache_size_limit; diff --git a/compute/etc/sql_exporter/lfc_hits.libsonnet b/compute/etc/sql_exporter/lfc_hits.libsonnet new file mode 100644 index 0000000000..4a0b7671bf --- /dev/null +++ b/compute/etc/sql_exporter/lfc_hits.libsonnet @@ -0,0 +1,10 @@ +{ + metric_name: 'lfc_hits', + type: 'gauge', + help: 'lfc_hits', + key_labels: null, + values: [ + 'lfc_hits', + ], + query: importstr 'sql_exporter/lfc_hits.sql', +} diff --git a/compute/etc/sql_exporter/lfc_hits.sql b/compute/etc/sql_exporter/lfc_hits.sql new file mode 100644 index 0000000000..2e14f5c73c --- /dev/null +++ b/compute/etc/sql_exporter/lfc_hits.sql @@ -0,0 +1 @@ +SELECT lfc_value AS lfc_hits FROM neon.neon_lfc_stats WHERE lfc_key = 'file_cache_hits'; diff --git a/compute/etc/sql_exporter/lfc_misses.libsonnet b/compute/etc/sql_exporter/lfc_misses.libsonnet new file mode 100644 index 0000000000..302998d04f --- /dev/null +++ b/compute/etc/sql_exporter/lfc_misses.libsonnet @@ -0,0 +1,10 @@ +{ + metric_name: 'lfc_misses', + type: 
'gauge', + help: 'lfc_misses', + key_labels: null, + values: [ + 'lfc_misses', + ], + query: importstr 'sql_exporter/lfc_misses.sql', +} diff --git a/compute/etc/sql_exporter/lfc_misses.sql b/compute/etc/sql_exporter/lfc_misses.sql new file mode 100644 index 0000000000..27ed4ecf86 --- /dev/null +++ b/compute/etc/sql_exporter/lfc_misses.sql @@ -0,0 +1 @@ +SELECT lfc_value AS lfc_misses FROM neon.neon_lfc_stats WHERE lfc_key = 'file_cache_misses'; diff --git a/compute/etc/sql_exporter/lfc_used.libsonnet b/compute/etc/sql_exporter/lfc_used.libsonnet new file mode 100644 index 0000000000..23891dadaf --- /dev/null +++ b/compute/etc/sql_exporter/lfc_used.libsonnet @@ -0,0 +1,10 @@ +{ + metric_name: 'lfc_used', + type: 'gauge', + help: 'LFC chunks used (chunk = 1MB)', + key_labels: null, + values: [ + 'lfc_used', + ], + query: importstr 'sql_exporter/lfc_used.sql', +} diff --git a/compute/etc/sql_exporter/lfc_used.sql b/compute/etc/sql_exporter/lfc_used.sql new file mode 100644 index 0000000000..4f01545f30 --- /dev/null +++ b/compute/etc/sql_exporter/lfc_used.sql @@ -0,0 +1 @@ +SELECT lfc_value AS lfc_used FROM neon.neon_lfc_stats WHERE lfc_key = 'file_cache_used'; diff --git a/compute/etc/sql_exporter/lfc_writes.libsonnet b/compute/etc/sql_exporter/lfc_writes.libsonnet new file mode 100644 index 0000000000..6a22ee1dd9 --- /dev/null +++ b/compute/etc/sql_exporter/lfc_writes.libsonnet @@ -0,0 +1,10 @@ +{ + metric_name: 'lfc_writes', + type: 'gauge', + help: 'lfc_writes', + key_labels: null, + values: [ + 'lfc_writes', + ], + query: importstr 'sql_exporter/lfc_writes.sql', +} diff --git a/compute/etc/sql_exporter/lfc_writes.sql b/compute/etc/sql_exporter/lfc_writes.sql new file mode 100644 index 0000000000..37c9abc9cf --- /dev/null +++ b/compute/etc/sql_exporter/lfc_writes.sql @@ -0,0 +1 @@ +SELECT lfc_value AS lfc_writes FROM neon.neon_lfc_stats WHERE lfc_key = 'file_cache_writes'; diff --git a/compute/etc/sql_exporter/logical_slot_restart_lsn.libsonnet 
b/compute/etc/sql_exporter/logical_slot_restart_lsn.libsonnet new file mode 100644 index 0000000000..8ef31b5d8d --- /dev/null +++ b/compute/etc/sql_exporter/logical_slot_restart_lsn.libsonnet @@ -0,0 +1,15 @@ +// Number of slots is limited by max_replication_slots, so collecting position +// for all of them shouldn't be bad. + +{ + metric_name: 'logical_slot_restart_lsn', + type: 'gauge', + help: 'restart_lsn of logical slots', + key_labels: [ + 'slot_name', + ], + values: [ + 'restart_lsn', + ], + query: importstr 'sql_exporter/logical_slot_restart_lsn.sql', +} diff --git a/compute/etc/sql_exporter/logical_slot_restart_lsn.sql b/compute/etc/sql_exporter/logical_slot_restart_lsn.sql new file mode 100644 index 0000000000..1b1c038501 --- /dev/null +++ b/compute/etc/sql_exporter/logical_slot_restart_lsn.sql @@ -0,0 +1,3 @@ +SELECT slot_name, (restart_lsn - '0/0')::FLOAT8 as restart_lsn +FROM pg_replication_slots +WHERE slot_type = 'logical'; diff --git a/compute/etc/sql_exporter/max_cluster_size.libsonnet b/compute/etc/sql_exporter/max_cluster_size.libsonnet new file mode 100644 index 0000000000..1352fb77ee --- /dev/null +++ b/compute/etc/sql_exporter/max_cluster_size.libsonnet @@ -0,0 +1,10 @@ +{ + metric_name: 'max_cluster_size', + type: 'gauge', + help: 'neon.max_cluster_size setting', + key_labels: null, + values: [ + 'max_cluster_size', + ], + query: importstr 'sql_exporter/max_cluster_size.sql', +} diff --git a/compute/etc/sql_exporter/max_cluster_size.sql b/compute/etc/sql_exporter/max_cluster_size.sql new file mode 100644 index 0000000000..2d2355a9a7 --- /dev/null +++ b/compute/etc/sql_exporter/max_cluster_size.sql @@ -0,0 +1 @@ +SELECT setting::int AS max_cluster_size FROM pg_settings WHERE name = 'neon.max_cluster_size'; diff --git a/compute/etc/sql_exporter/neon_perf_counters.sql b/compute/etc/sql_exporter/neon_perf_counters.sql new file mode 100644 index 0000000000..58998907a0 --- /dev/null +++ b/compute/etc/sql_exporter/neon_perf_counters.sql @@ -0,0 
+1,13 @@ +WITH c AS (SELECT pg_catalog.jsonb_object_agg(metric, value) jb FROM neon.neon_perf_counters) + +SELECT d.* FROM pg_catalog.jsonb_to_record((SELECT jb FROM c)) AS d( + getpage_wait_seconds_count numeric, + getpage_wait_seconds_sum numeric, + getpage_prefetch_requests_total numeric, + getpage_sync_requests_total numeric, + getpage_prefetch_misses_total numeric, + getpage_prefetch_discards_total numeric, + pageserver_requests_sent_total numeric, + pageserver_disconnects_total numeric, + pageserver_send_flushes_total numeric +); diff --git a/compute/etc/sql_exporter/pageserver_disconnects_total.libsonnet b/compute/etc/sql_exporter/pageserver_disconnects_total.libsonnet new file mode 100644 index 0000000000..5ad9ba078e --- /dev/null +++ b/compute/etc/sql_exporter/pageserver_disconnects_total.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'pageserver_disconnects_total', + type: 'counter', + help: 'Number of times that the connection to the pageserver was lost', + values: [ + 'pageserver_disconnects_total', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/pageserver_requests_sent_total.libsonnet b/compute/etc/sql_exporter/pageserver_requests_sent_total.libsonnet new file mode 100644 index 0000000000..c191e2467f --- /dev/null +++ b/compute/etc/sql_exporter/pageserver_requests_sent_total.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'pageserver_requests_sent_total', + type: 'counter', + help: 'Number of all requests sent to the pageserver (not just GetPage requests)', + values: [ + 'pageserver_requests_sent_total', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/pageserver_send_flushes_total.libsonnet b/compute/etc/sql_exporter/pageserver_send_flushes_total.libsonnet new file mode 100644 index 0000000000..9fa5f77758 --- /dev/null +++ b/compute/etc/sql_exporter/pageserver_send_flushes_total.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'pageserver_send_flushes_total', + type: 'counter', + help: 'Number of 
flushes to the pageserver connection', + values: [ + 'pageserver_send_flushes_total', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/pg_stats_userdb.libsonnet b/compute/etc/sql_exporter/pg_stats_userdb.libsonnet new file mode 100644 index 0000000000..46ea2f4192 --- /dev/null +++ b/compute/etc/sql_exporter/pg_stats_userdb.libsonnet @@ -0,0 +1,18 @@ +{ + metric_name: 'pg_stats_userdb', + type: 'gauge', + help: 'Stats for several oldest non-system dbs', + key_labels: [ + 'datname', + ], + value_label: 'kind', + values: [ + 'db_size', + 'deadlocks', + // Rows + 'inserted', + 'updated', + 'deleted', + ], + query: importstr 'sql_exporter/pg_stats_userdb.sql', +} diff --git a/compute/etc/sql_exporter/pg_stats_userdb.sql b/compute/etc/sql_exporter/pg_stats_userdb.sql new file mode 100644 index 0000000000..00ada87370 --- /dev/null +++ b/compute/etc/sql_exporter/pg_stats_userdb.sql @@ -0,0 +1,10 @@ +-- We export stats for 10 non-system databases. Without this limit it is too +-- easy to abuse the system by creating lots of databases. 
+ +SELECT pg_database_size(datname) AS db_size, deadlocks, tup_inserted AS inserted, + tup_updated AS updated, tup_deleted AS deleted, datname +FROM pg_stat_database +WHERE datname IN ( + SELECT datname FROM pg_database + WHERE datname <> 'postgres' AND NOT datistemplate ORDER BY oid LIMIT 10 +); diff --git a/compute/etc/sql_exporter/replication_delay_bytes.libsonnet b/compute/etc/sql_exporter/replication_delay_bytes.libsonnet new file mode 100644 index 0000000000..3e5bb6af1f --- /dev/null +++ b/compute/etc/sql_exporter/replication_delay_bytes.libsonnet @@ -0,0 +1,10 @@ +{ + metric_name: 'replication_delay_bytes', + type: 'gauge', + help: 'Bytes between received and replayed LSN', + key_labels: null, + values: [ + 'replication_delay_bytes', + ], + query: importstr 'sql_exporter/replication_delay_bytes.sql', +} diff --git a/compute/etc/sql_exporter/replication_delay_bytes.sql b/compute/etc/sql_exporter/replication_delay_bytes.sql new file mode 100644 index 0000000000..60a6981acd --- /dev/null +++ b/compute/etc/sql_exporter/replication_delay_bytes.sql @@ -0,0 +1,6 @@ +-- We use a GREATEST call here because this calculation can be negative. The +-- calculation is not atomic, meaning after we've gotten the receive LSN, the +-- replay LSN may have advanced past the receive LSN we are using for the +-- calculation. 
+ +SELECT GREATEST(0, pg_wal_lsn_diff(pg_last_wal_receive_lsn(), pg_last_wal_replay_lsn())) AS replication_delay_bytes; diff --git a/compute/etc/sql_exporter/replication_delay_seconds.libsonnet b/compute/etc/sql_exporter/replication_delay_seconds.libsonnet new file mode 100644 index 0000000000..d3f2c21b54 --- /dev/null +++ b/compute/etc/sql_exporter/replication_delay_seconds.libsonnet @@ -0,0 +1,10 @@ +{ + metric_name: 'replication_delay_seconds', + type: 'gauge', + help: 'Time since last LSN was replayed', + key_labels: null, + values: [ + 'replication_delay_seconds', + ], + query: importstr 'sql_exporter/replication_delay_seconds.sql', +} diff --git a/compute/etc/sql_exporter/replication_delay_seconds.sql b/compute/etc/sql_exporter/replication_delay_seconds.sql new file mode 100644 index 0000000000..a76809ad74 --- /dev/null +++ b/compute/etc/sql_exporter/replication_delay_seconds.sql @@ -0,0 +1,5 @@ +SELECT + CASE + WHEN pg_last_wal_receive_lsn() = pg_last_wal_replay_lsn() THEN 0 + ELSE GREATEST(0, EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp())) + END AS replication_delay_seconds; diff --git a/compute/etc/sql_exporter/retained_wal.libsonnet b/compute/etc/sql_exporter/retained_wal.libsonnet new file mode 100644 index 0000000000..f9eff5faa5 --- /dev/null +++ b/compute/etc/sql_exporter/retained_wal.libsonnet @@ -0,0 +1,12 @@ +{ + metric_name: 'retained_wal', + type: 'gauge', + help: 'Retained WAL in inactive replication slots', + key_labels: [ + 'slot_name', + ], + values: [ + 'retained_wal', + ], + query: importstr 'sql_exporter/retained_wal.sql', +} diff --git a/compute/etc/sql_exporter/retained_wal.sql b/compute/etc/sql_exporter/retained_wal.sql new file mode 100644 index 0000000000..6c58359461 --- /dev/null +++ b/compute/etc/sql_exporter/retained_wal.sql @@ -0,0 +1,5 @@ +SELECT + slot_name, + pg_wal_lsn_diff(pg_current_wal_lsn(), restart_lsn)::FLOAT8 AS retained_wal +FROM pg_replication_slots +WHERE active = false; diff --git 
a/compute/etc/sql_exporter/wal_is_lost.libsonnet b/compute/etc/sql_exporter/wal_is_lost.libsonnet new file mode 100644 index 0000000000..3cd25f4b39 --- /dev/null +++ b/compute/etc/sql_exporter/wal_is_lost.libsonnet @@ -0,0 +1,12 @@ +{ + metric_name: 'wal_is_lost', + type: 'gauge', + help: 'Whether or not the replication slot wal_status is lost', + key_labels: [ + 'slot_name', + ], + values: [ + 'wal_is_lost', + ], + query: importstr 'sql_exporter/wal_is_lost.sql', +} diff --git a/compute/etc/sql_exporter/wal_is_lost.sql b/compute/etc/sql_exporter/wal_is_lost.sql new file mode 100644 index 0000000000..5521270851 --- /dev/null +++ b/compute/etc/sql_exporter/wal_is_lost.sql @@ -0,0 +1,7 @@ +SELECT + slot_name, + CASE + WHEN wal_status = 'lost' THEN 1 + ELSE 0 + END AS wal_is_lost +FROM pg_replication_slots; diff --git a/compute/etc/sql_exporter_autoscaling.yml b/compute/etc/sql_exporter_autoscaling.yml deleted file mode 100644 index 044557233e..0000000000 --- a/compute/etc/sql_exporter_autoscaling.yml +++ /dev/null @@ -1,33 +0,0 @@ -# Configuration for sql_exporter for autoscaling-agent -# Global defaults. -global: - # If scrape_timeout <= 0, no timeout is set unless Prometheus provides one. The default is 10s. - scrape_timeout: 10s - # Subtracted from Prometheus' scrape_timeout to give us some headroom and prevent Prometheus from timing out first. - scrape_timeout_offset: 500ms - # Minimum interval between collector runs: by default (0s) collectors are executed on every scrape. - min_interval: 0s - # Maximum number of open connections to any one target. Metric queries will run concurrently on multiple connections, - # as will concurrent scrapes. - max_connections: 1 - # Maximum number of idle connections to any one target. Unless you use very long collection intervals, this should - # always be the same as max_connections. - max_idle_connections: 1 - # Maximum number of maximum amount of time a connection may be reused. 
Expired connections may be closed lazily before reuse. - # If 0, connections are not closed due to a connection's age. - max_connection_lifetime: 5m - -# The target to monitor and the collectors to execute on it. -target: - # Data source name always has a URI schema that matches the driver name. In some cases (e.g. MySQL) - # the schema gets dropped or replaced to match the driver expected DSN format. - data_source_name: 'postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable&application_name=sql_exporter_autoscaling' - - # Collectors (referenced by name) to execute on the target. - # Glob patterns are supported (see <https://pkg.go.dev/path/filepath#Match> for syntax). - collectors: [neon_collector_autoscaling] - -# Collector files specifies a list of globs. One collector definition is read from each matching file. -# Glob patterns are supported (see <https://pkg.go.dev/path/filepath#Match> for syntax). -collector_files: - - "neon_collector_autoscaling.yml" From f1eb7032569c35ec47806c5e736486508d559439 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Tue, 15 Oct 2024 16:35:21 -0400 Subject: [PATCH 30/48] fix(pageserver): use a buffer for basebackup; add aux basebackup metrics log (#9401) Our replication bench project is stuck because it is too slow to generate basebackup and it caused compute to disconnect. https://neondb.slack.com/archives/C03438W3FLZ/p1728330685012419 The compute timeout for waiting for basebackup is 10m (is it true?). Generating basebackup directly on pageserver takes ~3min. Therefore, I suspect it's because there is too much wasted round-trip time writing the 10000+ snapshot aux files. Also, it is possible that the basebackup process takes so long retrieving all aux files that it does not write anything over the wire protocol, causing a read timeout. Basebackup size is 800KB gzipped for that project and was 55MB tar before compression. ## Summary of changes * Potentially fix the issue by placing a write buffer for basebackup.
* Log how many aux files we read and the time spent on it. Signed-off-by: Alex Chi Z --- pageserver/src/basebackup.rs | 21 +++++++++++++++++---- pageserver/src/page_service.rs | 10 +++++++--- 2 files changed, 24 insertions(+), 7 deletions(-) diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index a32d09f3b3..975318419f 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -16,7 +16,7 @@ use fail::fail_point; use pageserver_api::key::Key; use postgres_ffi::pg_constants; use std::fmt::Write as FmtWrite; -use std::time::SystemTime; +use std::time::{Instant, SystemTime}; use tokio::io; use tokio::io::AsyncWrite; use tracing::*; @@ -352,12 +352,25 @@ where } } - for (path, content) in self + let start_time = Instant::now(); + let aux_files = self .timeline .list_aux_files(self.lsn, self.ctx) .await - .map_err(|e| BasebackupError::Server(e.into()))? - { + .map_err(|e| BasebackupError::Server(e.into()))?; + let aux_scan_time = start_time.elapsed(); + let aux_estimated_size = aux_files + .values() + .map(|content| content.len()) + .sum::<usize>(); + info!( + "Scanned {} aux files in {}ms, aux file content size = {}", + aux_files.len(), + aux_scan_time.as_millis(), + aux_estimated_size + ); + + for (path, content) in aux_files { if path.starts_with("pg_replslot") { let offs = pg_constants::REPL_SLOT_ON_DISK_OFFSETOF_RESTART_LSN; let restart_lsn = Lsn(u64::from_le_bytes( diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 8fa6b9a7f0..afb2f92ff8 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -26,8 +26,8 @@ use std::str::FromStr; use std::sync::Arc; use std::time::SystemTime; use std::time::{Duration, Instant}; -use tokio::io::AsyncWriteExt; use tokio::io::{AsyncRead, AsyncWrite}; +use tokio::io::{AsyncWriteExt, BufWriter}; use tokio::task::JoinHandle; use tokio_util::sync::CancellationToken; use tracing::*; @@ -1137,10 +1137,10 @@ impl PageServerHandler { .await
.map_err(map_basebackup_error)?; } else { - let mut writer = pgb.copyout_writer(); + let mut writer = BufWriter::new(pgb.copyout_writer()); if gzip { let mut encoder = GzipEncoder::with_quality( - writer, + &mut writer, // NOTE using fast compression because it's on the critical path // for compute startup. For an empty database, we get // <100KB with this method. The Level::Best compression method @@ -1175,6 +1175,10 @@ impl PageServerHandler { .await .map_err(map_basebackup_error)?; } + writer + .flush() + .await + .map_err(|e| map_basebackup_error(BasebackupError::Client(e)))?; } pgb.write_message_noflush(&BeMessage::CopyDone) From 18f4e5f10cd1eeaa5a5949f9a6130983691311d6 Mon Sep 17 00:00:00 2001 From: Matthias van de Meent Date: Tue, 15 Oct 2024 23:13:31 +0200 Subject: [PATCH 31/48] Add newly added metrics from neondatabase/neon#9116 to exports (#9402) They weren't added in that PR, but should be available immediately on rollout as the neon extension already defaults to 1.5. --- compute/etc/neon_collector.jsonnet | 8 ++++++++ .../file_cache_read_wait_seconds_bucket.libsonnet | 12 ++++++++++++ .../file_cache_read_wait_seconds_bucket.sql | 1 + .../file_cache_read_wait_seconds_count.libsonnet | 9 +++++++++ .../file_cache_read_wait_seconds_sum.libsonnet | 9 +++++++++ .../file_cache_write_wait_seconds_bucket.libsonnet | 12 ++++++++++++ .../file_cache_write_wait_seconds_bucket.sql | 1 + .../file_cache_write_wait_seconds_count.libsonnet | 9 +++++++++ .../file_cache_write_wait_seconds_sum.libsonnet | 9 +++++++++ .../getpage_prefetches_buffered.libsonnet | 9 +++++++++ compute/etc/sql_exporter/neon_perf_counters.sql | 8 +++++++- .../sql_exporter/pageserver_open_requests.libsonnet | 9 +++++++++ 12 files changed, 95 insertions(+), 1 deletion(-) create mode 100644 compute/etc/sql_exporter/file_cache_read_wait_seconds_bucket.libsonnet create mode 100644 compute/etc/sql_exporter/file_cache_read_wait_seconds_bucket.sql create mode 100644 
compute/etc/sql_exporter/file_cache_read_wait_seconds_count.libsonnet create mode 100644 compute/etc/sql_exporter/file_cache_read_wait_seconds_sum.libsonnet create mode 100644 compute/etc/sql_exporter/file_cache_write_wait_seconds_bucket.libsonnet create mode 100644 compute/etc/sql_exporter/file_cache_write_wait_seconds_bucket.sql create mode 100644 compute/etc/sql_exporter/file_cache_write_wait_seconds_count.libsonnet create mode 100644 compute/etc/sql_exporter/file_cache_write_wait_seconds_sum.libsonnet create mode 100644 compute/etc/sql_exporter/getpage_prefetches_buffered.libsonnet create mode 100644 compute/etc/sql_exporter/pageserver_open_requests.libsonnet diff --git a/compute/etc/neon_collector.jsonnet b/compute/etc/neon_collector.jsonnet index 2031eb8c85..8b43ebe7a3 100644 --- a/compute/etc/neon_collector.jsonnet +++ b/compute/etc/neon_collector.jsonnet @@ -9,9 +9,16 @@ import 'sql_exporter/compute_subscriptions_count.libsonnet', import 'sql_exporter/connection_counts.libsonnet', import 'sql_exporter/db_total_size.libsonnet', + import 'sql_exporter/file_cache_read_wait_seconds_bucket.libsonnet', + import 'sql_exporter/file_cache_read_wait_seconds_count.libsonnet', + import 'sql_exporter/file_cache_read_wait_seconds_sum.libsonnet', + import 'sql_exporter/file_cache_write_wait_seconds_bucket.libsonnet', + import 'sql_exporter/file_cache_write_wait_seconds_count.libsonnet', + import 'sql_exporter/file_cache_write_wait_seconds_sum.libsonnet', import 'sql_exporter/getpage_prefetch_discards_total.libsonnet', import 'sql_exporter/getpage_prefetch_misses_total.libsonnet', import 'sql_exporter/getpage_prefetch_requests_total.libsonnet', + import 'sql_exporter/getpage_prefetches_buffered.libsonnet', import 'sql_exporter/getpage_sync_requests_total.libsonnet', import 'sql_exporter/getpage_wait_seconds_bucket.libsonnet', import 'sql_exporter/getpage_wait_seconds_count.libsonnet', @@ -28,6 +35,7 @@ import 'sql_exporter/pageserver_disconnects_total.libsonnet', import 
'sql_exporter/pageserver_requests_sent_total.libsonnet', import 'sql_exporter/pageserver_send_flushes_total.libsonnet', + import 'sql_exporter/pageserver_open_requests.libsonnet', import 'sql_exporter/pg_stats_userdb.libsonnet', import 'sql_exporter/replication_delay_bytes.libsonnet', import 'sql_exporter/replication_delay_seconds.libsonnet', diff --git a/compute/etc/sql_exporter/file_cache_read_wait_seconds_bucket.libsonnet b/compute/etc/sql_exporter/file_cache_read_wait_seconds_bucket.libsonnet new file mode 100644 index 0000000000..d13f657a7f --- /dev/null +++ b/compute/etc/sql_exporter/file_cache_read_wait_seconds_bucket.libsonnet @@ -0,0 +1,12 @@ +{ + metric_name: 'file_cache_read_wait_seconds_bucket', + type: 'counter', + help: 'Histogram buckets of LFC read operation latencies', + key_labels: [ + 'bucket_le', + ], + values: [ + 'value', + ], + query: importstr 'sql_exporter/file_cache_read_wait_seconds_bucket.sql', +} diff --git a/compute/etc/sql_exporter/file_cache_read_wait_seconds_bucket.sql b/compute/etc/sql_exporter/file_cache_read_wait_seconds_bucket.sql new file mode 100644 index 0000000000..09047bf0c4 --- /dev/null +++ b/compute/etc/sql_exporter/file_cache_read_wait_seconds_bucket.sql @@ -0,0 +1 @@ +SELECT bucket_le, value FROM neon.neon_perf_counters WHERE metric = 'file_cache_read_wait_seconds_bucket'; diff --git a/compute/etc/sql_exporter/file_cache_read_wait_seconds_count.libsonnet b/compute/etc/sql_exporter/file_cache_read_wait_seconds_count.libsonnet new file mode 100644 index 0000000000..aa028b0f5e --- /dev/null +++ b/compute/etc/sql_exporter/file_cache_read_wait_seconds_count.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'file_cache_read_wait_seconds_count', + type: 'counter', + help: 'Number of read operations in LFC', + values: [ + 'file_cache_read_wait_seconds_count', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/file_cache_read_wait_seconds_sum.libsonnet 
b/compute/etc/sql_exporter/file_cache_read_wait_seconds_sum.libsonnet new file mode 100644 index 0000000000..2547aabf3d --- /dev/null +++ b/compute/etc/sql_exporter/file_cache_read_wait_seconds_sum.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'file_cache_read_wait_seconds_sum', + type: 'counter', + help: 'Time spent in LFC read operations', + values: [ + 'file_cache_read_wait_seconds_sum', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/file_cache_write_wait_seconds_bucket.libsonnet b/compute/etc/sql_exporter/file_cache_write_wait_seconds_bucket.libsonnet new file mode 100644 index 0000000000..13dbc77f76 --- /dev/null +++ b/compute/etc/sql_exporter/file_cache_write_wait_seconds_bucket.libsonnet @@ -0,0 +1,12 @@ +{ + metric_name: 'file_cache_write_wait_seconds_bucket', + type: 'counter', + help: 'Histogram buckets of LFC write operation latencies', + key_labels: [ + 'bucket_le', + ], + values: [ + 'value', + ], + query: importstr 'sql_exporter/file_cache_write_wait_seconds_bucket.sql', +} diff --git a/compute/etc/sql_exporter/file_cache_write_wait_seconds_bucket.sql b/compute/etc/sql_exporter/file_cache_write_wait_seconds_bucket.sql new file mode 100644 index 0000000000..d03613cf91 --- /dev/null +++ b/compute/etc/sql_exporter/file_cache_write_wait_seconds_bucket.sql @@ -0,0 +1 @@ +SELECT bucket_le, value FROM neon.neon_perf_counters WHERE metric = 'file_cache_write_wait_seconds_bucket'; diff --git a/compute/etc/sql_exporter/file_cache_write_wait_seconds_count.libsonnet b/compute/etc/sql_exporter/file_cache_write_wait_seconds_count.libsonnet new file mode 100644 index 0000000000..6227d3193a --- /dev/null +++ b/compute/etc/sql_exporter/file_cache_write_wait_seconds_count.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'file_cache_write_wait_seconds_count', + type: 'counter', + help: 'Number of write operations in LFC', + values: [ + 'file_cache_write_wait_seconds_count', + ], + query_ref: 'neon_perf_counters', +} diff --git 
a/compute/etc/sql_exporter/file_cache_write_wait_seconds_sum.libsonnet b/compute/etc/sql_exporter/file_cache_write_wait_seconds_sum.libsonnet new file mode 100644 index 0000000000..2acfe7f608 --- /dev/null +++ b/compute/etc/sql_exporter/file_cache_write_wait_seconds_sum.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'file_cache_write_wait_seconds_sum', + type: 'counter', + help: 'Time spent in LFC write operations', + values: [ + 'file_cache_write_wait_seconds_sum', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/getpage_prefetches_buffered.libsonnet b/compute/etc/sql_exporter/getpage_prefetches_buffered.libsonnet new file mode 100644 index 0000000000..8926d867c9 --- /dev/null +++ b/compute/etc/sql_exporter/getpage_prefetches_buffered.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'getpage_prefetches_buffered', + type: 'gauge', + help: 'Number of prefetched pages buffered in neon', + values: [ + 'getpage_prefetches_buffered', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/neon_perf_counters.sql b/compute/etc/sql_exporter/neon_perf_counters.sql index 58998907a0..4a36f3bf2f 100644 --- a/compute/etc/sql_exporter/neon_perf_counters.sql +++ b/compute/etc/sql_exporter/neon_perf_counters.sql @@ -1,13 +1,19 @@ WITH c AS (SELECT pg_catalog.jsonb_object_agg(metric, value) jb FROM neon.neon_perf_counters) SELECT d.* FROM pg_catalog.jsonb_to_record((SELECT jb FROM c)) AS d( + file_cache_read_wait_seconds_count numeric, + file_cache_read_wait_seconds_sum numeric, + file_cache_write_wait_seconds_count numeric, + file_cache_write_wait_seconds_sum numeric, getpage_wait_seconds_count numeric, getpage_wait_seconds_sum numeric, getpage_prefetch_requests_total numeric, getpage_sync_requests_total numeric, getpage_prefetch_misses_total numeric, getpage_prefetch_discards_total numeric, + getpage_prefetches_buffered numeric, pageserver_requests_sent_total numeric, pageserver_disconnects_total numeric, - 
pageserver_send_flushes_total numeric + pageserver_send_flushes_total numeric, + pageserver_open_requests numeric ); diff --git a/compute/etc/sql_exporter/pageserver_open_requests.libsonnet b/compute/etc/sql_exporter/pageserver_open_requests.libsonnet new file mode 100644 index 0000000000..dca89ea64a --- /dev/null +++ b/compute/etc/sql_exporter/pageserver_open_requests.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'pageserver_open_requests', + type: 'gauge', + help: 'Number of open requests to PageServer', + values: [ + 'pageserver_open_requests', + ], + query_ref: 'neon_perf_counters', +} From be5d6a69dc6a05d339235d00958eb9fea7b0e9f5 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Tue, 15 Oct 2024 16:30:31 -0500 Subject: [PATCH 32/48] Fix jsonnet_files wildcard Just a typo in a path. Signed-off-by: Tristan Partin --- compute/Makefile | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/compute/Makefile b/compute/Makefile index 45fbfa6d5e..b407fc60be 100644 --- a/compute/Makefile +++ b/compute/Makefile @@ -1,4 +1,6 @@ -jsonnet_files = $(wildcard etc/*.jsonnet etc/*.libsonnet) +jsonnet_files = $(wildcard \ + etc/*.jsonnet \ + etc/sql_exporter/*.libsonnet) .PHONY: all all: neon_collector.yml neon_collector_autoscaling.yml sql_exporter.yml sql_exporter_autoscaling.yml From 061ea0de7a9768716d941e2e3472f19e075a5ce5 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Tue, 15 Oct 2024 20:01:13 -0500 Subject: [PATCH 33/48] Add jsonnetfmt targets This should make it a little bit easier for people wanting to check if their files are formated correctly. Has the added bonus of making the CI check simpler as well. 
Signed-off-by: Tristan Partin --- .github/workflows/build_and_test.yml | 3 +-- compute/Makefile | 8 ++++++++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index c9a447626f..faee1d89e1 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -136,8 +136,7 @@ jobs: - name: Check Jsonnet code formatting run: | - jsonnetfmt --test \ - $(find . -type f -name '*.jsonnet' -o -name '*.libsonnet') + make -C compute jsonnetfmt-test # Check that the vendor/postgres-* submodules point to the # corresponding REL_*_STABLE_neon branches. diff --git a/compute/Makefile b/compute/Makefile index b407fc60be..f8faa882ee 100644 --- a/compute/Makefile +++ b/compute/Makefile @@ -35,3 +35,11 @@ clean: etc/neon_collector_autoscaling.yml \ etc/sql_exporter.yml \ etc/sql_exporter_autoscaling.yml + +.PHONY: jsonnetfmt-test +jsonnetfmt-test: + jsonnetfmt --test $(jsonnet_files) + +.PHONY: jsonnetfmt-format +jsonnetfmt-format: + jsonnetfmt --in-place $(jsonnet_files) From bc6b8cee01cc4055332fef052c048856612bcbab Mon Sep 17 00:00:00 2001 From: Cihan Demirci <128653800+fcdm@users.noreply.github.com> Date: Wed, 16 Oct 2024 10:43:48 +0100 Subject: [PATCH 34/48] don't trigger workflows in two repos (#9340) https://github.com/neondatabase/cloud/issues/16723 --- .github/workflows/build_and_test.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index faee1d89e1..b669eaeb11 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -1100,7 +1100,6 @@ jobs: run: | if [[ "$GITHUB_REF_NAME" == "main" ]]; then gh workflow --repo neondatabase/infra run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=false - gh workflow --repo neondatabase/azure run deploy.yml -f dockerTag=${{needs.tag.outputs.build-tag}} 
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then gh workflow --repo neondatabase/infra run deploy-dev.yml --ref main \ -f deployPgSniRouter=false \ From 89a65a9e5a30c7525d165d1a9c2675d05811bfcb Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 16 Oct 2024 13:39:58 +0100 Subject: [PATCH 35/48] pageserver: improve handling of archival_config calls during Timeline shutdown (#9415) ## Problem In test `test_timeline_offloading`, we see failures like: ``` PageserverApiException: queue is in state Stopped ``` Example failure: https://neon-github-public-dev.s3.amazonaws.com/reports/main/11356917668/index.html#testresult/ff0e348a78a974ee/retries ## Summary of changes - Amend code paths that handle errors from RemoteTimelineClient to check for cancellation and emit the Cancelled error variant in these cases (will give clients a 503 to retry) - Remove the implicit `#[from]` for the Other error case, to make it harder to add code that accidentally squashes errors into this (500-equivalent) error variant. This would be neater if we made RemoteTimelineClient return a structured error instead of anyhow::Error, but that's a bigger refactor. I'm not sure if the test really intends to hit this path, but the error handling fix makes sense either way. 
--- pageserver/src/tenant.rs | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 44d1bb74ca..20925c7fd6 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -67,7 +67,7 @@ use self::metadata::TimelineMetadata; use self::mgr::GetActiveTenantError; use self::mgr::GetTenantError; use self::remote_timeline_client::upload::upload_index_part; -use self::remote_timeline_client::RemoteTimelineClient; +use self::remote_timeline_client::{RemoteTimelineClient, WaitCompletionError}; use self::timeline::uninit::TimelineCreateGuard; use self::timeline::uninit::TimelineExclusionError; use self::timeline::uninit::UninitializedTimeline; @@ -632,7 +632,7 @@ pub enum TimelineArchivalError { AlreadyInProgress, #[error(transparent)] - Other(#[from] anyhow::Error), + Other(anyhow::Error), } impl Debug for TimelineArchivalError { @@ -1602,7 +1602,8 @@ impl Tenant { "failed to load remote timeline {} for tenant {}", timeline_id, self.tenant_shard_id ) - })?; + }) + .map_err(TimelineArchivalError::Other)?; let timelines = self.timelines.lock().unwrap(); if let Some(timeline) = timelines.get(&timeline_id) { let mut offloaded_timelines = self.timelines_offloaded.lock().unwrap(); @@ -1672,9 +1673,19 @@ impl Tenant { }; // Third part: upload new timeline archival state and block until it is present in S3 - let upload_needed = timeline + let upload_needed = match timeline .remote_client - .schedule_index_upload_for_timeline_archival_state(new_state)?; + .schedule_index_upload_for_timeline_archival_state(new_state) + { + Ok(upload_needed) => upload_needed, + Err(e) => { + if timeline.cancel.is_cancelled() { + return Err(TimelineArchivalError::Cancelled); + } else { + return Err(TimelineArchivalError::Other(e)); + } + } + }; if upload_needed { info!("Uploading new state"); @@ -1685,7 +1696,14 @@ impl Tenant { tracing::warn!("reached timeout for waiting on upload queue"); 
return Err(TimelineArchivalError::Timeout); }; - v.map_err(|e| TimelineArchivalError::Other(anyhow::anyhow!(e)))?; + v.map_err(|e| match e { + WaitCompletionError::NotInitialized(e) => { + TimelineArchivalError::Other(anyhow::anyhow!(e)) + } + WaitCompletionError::UploadQueueShutDownOrStopped => { + TimelineArchivalError::Cancelled + } + })?; } Ok(()) } From f14e45f0cee38bfbbbf1141d486fdd8edfbcc2f2 Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Wed, 16 Oct 2024 15:01:56 +0200 Subject: [PATCH 36/48] proxy: format imports with nightly rustfmt (#9414) ```shell cargo +nightly fmt -p proxy -- -l --config imports_granularity=Module,group_imports=StdExternalCrate,reorder_imports=true ``` These rust-analyzer settings for VSCode should help retain this style: ```json "rust-analyzer.imports.group.enable": true, "rust-analyzer.imports.prefix": "crate", "rust-analyzer.imports.merge.glob": false, "rust-analyzer.imports.granularity.group": "module", "rust-analyzer.imports.granularity.enforce": true, ``` --- proxy/src/auth/backend/classic.rs | 19 +++-- proxy/src/auth/backend/console_redirect.rs | 21 +++-- proxy/src/auth/backend/hacks.rs | 19 +++-- proxy/src/auth/backend/jwt.rs | 39 +++++---- proxy/src/auth/backend/local.rs | 19 ++--- proxy/src/auth/backend/mod.rs | 61 ++++++-------- proxy/src/auth/credentials.rs | 25 +++--- proxy/src/auth/flow.rs | 25 +++--- proxy/src/auth/mod.rs | 12 +-- proxy/src/bin/local_proxy.rs | 50 ++++++------ proxy/src/bin/pg_sni_router.rs | 16 ++-- proxy/src/bin/proxy.rs | 51 +++++------- proxy/src/cache/endpoints.rs | 34 +++----- proxy/src/cache/project_info.rs | 27 +++---- proxy/src/cache/timed_lru.rs | 13 ++- proxy/src/cancellation.rs | 14 ++-- proxy/src/compute.rs | 30 ++++--- proxy/src/config.rs | 37 ++++----- proxy/src/console_redirect_proxy.rs | 29 +++---- proxy/src/context/mod.rs | 21 ++--- proxy/src/context/parquet.rs | 49 ++++++------ proxy/src/control_plane/messages.rs | 9 ++- proxy/src/control_plane/mgmt.rs | 10 +-- 
proxy/src/control_plane/provider/mock.rs | 39 ++++----- proxy/src/control_plane/provider/mod.rs | 47 +++++------ proxy/src/control_plane/provider/neon.rs | 42 +++++----- proxy/src/error.rs | 3 +- proxy/src/http/health_server.rs | 25 +++--- proxy/src/http/mod.rs | 19 +++-- proxy/src/intern.rs | 14 ++-- proxy/src/jemalloc.rs | 16 ++-- proxy/src/logging.rs | 16 ++-- proxy/src/metrics.rs | 8 +- proxy/src/protocol2.rs | 10 +-- proxy/src/proxy/connect_compute.rs | 29 ++++--- proxy/src/proxy/copy_bidirectional.rs | 9 ++- proxy/src/proxy/handshake.rs | 20 +++-- proxy/src/proxy/mod.rs | 36 ++++----- proxy/src/proxy/passthrough.rs | 14 ++-- proxy/src/proxy/retry.rs | 8 +- proxy/src/proxy/tests/mitm.rs | 3 +- proxy/src/proxy/tests/mod.rs | 22 ++--- proxy/src/proxy/wake_compute.rs | 11 +-- proxy/src/rate_limiter/leaky_bucket.rs | 6 +- proxy/src/rate_limiter/limit_algorithm.rs | 12 +-- .../src/rate_limiter/limit_algorithm/aimd.rs | 3 +- proxy/src/rate_limiter/limiter.rs | 24 +++--- proxy/src/rate_limiter/mod.rs | 4 +- proxy/src/redis/cancellation_publisher.rs | 7 +- .../connection_with_credentials_provider.rs | 9 +-- proxy/src/redis/notifications.rs | 17 ++-- proxy/src/sasl/messages.rs | 3 +- proxy/src/sasl/mod.rs | 5 +- proxy/src/sasl/stream.rs | 7 +- proxy/src/scram/countmin.rs | 4 +- proxy/src/scram/exchange.rs | 3 +- proxy/src/scram/messages.rs | 5 +- proxy/src/scram/mod.rs | 15 ++-- proxy/src/scram/pbkdf2.rs | 10 +-- proxy/src/scram/threadpool.rs | 32 +++----- proxy/src/serverless/backend.rs | 58 ++++++-------- proxy/src/serverless/cancel_set.rs | 8 +- proxy/src/serverless/conn_pool.rs | 44 +++++----- proxy/src/serverless/http_conn_pool.rs | 17 ++-- proxy/src/serverless/http_util.rs | 7 +- proxy/src/serverless/json.rs | 9 +-- proxy/src/serverless/local_conn_pool.rs | 25 +++--- proxy/src/serverless/mod.rs | 19 +++-- proxy/src/serverless/sql_over_http.rs | 80 ++++++------------- proxy/src/serverless/websocket.rs | 41 ++++------ proxy/src/stream.rs | 17 ++-- 
proxy/src/usage_metrics.rs | 41 +++++----- proxy/src/waiters.rs | 8 +- 73 files changed, 726 insertions(+), 835 deletions(-) diff --git a/proxy/src/auth/backend/classic.rs b/proxy/src/auth/backend/classic.rs index 94b84b6f00..de32a06e9e 100644 --- a/proxy/src/auth/backend/classic.rs +++ b/proxy/src/auth/backend/classic.rs @@ -1,16 +1,15 @@ -use super::{ComputeCredentials, ComputeUserInfo}; -use crate::{ - auth::{self, backend::ComputeCredentialKeys, AuthFlow}, - compute, - config::AuthenticationConfig, - context::RequestMonitoring, - control_plane::AuthSecret, - sasl, - stream::{PqStream, Stream}, -}; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::{info, warn}; +use super::{ComputeCredentials, ComputeUserInfo}; +use crate::auth::backend::ComputeCredentialKeys; +use crate::auth::{self, AuthFlow}; +use crate::config::AuthenticationConfig; +use crate::context::RequestMonitoring; +use crate::control_plane::AuthSecret; +use crate::stream::{PqStream, Stream}; +use crate::{compute, sasl}; + pub(super) async fn authenticate( ctx: &RequestMonitoring, creds: ComputeUserInfo, diff --git a/proxy/src/auth/backend/console_redirect.rs b/proxy/src/auth/backend/console_redirect.rs index 457410ec8c..255e1fed54 100644 --- a/proxy/src/auth/backend/console_redirect.rs +++ b/proxy/src/auth/backend/console_redirect.rs @@ -1,15 +1,3 @@ -use crate::{ - auth, - cache::Cached, - compute, - config::AuthenticationConfig, - context::RequestMonitoring, - control_plane::{self, provider::NodeInfo, CachedNodeInfo}, - error::{ReportableError, UserFacingError}, - proxy::connect_compute::ComputeConnectBackend, - stream::PqStream, - waiters, -}; use async_trait::async_trait; use pq_proto::BeMessage as Be; use thiserror::Error; @@ -18,6 +6,15 @@ use tokio_postgres::config::SslMode; use tracing::{info, info_span}; use super::ComputeCredentialKeys; +use crate::cache::Cached; +use crate::config::AuthenticationConfig; +use crate::context::RequestMonitoring; +use 
crate::control_plane::provider::NodeInfo; +use crate::control_plane::{self, CachedNodeInfo}; +use crate::error::{ReportableError, UserFacingError}; +use crate::proxy::connect_compute::ComputeConnectBackend; +use crate::stream::PqStream; +use crate::{auth, compute, waiters}; #[derive(Debug, Error)] pub(crate) enum WebAuthError { diff --git a/proxy/src/auth/backend/hacks.rs b/proxy/src/auth/backend/hacks.rs index 749218d260..8ab8d5d37f 100644 --- a/proxy/src/auth/backend/hacks.rs +++ b/proxy/src/auth/backend/hacks.rs @@ -1,16 +1,15 @@ -use super::{ComputeCredentials, ComputeUserInfo, ComputeUserInfoNoEndpoint}; -use crate::{ - auth::{self, AuthFlow}, - config::AuthenticationConfig, - context::RequestMonitoring, - control_plane::AuthSecret, - intern::EndpointIdInt, - sasl, - stream::{self, Stream}, -}; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::{info, warn}; +use super::{ComputeCredentials, ComputeUserInfo, ComputeUserInfoNoEndpoint}; +use crate::auth::{self, AuthFlow}; +use crate::config::AuthenticationConfig; +use crate::context::RequestMonitoring; +use crate::control_plane::AuthSecret; +use crate::intern::EndpointIdInt; +use crate::sasl; +use crate::stream::{self, Stream}; + /// Compared to [SCRAM](crate::scram), cleartext password auth saves /// one round trip and *expensive* computations (>= 4096 HMAC iterations). 
/// These properties are benefical for serverless JS workers, so we diff --git a/proxy/src/auth/backend/jwt.rs b/proxy/src/auth/backend/jwt.rs index 402e59fdb3..3f53ee24c3 100644 --- a/proxy/src/auth/backend/jwt.rs +++ b/proxy/src/auth/backend/jwt.rs @@ -1,22 +1,22 @@ -use std::{ - future::Future, - sync::Arc, - time::{Duration, SystemTime}, -}; +use std::future::Future; +use std::sync::Arc; +use std::time::{Duration, SystemTime}; use arc_swap::ArcSwapOption; use dashmap::DashMap; use jose_jwk::crypto::KeyInfo; -use serde::{de::Visitor, Deserialize, Deserializer}; +use serde::de::Visitor; +use serde::{Deserialize, Deserializer}; use signature::Verifier; use thiserror::Error; use tokio::time::Instant; -use crate::{ - auth::backend::ComputeCredentialKeys, context::RequestMonitoring, - control_plane::errors::GetEndpointJwksError, http::parse_json_body_with_limit, - intern::RoleNameInt, EndpointId, RoleName, -}; +use crate::auth::backend::ComputeCredentialKeys; +use crate::context::RequestMonitoring; +use crate::control_plane::errors::GetEndpointJwksError; +use crate::http::parse_json_body_with_limit; +use crate::intern::RoleNameInt; +use crate::{EndpointId, RoleName}; // TODO(conrad): make these configurable. 
const CLOCK_SKEW_LEEWAY: Duration = Duration::from_secs(30); @@ -381,10 +381,8 @@ fn verify_rsa_signature( alg: &jose_jwa::Algorithm, ) -> Result<(), JwtError> { use jose_jwa::{Algorithm, Signing}; - use rsa::{ - pkcs1v15::{Signature, VerifyingKey}, - RsaPublicKey, - }; + use rsa::pkcs1v15::{Signature, VerifyingKey}; + use rsa::RsaPublicKey; let key = RsaPublicKey::try_from(key).map_err(JwtError::InvalidRsaKey)?; @@ -655,11 +653,9 @@ impl From<&jose_jwk::Key> for KeyType { #[cfg(test)] mod tests { - use crate::RoleName; - - use super::*; - - use std::{future::IntoFuture, net::SocketAddr, time::SystemTime}; + use std::future::IntoFuture; + use std::net::SocketAddr; + use std::time::SystemTime; use base64::URL_SAFE_NO_PAD; use bytes::Bytes; @@ -672,6 +668,9 @@ mod tests { use signature::Signer; use tokio::net::TcpListener; + use super::*; + use crate::RoleName; + fn new_ec_jwk(kid: String) -> (p256::SecretKey, jose_jwk::Jwk) { let sk = p256::SecretKey::random(&mut OsRng); let pk = sk.public_key().into(); diff --git a/proxy/src/auth/backend/local.rs b/proxy/src/auth/backend/local.rs index 1dea4d2d73..e3995ac6c0 100644 --- a/proxy/src/auth/backend/local.rs +++ b/proxy/src/auth/backend/local.rs @@ -2,19 +2,14 @@ use std::net::SocketAddr; use arc_swap::ArcSwapOption; -use crate::{ - auth::backend::jwt::FetchAuthRulesError, - compute::ConnCfg, - context::RequestMonitoring, - control_plane::{ - messages::{ColdStartInfo, EndpointJwksResponse, MetricsAuxInfo}, - NodeInfo, - }, - intern::{BranchIdTag, EndpointIdTag, InternId, ProjectIdTag}, - EndpointId, -}; - use super::jwt::{AuthRule, FetchAuthRules}; +use crate::auth::backend::jwt::FetchAuthRulesError; +use crate::compute::ConnCfg; +use crate::context::RequestMonitoring; +use crate::control_plane::messages::{ColdStartInfo, EndpointJwksResponse, MetricsAuxInfo}; +use crate::control_plane::NodeInfo; +use crate::intern::{BranchIdTag, EndpointIdTag, InternId, ProjectIdTag}; +use crate::EndpointId; pub struct LocalBackend { 
pub(crate) node_info: NodeInfo, diff --git a/proxy/src/auth/backend/mod.rs b/proxy/src/auth/backend/mod.rs index 7cf158bcd9..a4db130b61 100644 --- a/proxy/src/auth/backend/mod.rs +++ b/proxy/src/auth/backend/mod.rs @@ -17,29 +17,22 @@ use tokio_postgres::config::AuthKeys; use tracing::{info, warn}; use crate::auth::credentials::check_peer_addr_is_in_list; -use crate::auth::{validate_password_and_exchange, AuthError}; +use crate::auth::{self, validate_password_and_exchange, AuthError, ComputeUserInfoMaybeEndpoint}; use crate::cache::Cached; +use crate::config::AuthenticationConfig; use crate::context::RequestMonitoring; use crate::control_plane::errors::GetAuthInfoError; -use crate::control_plane::provider::{CachedRoleSecret, ControlPlaneBackend}; -use crate::control_plane::AuthSecret; +use crate::control_plane::provider::{ + CachedAllowedIps, CachedNodeInfo, CachedRoleSecret, ControlPlaneBackend, +}; +use crate::control_plane::{self, Api, AuthSecret}; use crate::intern::EndpointIdInt; use crate::metrics::Metrics; use crate::proxy::connect_compute::ComputeConnectBackend; use crate::proxy::NeonOptions; use crate::rate_limiter::{BucketRateLimiter, EndpointRateLimiter, RateBucketInfo}; use crate::stream::Stream; -use crate::{ - auth::{self, ComputeUserInfoMaybeEndpoint}, - config::AuthenticationConfig, - control_plane::{ - self, - provider::{CachedAllowedIps, CachedNodeInfo}, - Api, - }, - stream, -}; -use crate::{scram, EndpointCacheKey, EndpointId, RoleName}; +use crate::{scram, stream, EndpointCacheKey, EndpointId, RoleName}; /// Alternative to [`std::borrow::Cow`] but doesn't need `T: ToOwned` as we don't need that functionality pub enum MaybeOwned<'a, T> { @@ -500,34 +493,32 @@ impl ComputeConnectBackend for Backend<'_, ComputeCredentials> { #[cfg(test)] mod tests { - use std::{net::IpAddr, sync::Arc, time::Duration}; + use std::net::IpAddr; + use std::sync::Arc; + use std::time::Duration; use bytes::BytesMut; use fallible_iterator::FallibleIterator; use 
once_cell::sync::Lazy; - use postgres_protocol::{ - authentication::sasl::{ChannelBinding, ScramSha256}, - message::{backend::Message as PgMessage, frontend}, - }; + use postgres_protocol::authentication::sasl::{ChannelBinding, ScramSha256}; + use postgres_protocol::message::backend::Message as PgMessage; + use postgres_protocol::message::frontend; use provider::AuthSecret; use tokio::io::{AsyncRead, AsyncReadExt, AsyncWriteExt}; - use crate::{ - auth::{backend::MaskedIp, ComputeUserInfoMaybeEndpoint, IpPattern}, - config::AuthenticationConfig, - context::RequestMonitoring, - control_plane::{ - self, - provider::{self, CachedAllowedIps, CachedRoleSecret}, - CachedNodeInfo, - }, - proxy::NeonOptions, - rate_limiter::{EndpointRateLimiter, RateBucketInfo}, - scram::{threadpool::ThreadPool, ServerSecret}, - stream::{PqStream, Stream}, - }; - - use super::{auth_quirks, jwt::JwkCache, AuthRateLimiter}; + use super::jwt::JwkCache; + use super::{auth_quirks, AuthRateLimiter}; + use crate::auth::backend::MaskedIp; + use crate::auth::{ComputeUserInfoMaybeEndpoint, IpPattern}; + use crate::config::AuthenticationConfig; + use crate::context::RequestMonitoring; + use crate::control_plane::provider::{self, CachedAllowedIps, CachedRoleSecret}; + use crate::control_plane::{self, CachedNodeInfo}; + use crate::proxy::NeonOptions; + use crate::rate_limiter::{EndpointRateLimiter, RateBucketInfo}; + use crate::scram::threadpool::ThreadPool; + use crate::scram::ServerSecret; + use crate::stream::{PqStream, Stream}; struct Auth { ips: Vec, diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs index cba8601d14..fa6bc4c6f5 100644 --- a/proxy/src/auth/credentials.rs +++ b/proxy/src/auth/credentials.rs @@ -1,20 +1,22 @@ //! User credentials used in authentication. 
-use crate::{ - auth::password_hack::parse_endpoint_param, - context::RequestMonitoring, - error::{ReportableError, UserFacingError}, - metrics::{Metrics, SniKind}, - proxy::NeonOptions, - serverless::SERVERLESS_DRIVER_SNI, - EndpointId, RoleName, -}; +use std::collections::HashSet; +use std::net::IpAddr; +use std::str::FromStr; + use itertools::Itertools; use pq_proto::StartupMessageParams; -use std::{collections::HashSet, net::IpAddr, str::FromStr}; use thiserror::Error; use tracing::{info, warn}; +use crate::auth::password_hack::parse_endpoint_param; +use crate::context::RequestMonitoring; +use crate::error::{ReportableError, UserFacingError}; +use crate::metrics::{Metrics, SniKind}; +use crate::proxy::NeonOptions; +use crate::serverless::SERVERLESS_DRIVER_SNI; +use crate::{EndpointId, RoleName}; + #[derive(Debug, Error, PartialEq, Eq, Clone)] pub(crate) enum ComputeUserInfoParseError { #[error("Parameter '{0}' is missing in startup packet.")] @@ -249,10 +251,11 @@ fn project_name_valid(name: &str) -> bool { #[cfg(test)] mod tests { - use super::*; use serde_json::json; use ComputeUserInfoParseError::*; + use super::*; + #[test] fn parse_bare_minimum() -> anyhow::Result<()> { // According to postgresql, only `user` should be required. diff --git a/proxy/src/auth/flow.rs b/proxy/src/auth/flow.rs index 9a5139dfb8..ccb17b66b9 100644 --- a/proxy/src/auth/flow.rs +++ b/proxy/src/auth/flow.rs @@ -1,21 +1,24 @@ //! Main authentication flow. 
-use super::{backend::ComputeCredentialKeys, AuthErrorImpl, PasswordHackPayload}; -use crate::{ - config::TlsServerEndPoint, - context::RequestMonitoring, - control_plane::AuthSecret, - intern::EndpointIdInt, - sasl, - scram::{self, threadpool::ThreadPool}, - stream::{PqStream, Stream}, -}; +use std::io; +use std::sync::Arc; + use postgres_protocol::authentication::sasl::{SCRAM_SHA_256, SCRAM_SHA_256_PLUS}; use pq_proto::{BeAuthenticationSaslMessage, BeMessage, BeMessage as Be}; -use std::{io, sync::Arc}; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::info; +use super::backend::ComputeCredentialKeys; +use super::{AuthErrorImpl, PasswordHackPayload}; +use crate::config::TlsServerEndPoint; +use crate::context::RequestMonitoring; +use crate::control_plane::AuthSecret; +use crate::intern::EndpointIdInt; +use crate::sasl; +use crate::scram::threadpool::ThreadPool; +use crate::scram::{self}; +use crate::stream::{PqStream, Stream}; + /// Every authentication selector is supposed to implement this trait. pub(crate) trait AuthMethod { /// Any authentication selector should provide initial backend message diff --git a/proxy/src/auth/mod.rs b/proxy/src/auth/mod.rs index 0c8686add2..ff97e6c35d 100644 --- a/proxy/src/auth/mod.rs +++ b/proxy/src/auth/mod.rs @@ -14,15 +14,15 @@ pub(crate) use password_hack::parse_endpoint_param; use password_hack::PasswordHackPayload; mod flow; +use std::io; +use std::net::IpAddr; + pub(crate) use flow::*; +use thiserror::Error; use tokio::time::error::Elapsed; -use crate::{ - control_plane, - error::{ReportableError, UserFacingError}, -}; -use std::{io, net::IpAddr}; -use thiserror::Error; +use crate::control_plane; +use crate::error::{ReportableError, UserFacingError}; /// Convenience wrapper for the authentication error. 
pub(crate) type Result = std::result::Result; diff --git a/proxy/src/bin/local_proxy.rs b/proxy/src/bin/local_proxy.rs index c92ebbc51f..e6bc369d9a 100644 --- a/proxy/src/bin/local_proxy.rs +++ b/proxy/src/bin/local_proxy.rs @@ -1,41 +1,43 @@ -use std::{net::SocketAddr, pin::pin, str::FromStr, sync::Arc, time::Duration}; +use std::net::SocketAddr; +use std::pin::pin; +use std::str::FromStr; +use std::sync::Arc; +use std::time::Duration; use anyhow::{bail, ensure, Context}; use camino::{Utf8Path, Utf8PathBuf}; use compute_api::spec::LocalProxySpec; use dashmap::DashMap; use futures::future::Either; -use proxy::{ - auth::{ - self, - backend::{ - jwt::JwkCache, - local::{LocalBackend, JWKS_ROLE_MAP}, - }, - }, - cancellation::CancellationHandlerMain, - config::{self, AuthenticationConfig, HttpConfig, ProxyConfig, RetryConfig}, - control_plane::{ - locks::ApiLocks, - messages::{EndpointJwksResponse, JwksSettings}, - }, - http::health_server::AppMetrics, - intern::RoleNameInt, - metrics::{Metrics, ThreadPoolMetrics}, - rate_limiter::{BucketRateLimiter, EndpointRateLimiter, LeakyBucketConfig, RateBucketInfo}, - scram::threadpool::ThreadPool, - serverless::{self, cancel_set::CancelSet, GlobalConnPoolOptions}, - RoleName, +use proxy::auth::backend::jwt::JwkCache; +use proxy::auth::backend::local::{LocalBackend, JWKS_ROLE_MAP}; +use proxy::auth::{self}; +use proxy::cancellation::CancellationHandlerMain; +use proxy::config::{self, AuthenticationConfig, HttpConfig, ProxyConfig, RetryConfig}; +use proxy::control_plane::locks::ApiLocks; +use proxy::control_plane::messages::{EndpointJwksResponse, JwksSettings}; +use proxy::http::health_server::AppMetrics; +use proxy::intern::RoleNameInt; +use proxy::metrics::{Metrics, ThreadPoolMetrics}; +use proxy::rate_limiter::{ + BucketRateLimiter, EndpointRateLimiter, LeakyBucketConfig, RateBucketInfo, }; +use proxy::scram::threadpool::ThreadPool; +use proxy::serverless::cancel_set::CancelSet; +use proxy::serverless::{self, 
GlobalConnPoolOptions}; +use proxy::RoleName; project_git_version!(GIT_VERSION); project_build_tag!(BUILD_TAG); use clap::Parser; -use tokio::{net::TcpListener, sync::Notify, task::JoinSet}; +use tokio::net::TcpListener; +use tokio::sync::Notify; +use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; use tracing::{error, info, warn}; -use utils::{pid_file, project_build_tag, project_git_version, sentry_init::init_sentry}; +use utils::sentry_init::init_sentry; +use utils::{pid_file, project_build_tag, project_git_version}; #[global_allocator] static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; diff --git a/proxy/src/bin/pg_sni_router.rs b/proxy/src/bin/pg_sni_router.rs index 53f1586abe..00eb830d98 100644 --- a/proxy/src/bin/pg_sni_router.rs +++ b/proxy/src/bin/pg_sni_router.rs @@ -5,25 +5,23 @@ /// the outside. Similar to an ingress controller for HTTPS. use std::{net::SocketAddr, sync::Arc}; +use anyhow::{anyhow, bail, ensure, Context}; +use clap::Arg; use futures::future::Either; +use futures::TryFutureExt; use itertools::Itertools; use proxy::config::TlsServerEndPoint; use proxy::context::RequestMonitoring; use proxy::metrics::{Metrics, ThreadPoolMetrics}; use proxy::proxy::{copy_bidirectional_client_compute, run_until_cancelled, ErrorSource}; -use rustls::pki_types::PrivateKeyDer; -use tokio::net::TcpListener; - -use anyhow::{anyhow, bail, ensure, Context}; -use clap::Arg; -use futures::TryFutureExt; use proxy::stream::{PqStream, Stream}; - +use rustls::pki_types::PrivateKeyDer; use tokio::io::{AsyncRead, AsyncWrite}; +use tokio::net::TcpListener; use tokio_util::sync::CancellationToken; -use utils::{project_git_version, sentry_init::init_sentry}; - use tracing::{error, info, Instrument}; +use utils::project_git_version; +use utils::sentry_init::init_sentry; project_git_version!(GIT_VERSION); diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index 3c0e66dec3..96a71e69c6 100644 --- a/proxy/src/bin/proxy.rs +++ 
b/proxy/src/bin/proxy.rs @@ -1,3 +1,8 @@ +use std::net::SocketAddr; +use std::pin::pin; +use std::sync::Arc; + +use anyhow::bail; use aws_config::environment::EnvironmentVariableCredentialsProvider; use aws_config::imds::credentials::ImdsCredentialsProvider; use aws_config::meta::credentials::CredentialsProviderChain; @@ -7,52 +12,34 @@ use aws_config::provider_config::ProviderConfig; use aws_config::web_identity_token::WebIdentityTokenCredentialsProvider; use aws_config::Region; use futures::future::Either; -use proxy::auth; use proxy::auth::backend::jwt::JwkCache; -use proxy::auth::backend::AuthRateLimiter; -use proxy::auth::backend::ConsoleRedirectBackend; -use proxy::auth::backend::MaybeOwned; -use proxy::cancellation::CancelMap; -use proxy::cancellation::CancellationHandler; -use proxy::config::remote_storage_from_toml; -use proxy::config::AuthenticationConfig; -use proxy::config::CacheOptions; -use proxy::config::HttpConfig; -use proxy::config::ProjectInfoCacheOptions; -use proxy::config::ProxyProtocolV2; +use proxy::auth::backend::{AuthRateLimiter, ConsoleRedirectBackend, MaybeOwned}; +use proxy::cancellation::{CancelMap, CancellationHandler}; +use proxy::config::{ + self, remote_storage_from_toml, AuthenticationConfig, CacheOptions, HttpConfig, + ProjectInfoCacheOptions, ProxyConfig, ProxyProtocolV2, +}; use proxy::context::parquet::ParquetUploadArgs; -use proxy::control_plane; -use proxy::http; use proxy::http::health_server::AppMetrics; use proxy::metrics::Metrics; -use proxy::rate_limiter::EndpointRateLimiter; -use proxy::rate_limiter::LeakyBucketConfig; -use proxy::rate_limiter::RateBucketInfo; -use proxy::rate_limiter::WakeComputeRateLimiter; +use proxy::rate_limiter::{ + EndpointRateLimiter, LeakyBucketConfig, RateBucketInfo, WakeComputeRateLimiter, +}; use proxy::redis::cancellation_publisher::RedisPublisherClient; use proxy::redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider; -use proxy::redis::elasticache; -use 
proxy::redis::notifications; +use proxy::redis::{elasticache, notifications}; use proxy::scram::threadpool::ThreadPool; use proxy::serverless::cancel_set::CancelSet; use proxy::serverless::GlobalConnPoolOptions; -use proxy::usage_metrics; - -use anyhow::bail; -use proxy::config::{self, ProxyConfig}; -use proxy::serverless; +use proxy::{auth, control_plane, http, serverless, usage_metrics}; use remote_storage::RemoteStorageConfig; -use std::net::SocketAddr; -use std::pin::pin; -use std::sync::Arc; use tokio::net::TcpListener; use tokio::sync::Mutex; use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; -use tracing::info; -use tracing::warn; -use tracing::Instrument; -use utils::{project_build_tag, project_git_version, sentry_init::init_sentry}; +use tracing::{info, warn, Instrument}; +use utils::sentry_init::init_sentry; +use utils::{project_build_tag, project_git_version}; project_git_version!(GIT_VERSION); project_build_tag!(BUILD_TAG); diff --git a/proxy/src/cache/endpoints.rs b/proxy/src/cache/endpoints.rs index 27121ce89e..82f3247fa7 100644 --- a/proxy/src/cache/endpoints.rs +++ b/proxy/src/cache/endpoints.rs @@ -1,31 +1,23 @@ -use std::{ - convert::Infallible, - sync::{ - atomic::{AtomicBool, Ordering}, - Arc, - }, - time::Duration, -}; +use std::convert::Infallible; +use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::Arc; +use std::time::Duration; use dashmap::DashSet; -use redis::{ - streams::{StreamReadOptions, StreamReadReply}, - AsyncCommands, FromRedisValue, Value, -}; +use redis::streams::{StreamReadOptions, StreamReadReply}; +use redis::{AsyncCommands, FromRedisValue, Value}; use serde::Deserialize; use tokio::sync::Mutex; use tokio_util::sync::CancellationToken; use tracing::info; -use crate::{ - config::EndpointCacheConfig, - context::RequestMonitoring, - intern::{BranchIdInt, EndpointIdInt, ProjectIdInt}, - metrics::{Metrics, RedisErrors, RedisEventsCount}, - rate_limiter::GlobalRateLimiter, - 
redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider, - EndpointId, -}; +use crate::config::EndpointCacheConfig; +use crate::context::RequestMonitoring; +use crate::intern::{BranchIdInt, EndpointIdInt, ProjectIdInt}; +use crate::metrics::{Metrics, RedisErrors, RedisEventsCount}; +use crate::rate_limiter::GlobalRateLimiter; +use crate::redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider; +use crate::EndpointId; #[derive(Deserialize, Debug, Clone)] pub(crate) struct ControlPlaneEventKey { diff --git a/proxy/src/cache/project_info.rs b/proxy/src/cache/project_info.rs index b92cedb043..31d1dc96e7 100644 --- a/proxy/src/cache/project_info.rs +++ b/proxy/src/cache/project_info.rs @@ -1,9 +1,8 @@ -use std::{ - collections::HashSet, - convert::Infallible, - sync::{atomic::AtomicU64, Arc}, - time::Duration, -}; +use std::collections::HashSet; +use std::convert::Infallible; +use std::sync::atomic::AtomicU64; +use std::sync::Arc; +use std::time::Duration; use async_trait::async_trait; use dashmap::DashMap; @@ -13,15 +12,12 @@ use tokio::sync::Mutex; use tokio::time::Instant; use tracing::{debug, info}; -use crate::{ - auth::IpPattern, - config::ProjectInfoCacheOptions, - control_plane::AuthSecret, - intern::{EndpointIdInt, ProjectIdInt, RoleNameInt}, - EndpointId, RoleName, -}; - use super::{Cache, Cached}; +use crate::auth::IpPattern; +use crate::config::ProjectInfoCacheOptions; +use crate::control_plane::AuthSecret; +use crate::intern::{EndpointIdInt, ProjectIdInt, RoleNameInt}; +use crate::{EndpointId, RoleName}; #[async_trait] pub(crate) trait ProjectInfoCache { @@ -371,7 +367,8 @@ impl Cache for ProjectInfoCacheImpl { #[cfg(test)] mod tests { use super::*; - use crate::{scram::ServerSecret, ProjectId}; + use crate::scram::ServerSecret; + use crate::ProjectId; #[tokio::test] async fn test_project_info_cache_settings() { diff --git a/proxy/src/cache/timed_lru.rs b/proxy/src/cache/timed_lru.rs index 5b08d74696..06eaeb9a30 
100644 --- a/proxy/src/cache/timed_lru.rs +++ b/proxy/src/cache/timed_lru.rs @@ -1,9 +1,6 @@ -use std::{ - borrow::Borrow, - hash::Hash, - time::{Duration, Instant}, -}; -use tracing::debug; +use std::borrow::Borrow; +use std::hash::Hash; +use std::time::{Duration, Instant}; // This seems to make more sense than `lru` or `cached`: // @@ -15,8 +12,10 @@ use tracing::debug; // // On the other hand, `hashlink` has good download stats and appears to be maintained. use hashlink::{linked_hash_map::RawEntryMut, LruCache}; +use tracing::debug; -use super::{common::Cached, timed_lru, Cache}; +use super::common::Cached; +use super::{timed_lru, Cache}; /// An implementation of timed LRU cache with fixed capacity. /// Key properties: diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index 71a2a16af8..db0970adcb 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -1,6 +1,8 @@ +use std::net::SocketAddr; +use std::sync::Arc; + use dashmap::DashMap; use pq_proto::CancelKeyData; -use std::{net::SocketAddr, sync::Arc}; use thiserror::Error; use tokio::net::TcpStream; use tokio::sync::Mutex; @@ -8,12 +10,10 @@ use tokio_postgres::{CancelToken, NoTls}; use tracing::info; use uuid::Uuid; -use crate::{ - error::ReportableError, - metrics::{CancellationRequest, CancellationSource, Metrics}, - redis::cancellation_publisher::{ - CancellationPublisher, CancellationPublisherMut, RedisPublisherClient, - }, +use crate::error::ReportableError; +use crate::metrics::{CancellationRequest, CancellationSource, Metrics}; +use crate::redis::cancellation_publisher::{ + CancellationPublisher, CancellationPublisherMut, RedisPublisherClient, }; pub type CancelMap = Arc>>; diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index 006804fcd4..212e82497f 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -1,25 +1,31 @@ -use crate::{ - auth::parse_endpoint_param, - cancellation::CancelClosure, - context::RequestMonitoring, - 
control_plane::{errors::WakeComputeError, messages::MetricsAuxInfo, provider::ApiLockError}, - error::{ReportableError, UserFacingError}, - metrics::{Metrics, NumDbConnectionsGuard}, - proxy::neon_option, - Host, -}; +use std::io; +use std::net::SocketAddr; +use std::sync::Arc; +use std::time::Duration; + use futures::{FutureExt, TryFutureExt}; use itertools::Itertools; use once_cell::sync::OnceCell; use pq_proto::StartupMessageParams; -use rustls::{client::danger::ServerCertVerifier, pki_types::InvalidDnsNameError}; -use std::{io, net::SocketAddr, sync::Arc, time::Duration}; +use rustls::client::danger::ServerCertVerifier; +use rustls::pki_types::InvalidDnsNameError; use thiserror::Error; use tokio::net::TcpStream; use tokio_postgres::tls::MakeTlsConnect; use tokio_postgres_rustls::MakeRustlsConnect; use tracing::{error, info, warn}; +use crate::auth::parse_endpoint_param; +use crate::cancellation::CancelClosure; +use crate::context::RequestMonitoring; +use crate::control_plane::errors::WakeComputeError; +use crate::control_plane::messages::MetricsAuxInfo; +use crate::control_plane::provider::ApiLockError; +use crate::error::{ReportableError, UserFacingError}; +use crate::metrics::{Metrics, NumDbConnectionsGuard}; +use crate::proxy::neon_option; +use crate::Host; + pub const COULD_NOT_CONNECT: &str = "Couldn't connect to compute node"; #[derive(Debug, Error)] diff --git a/proxy/src/config.rs b/proxy/src/config.rs index c068fc50fb..2ec8c7adda 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -1,29 +1,27 @@ -use crate::{ - auth::backend::{jwt::JwkCache, AuthRateLimiter}, - control_plane::locks::ApiLocks, - rate_limiter::{RateBucketInfo, RateLimitAlgorithm, RateLimiterConfig}, - scram::threadpool::ThreadPool, - serverless::{cancel_set::CancelSet, GlobalConnPoolOptions}, - Host, -}; +use std::collections::{HashMap, HashSet}; +use std::str::FromStr; +use std::sync::Arc; +use std::time::Duration; + use anyhow::{bail, ensure, Context, Ok}; use 
clap::ValueEnum; use itertools::Itertools; use remote_storage::RemoteStorageConfig; -use rustls::{ - crypto::ring::sign, - pki_types::{CertificateDer, PrivateKeyDer}, -}; +use rustls::crypto::ring::sign; +use rustls::pki_types::{CertificateDer, PrivateKeyDer}; use sha2::{Digest, Sha256}; -use std::{ - collections::{HashMap, HashSet}, - str::FromStr, - sync::Arc, - time::Duration, -}; use tracing::{error, info}; use x509_parser::oid_registry; +use crate::auth::backend::jwt::JwkCache; +use crate::auth::backend::AuthRateLimiter; +use crate::control_plane::locks::ApiLocks; +use crate::rate_limiter::{RateBucketInfo, RateLimitAlgorithm, RateLimiterConfig}; +use crate::scram::threadpool::ThreadPool; +use crate::serverless::cancel_set::CancelSet; +use crate::serverless::GlobalConnPoolOptions; +use crate::Host; + pub struct ProxyConfig { pub tls_config: Option, pub metric_collection: Option, @@ -692,9 +690,8 @@ impl FromStr for ConcurrencyLockOptions { #[cfg(test)] mod tests { - use crate::rate_limiter::Aimd; - use super::*; + use crate::rate_limiter::Aimd; #[test] fn test_parse_cache_options() -> anyhow::Result<()> { diff --git a/proxy/src/console_redirect_proxy.rs b/proxy/src/console_redirect_proxy.rs index 9e17976720..81d1d70958 100644 --- a/proxy/src/console_redirect_proxy.rs +++ b/proxy/src/console_redirect_proxy.rs @@ -1,25 +1,22 @@ -use crate::auth::backend::ConsoleRedirectBackend; -use crate::config::{ProxyConfig, ProxyProtocolV2}; -use crate::proxy::{ - prepare_client_connection, run_until_cancelled, ClientRequestError, ErrorSource, -}; -use crate::{ - cancellation::{CancellationHandlerMain, CancellationHandlerMainInternal}, - context::RequestMonitoring, - error::ReportableError, - metrics::{Metrics, NumClientConnectionsGuard}, - protocol2::read_proxy_protocol, - proxy::handshake::{handshake, HandshakeData}, -}; -use futures::TryFutureExt; use std::sync::Arc; + +use futures::TryFutureExt; use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt}; use 
tokio_util::sync::CancellationToken; use tracing::{error, info, Instrument}; +use crate::auth::backend::ConsoleRedirectBackend; +use crate::cancellation::{CancellationHandlerMain, CancellationHandlerMainInternal}; +use crate::config::{ProxyConfig, ProxyProtocolV2}; +use crate::context::RequestMonitoring; +use crate::error::ReportableError; +use crate::metrics::{Metrics, NumClientConnectionsGuard}; +use crate::protocol2::read_proxy_protocol; +use crate::proxy::connect_compute::{connect_to_compute, TcpMechanism}; +use crate::proxy::handshake::{handshake, HandshakeData}; +use crate::proxy::passthrough::ProxyPassthrough; use crate::proxy::{ - connect_compute::{connect_to_compute, TcpMechanism}, - passthrough::ProxyPassthrough, + prepare_client_connection, run_until_cancelled, ClientRequestError, ErrorSource, }; pub async fn task_main( diff --git a/proxy/src/context/mod.rs b/proxy/src/context/mod.rs index 7fb4e7c698..e2d2c1b766 100644 --- a/proxy/src/context/mod.rs +++ b/proxy/src/context/mod.rs @@ -1,24 +1,25 @@ //! 
Connection request monitoring contexts +use std::net::IpAddr; + use chrono::Utc; use once_cell::sync::OnceCell; use pq_proto::StartupMessageParams; use smol_str::SmolStr; -use std::net::IpAddr; use tokio::sync::mpsc; -use tracing::{debug, field::display, info, info_span, Span}; +use tracing::field::display; +use tracing::{debug, info, info_span, Span}; use try_lock::TryLock; use uuid::Uuid; -use crate::{ - control_plane::messages::{ColdStartInfo, MetricsAuxInfo}, - error::ErrorKind, - intern::{BranchIdInt, ProjectIdInt}, - metrics::{ConnectOutcome, InvalidEndpointsGroup, LatencyTimer, Metrics, Protocol, Waiting}, - DbName, EndpointId, RoleName, -}; - use self::parquet::RequestData; +use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo}; +use crate::error::ErrorKind; +use crate::intern::{BranchIdInt, ProjectIdInt}; +use crate::metrics::{ + ConnectOutcome, InvalidEndpointsGroup, LatencyTimer, Metrics, Protocol, Waiting, +}; +use crate::{DbName, EndpointId, RoleName}; pub mod parquet; diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index 9f6f83022e..b0ad0e4566 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -1,29 +1,28 @@ -use std::{sync::Arc, time::SystemTime}; +use std::sync::Arc; +use std::time::SystemTime; use anyhow::Context; -use bytes::{buf::Writer, BufMut, BytesMut}; +use bytes::buf::Writer; +use bytes::{BufMut, BytesMut}; use chrono::{Datelike, Timelike}; use futures::{Stream, StreamExt}; -use parquet::{ - basic::Compression, - file::{ - metadata::RowGroupMetaDataPtr, - properties::{WriterProperties, WriterPropertiesPtr, DEFAULT_PAGE_SIZE}, - writer::SerializedFileWriter, - }, - record::RecordWriter, -}; +use parquet::basic::Compression; +use parquet::file::metadata::RowGroupMetaDataPtr; +use parquet::file::properties::{WriterProperties, WriterPropertiesPtr, DEFAULT_PAGE_SIZE}; +use parquet::file::writer::SerializedFileWriter; +use parquet::record::RecordWriter; use 
pq_proto::StartupMessageParams; use remote_storage::{GenericRemoteStorage, RemotePath, RemoteStorageConfig, TimeoutOrCancel}; use serde::ser::SerializeMap; -use tokio::{sync::mpsc, time}; +use tokio::sync::mpsc; +use tokio::time; use tokio_util::sync::CancellationToken; use tracing::{debug, info, Span}; use utils::backoff; -use crate::{config::remote_storage_from_toml, context::LOG_CHAN_DISCONNECT}; - use super::{RequestMonitoringInner, LOG_CHAN}; +use crate::config::remote_storage_from_toml; +use crate::context::LOG_CHAN_DISCONNECT; #[derive(clap::Args, Clone, Debug)] pub struct ParquetUploadArgs { @@ -407,26 +406,26 @@ async fn upload_parquet( #[cfg(test)] mod tests { - use std::{net::Ipv4Addr, num::NonZeroUsize, sync::Arc}; + use std::net::Ipv4Addr; + use std::num::NonZeroUsize; + use std::sync::Arc; use camino::Utf8Path; use clap::Parser; use futures::{Stream, StreamExt}; use itertools::Itertools; - use parquet::{ - basic::{Compression, ZstdLevel}, - file::{ - properties::{WriterProperties, DEFAULT_PAGE_SIZE}, - reader::FileReader, - serialized_reader::SerializedFileReader, - }, - }; - use rand::{rngs::StdRng, Rng, SeedableRng}; + use parquet::basic::{Compression, ZstdLevel}; + use parquet::file::properties::{WriterProperties, DEFAULT_PAGE_SIZE}; + use parquet::file::reader::FileReader; + use parquet::file::serialized_reader::SerializedFileReader; + use rand::rngs::StdRng; + use rand::{Rng, SeedableRng}; use remote_storage::{ GenericRemoteStorage, RemoteStorageConfig, RemoteStorageKind, S3Config, DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT, }; - use tokio::{sync::mpsc, time}; + use tokio::sync::mpsc; + use tokio::time; use walkdir::WalkDir; use super::{worker_inner, ParquetConfig, ParquetUploadArgs, RequestData}; diff --git a/proxy/src/control_plane/messages.rs b/proxy/src/control_plane/messages.rs index 960bb5bc21..dae23f7c53 100644 --- a/proxy/src/control_plane/messages.rs +++ b/proxy/src/control_plane/messages.rs @@ -1,9 
+1,9 @@ -use measured::FixedCardinalityLabel; -use serde::{Deserialize, Serialize}; use std::fmt::{self, Display}; -use crate::auth::IpPattern; +use measured::FixedCardinalityLabel; +use serde::{Deserialize, Serialize}; +use crate::auth::IpPattern; use crate::intern::{BranchIdInt, EndpointIdInt, ProjectIdInt, RoleNameInt}; use crate::proxy::retry::CouldRetry; @@ -362,9 +362,10 @@ pub struct JwksSettings { #[cfg(test)] mod tests { - use super::*; use serde_json::json; + use super::*; + fn dummy_aux() -> serde_json::Value { json!({ "endpoint_id": "endpoint", diff --git a/proxy/src/control_plane/mgmt.rs b/proxy/src/control_plane/mgmt.rs index 2c4b5a9b94..5ac3acd28a 100644 --- a/proxy/src/control_plane/mgmt.rs +++ b/proxy/src/control_plane/mgmt.rs @@ -1,16 +1,16 @@ -use crate::{ - control_plane::messages::{DatabaseInfo, KickSession}, - waiters::{self, Waiter, Waiters}, -}; +use std::convert::Infallible; + use anyhow::Context; use once_cell::sync::Lazy; use postgres_backend::{AuthType, PostgresBackend, PostgresBackendTCP, QueryError}; use pq_proto::{BeMessage, SINGLE_COL_ROWDESC}; -use std::convert::Infallible; use tokio::net::{TcpListener, TcpStream}; use tokio_util::sync::CancellationToken; use tracing::{error, info, info_span, Instrument}; +use crate::control_plane::messages::{DatabaseInfo, KickSession}; +use crate::waiters::{self, Waiter, Waiters}; + static CPLANE_WAITERS: Lazy> = Lazy::new(Default::default); /// Give caller an opportunity to wait for the cloud's reply. diff --git a/proxy/src/control_plane/provider/mock.rs b/proxy/src/control_plane/provider/mock.rs index 51cddec672..fb061376e7 100644 --- a/proxy/src/control_plane/provider/mock.rs +++ b/proxy/src/control_plane/provider/mock.rs @@ -1,28 +1,29 @@ //! Mock console backend which relies on a user-provided postgres instance. 
-use super::{ - errors::{ApiError, GetAuthInfoError, WakeComputeError}, - AuthInfo, AuthSecret, CachedNodeInfo, NodeInfo, -}; -use crate::{ - auth::backend::jwt::AuthRule, context::RequestMonitoring, - control_plane::errors::GetEndpointJwksError, intern::RoleNameInt, RoleName, -}; -use crate::{auth::backend::ComputeUserInfo, compute, error::io_error, scram, url::ApiUrl}; -use crate::{auth::IpPattern, cache::Cached}; -use crate::{ - control_plane::{ - messages::MetricsAuxInfo, - provider::{CachedAllowedIps, CachedRoleSecret}, - }, - BranchId, EndpointId, ProjectId, -}; +use std::str::FromStr; +use std::sync::Arc; + use futures::TryFutureExt; -use std::{str::FromStr, sync::Arc}; use thiserror::Error; -use tokio_postgres::{config::SslMode, Client}; +use tokio_postgres::config::SslMode; +use tokio_postgres::Client; use tracing::{error, info, info_span, warn, Instrument}; +use super::errors::{ApiError, GetAuthInfoError, WakeComputeError}; +use super::{AuthInfo, AuthSecret, CachedNodeInfo, NodeInfo}; +use crate::auth::backend::jwt::AuthRule; +use crate::auth::backend::ComputeUserInfo; +use crate::auth::IpPattern; +use crate::cache::Cached; +use crate::context::RequestMonitoring; +use crate::control_plane::errors::GetEndpointJwksError; +use crate::control_plane::messages::MetricsAuxInfo; +use crate::control_plane::provider::{CachedAllowedIps, CachedRoleSecret}; +use crate::error::io_error; +use crate::intern::RoleNameInt; +use crate::url::ApiUrl; +use crate::{compute, scram, BranchId, EndpointId, ProjectId, RoleName}; + #[derive(Debug, Error)] enum MockApiError { #[error("Failed to read password: {0}")] diff --git a/proxy/src/control_plane/provider/mod.rs b/proxy/src/control_plane/provider/mod.rs index 0a196fe2a3..a4a330cd5f 100644 --- a/proxy/src/control_plane/provider/mod.rs +++ b/proxy/src/control_plane/provider/mod.rs @@ -2,39 +2,36 @@ pub mod mock; pub mod neon; -use super::messages::{ControlPlaneError, MetricsAuxInfo}; -use crate::{ - auth::{ - backend::{ - 
jwt::{AuthRule, FetchAuthRules, FetchAuthRulesError}, - ComputeCredentialKeys, ComputeUserInfo, - }, - IpPattern, - }, - cache::{endpoints::EndpointsCache, project_info::ProjectInfoCacheImpl, Cached, TimedLru}, - compute, - config::{CacheOptions, EndpointCacheConfig, ProjectInfoCacheOptions}, - context::RequestMonitoring, - error::ReportableError, - intern::ProjectIdInt, - metrics::ApiLockMetrics, - rate_limiter::{DynamicLimiter, Outcome, RateLimiterConfig, Token}, - scram, EndpointCacheKey, EndpointId, -}; +use std::hash::Hash; +use std::sync::Arc; +use std::time::Duration; + use dashmap::DashMap; -use std::{hash::Hash, sync::Arc, time::Duration}; use tokio::time::Instant; use tracing::info; +use super::messages::{ControlPlaneError, MetricsAuxInfo}; +use crate::auth::backend::jwt::{AuthRule, FetchAuthRules, FetchAuthRulesError}; +use crate::auth::backend::{ComputeCredentialKeys, ComputeUserInfo}; +use crate::auth::IpPattern; +use crate::cache::endpoints::EndpointsCache; +use crate::cache::project_info::ProjectInfoCacheImpl; +use crate::cache::{Cached, TimedLru}; +use crate::config::{CacheOptions, EndpointCacheConfig, ProjectInfoCacheOptions}; +use crate::context::RequestMonitoring; +use crate::error::ReportableError; +use crate::intern::ProjectIdInt; +use crate::metrics::ApiLockMetrics; +use crate::rate_limiter::{DynamicLimiter, Outcome, RateLimiterConfig, Token}; +use crate::{compute, scram, EndpointCacheKey, EndpointId}; + pub(crate) mod errors { - use crate::{ - control_plane::messages::{self, ControlPlaneError, Reason}, - error::{io_error, ErrorKind, ReportableError, UserFacingError}, - proxy::retry::CouldRetry, - }; use thiserror::Error; use super::ApiLockError; + use crate::control_plane::messages::{self, ControlPlaneError, Reason}; + use crate::error::{io_error, ErrorKind, ReportableError, UserFacingError}; + use crate::proxy::retry::CouldRetry; /// A go-to error message which doesn't leak any detail. 
pub(crate) const REQUEST_FAILED: &str = "Console request failed"; diff --git a/proxy/src/control_plane/provider/neon.rs b/proxy/src/control_plane/provider/neon.rs index 2487ce0e3f..5d0692c7ca 100644 --- a/proxy/src/control_plane/provider/neon.rs +++ b/proxy/src/control_plane/provider/neon.rs @@ -1,31 +1,31 @@ //! Production console backend. -use super::{ - super::messages::{ControlPlaneError, GetRoleSecret, WakeCompute}, - errors::{ApiError, GetAuthInfoError, WakeComputeError}, - ApiCaches, ApiLocks, AuthInfo, AuthSecret, CachedAllowedIps, CachedNodeInfo, CachedRoleSecret, - NodeInfo, -}; -use crate::{ - auth::backend::{jwt::AuthRule, ComputeUserInfo}, - compute, - control_plane::{ - errors::GetEndpointJwksError, - messages::{ColdStartInfo, EndpointJwksResponse, Reason}, - }, - http, - metrics::{CacheOutcome, Metrics}, - rate_limiter::WakeComputeRateLimiter, - scram, EndpointCacheKey, EndpointId, -}; -use crate::{cache::Cached, context::RequestMonitoring}; -use ::http::{header::AUTHORIZATION, HeaderName}; +use std::sync::Arc; +use std::time::Duration; + +use ::http::header::AUTHORIZATION; +use ::http::HeaderName; use futures::TryFutureExt; -use std::{sync::Arc, time::Duration}; use tokio::time::Instant; use tokio_postgres::config::SslMode; use tracing::{debug, info, info_span, warn, Instrument}; +use super::super::messages::{ControlPlaneError, GetRoleSecret, WakeCompute}; +use super::errors::{ApiError, GetAuthInfoError, WakeComputeError}; +use super::{ + ApiCaches, ApiLocks, AuthInfo, AuthSecret, CachedAllowedIps, CachedNodeInfo, CachedRoleSecret, + NodeInfo, +}; +use crate::auth::backend::jwt::AuthRule; +use crate::auth::backend::ComputeUserInfo; +use crate::cache::Cached; +use crate::context::RequestMonitoring; +use crate::control_plane::errors::GetEndpointJwksError; +use crate::control_plane::messages::{ColdStartInfo, EndpointJwksResponse, Reason}; +use crate::metrics::{CacheOutcome, Metrics}; +use crate::rate_limiter::WakeComputeRateLimiter; +use 
crate::{compute, http, scram, EndpointCacheKey, EndpointId}; + const X_REQUEST_ID: HeaderName = HeaderName::from_static("x-request-id"); #[derive(Clone)] diff --git a/proxy/src/error.rs b/proxy/src/error.rs index 1cd4dc2c22..e71ed0c048 100644 --- a/proxy/src/error.rs +++ b/proxy/src/error.rs @@ -1,4 +1,5 @@ -use std::{error::Error as StdError, fmt, io}; +use std::error::Error as StdError; +use std::{fmt, io}; use measured::FixedCardinalityLabel; diff --git a/proxy/src/http/health_server.rs b/proxy/src/http/health_server.rs index d0352351d5..978ad9f761 100644 --- a/proxy/src/http/health_server.rs +++ b/proxy/src/http/health_server.rs @@ -1,19 +1,18 @@ +use std::convert::Infallible; +use std::net::TcpListener; +use std::sync::{Arc, Mutex}; + use anyhow::{anyhow, bail}; -use hyper0::{header::CONTENT_TYPE, Body, Request, Response, StatusCode}; -use measured::{text::BufferedTextEncoder, MetricGroup}; +use hyper0::header::CONTENT_TYPE; +use hyper0::{Body, Request, Response, StatusCode}; +use measured::text::BufferedTextEncoder; +use measured::MetricGroup; use metrics::NeonMetrics; -use std::{ - convert::Infallible, - net::TcpListener, - sync::{Arc, Mutex}, -}; use tracing::{info, info_span}; -use utils::http::{ - endpoint::{self, request_span}, - error::ApiError, - json::json_response, - RouterBuilder, RouterService, -}; +use utils::http::endpoint::{self, request_span}; +use utils::http::error::ApiError; +use utils::http::json::json_response; +use utils::http::{RouterBuilder, RouterService}; use crate::jemalloc; diff --git a/proxy/src/http/mod.rs b/proxy/src/http/mod.rs index d8676d5b50..fd587e8f01 100644 --- a/proxy/src/http/mod.rs +++ b/proxy/src/http/mod.rs @@ -10,17 +10,15 @@ use anyhow::bail; use bytes::Bytes; use http_body_util::BodyExt; use hyper::body::Body; +pub(crate) use reqwest::{Request, Response}; +use reqwest_middleware::RequestBuilder; +pub(crate) use reqwest_middleware::{ClientWithMiddleware, Error}; +pub(crate) use 
reqwest_retry::policies::ExponentialBackoff; +pub(crate) use reqwest_retry::RetryTransientMiddleware; use serde::de::DeserializeOwned; -pub(crate) use reqwest::{Request, Response}; -pub(crate) use reqwest_middleware::{ClientWithMiddleware, Error}; -pub(crate) use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware}; - -use crate::{ - metrics::{ConsoleRequest, Metrics}, - url::ApiUrl, -}; -use reqwest_middleware::RequestBuilder; +use crate::metrics::{ConsoleRequest, Metrics}; +use crate::url::ApiUrl; /// This is the preferred way to create new http clients, /// because it takes care of observability (OpenTelemetry). @@ -142,9 +140,10 @@ pub(crate) async fn parse_json_body_with_limit( #[cfg(test)] mod tests { - use super::*; use reqwest::Client; + use super::*; + #[test] fn optional_query_params() -> anyhow::Result<()> { let url = "http://example.com".parse()?; diff --git a/proxy/src/intern.rs b/proxy/src/intern.rs index 108420d7d7..09fd9657d0 100644 --- a/proxy/src/intern.rs +++ b/proxy/src/intern.rs @@ -1,6 +1,8 @@ -use std::{ - hash::BuildHasherDefault, marker::PhantomData, num::NonZeroUsize, ops::Index, sync::OnceLock, -}; +use std::hash::BuildHasherDefault; +use std::marker::PhantomData; +use std::num::NonZeroUsize; +use std::ops::Index; +use std::sync::OnceLock; use lasso::{Capacity, MemoryLimits, Spur, ThreadedRodeo}; use rustc_hash::FxHasher; @@ -208,9 +210,8 @@ impl From for ProjectIdInt { mod tests { use std::sync::OnceLock; - use crate::intern::StringInterner; - use super::InternId; + use crate::intern::StringInterner; struct MyId; impl InternId for MyId { @@ -222,7 +223,8 @@ mod tests { #[test] fn push_many_strings() { - use rand::{rngs::StdRng, Rng, SeedableRng}; + use rand::rngs::StdRng; + use rand::{Rng, SeedableRng}; use rand_distr::Zipf; let endpoint_dist = Zipf::new(500000, 0.8).unwrap(); diff --git a/proxy/src/jemalloc.rs b/proxy/src/jemalloc.rs index d307d80f4a..0fae78b60c 100644 --- a/proxy/src/jemalloc.rs +++ 
b/proxy/src/jemalloc.rs @@ -1,14 +1,12 @@ use std::marker::PhantomData; -use measured::{ - label::NoLabels, - metric::{ - gauge::GaugeState, group::Encoding, name::MetricNameEncoder, MetricEncoding, - MetricFamilyEncoding, MetricType, - }, - text::TextEncoder, - LabelGroup, MetricGroup, -}; +use measured::label::NoLabels; +use measured::metric::gauge::GaugeState; +use measured::metric::group::Encoding; +use measured::metric::name::MetricNameEncoder; +use measured::metric::{MetricEncoding, MetricFamilyEncoding, MetricType}; +use measured::text::TextEncoder; +use measured::{LabelGroup, MetricGroup}; use tikv_jemalloc_ctl::{config, epoch, epoch_mib, stats, version}; pub struct MetricRecorder { diff --git a/proxy/src/logging.rs b/proxy/src/logging.rs index a34eb820f8..11921867e4 100644 --- a/proxy/src/logging.rs +++ b/proxy/src/logging.rs @@ -1,14 +1,10 @@ use tracing::Subscriber; -use tracing_subscriber::{ - filter::{EnvFilter, LevelFilter}, - fmt::{ - format::{Format, Full}, - time::SystemTime, - FormatEvent, FormatFields, - }, - prelude::*, - registry::LookupSpan, -}; +use tracing_subscriber::filter::{EnvFilter, LevelFilter}; +use tracing_subscriber::fmt::format::{Format, Full}; +use tracing_subscriber::fmt::time::SystemTime; +use tracing_subscriber::fmt::{FormatEvent, FormatFields}; +use tracing_subscriber::prelude::*; +use tracing_subscriber::registry::LookupSpan; /// Initialize logging and OpenTelemetry tracing and exporter. 
/// diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index 272723a1bc..542826e833 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -1,14 +1,16 @@ use std::sync::{Arc, OnceLock}; use lasso::ThreadedRodeo; +use measured::label::{ + FixedCardinalitySet, LabelGroupSet, LabelName, LabelSet, LabelValue, StaticLabelSet, +}; +use measured::metric::histogram::Thresholds; +use measured::metric::name::MetricName; use measured::{ - label::{FixedCardinalitySet, LabelGroupSet, LabelName, LabelSet, LabelValue, StaticLabelSet}, - metric::{histogram::Thresholds, name::MetricName}, Counter, CounterVec, FixedCardinalityLabel, Gauge, Histogram, HistogramVec, LabelGroup, MetricGroup, }; use metrics::{CounterPairAssoc, CounterPairVec, HyperLogLog, HyperLogLogVec}; - use tokio::time::{self, Instant}; use crate::control_plane::messages::ColdStartInfo; diff --git a/proxy/src/protocol2.rs b/proxy/src/protocol2.rs index 17764f78d1..ef2391cdd8 100644 --- a/proxy/src/protocol2.rs +++ b/proxy/src/protocol2.rs @@ -1,11 +1,9 @@ //! 
Proxy Protocol V2 implementation -use std::{ - io, - net::SocketAddr, - pin::Pin, - task::{Context, Poll}, -}; +use std::io; +use std::net::SocketAddr; +use std::pin::Pin; +use std::task::{Context, Poll}; use bytes::BytesMut; use pin_project_lite::pin_project; diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs index aac7720890..8e9663626a 100644 --- a/proxy/src/proxy/connect_compute.rs +++ b/proxy/src/proxy/connect_compute.rs @@ -1,24 +1,23 @@ -use crate::{ - auth::backend::ComputeCredentialKeys, - compute::COULD_NOT_CONNECT, - compute::{self, PostgresConnection}, - config::RetryConfig, - context::RequestMonitoring, - control_plane::{self, errors::WakeComputeError, locks::ApiLocks, CachedNodeInfo, NodeInfo}, - error::ReportableError, - metrics::{ConnectOutcome, ConnectionFailureKind, Metrics, RetriesMetricGroup, RetryType}, - proxy::{ - retry::{retry_after, should_retry, CouldRetry}, - wake_compute::wake_compute, - }, - Host, -}; use async_trait::async_trait; use pq_proto::StartupMessageParams; use tokio::time; use tracing::{debug, info, warn}; use super::retry::ShouldRetryWakeCompute; +use crate::auth::backend::ComputeCredentialKeys; +use crate::compute::{self, PostgresConnection, COULD_NOT_CONNECT}; +use crate::config::RetryConfig; +use crate::context::RequestMonitoring; +use crate::control_plane::errors::WakeComputeError; +use crate::control_plane::locks::ApiLocks; +use crate::control_plane::{self, CachedNodeInfo, NodeInfo}; +use crate::error::ReportableError; +use crate::metrics::{ + ConnectOutcome, ConnectionFailureKind, Metrics, RetriesMetricGroup, RetryType, +}; +use crate::proxy::retry::{retry_after, should_retry, CouldRetry}; +use crate::proxy::wake_compute::wake_compute; +use crate::Host; const CONNECT_TIMEOUT: time::Duration = time::Duration::from_secs(2); diff --git a/proxy/src/proxy/copy_bidirectional.rs b/proxy/src/proxy/copy_bidirectional.rs index 4ebda013ac..91a3ceff75 100644 --- 
a/proxy/src/proxy/copy_bidirectional.rs +++ b/proxy/src/proxy/copy_bidirectional.rs @@ -1,11 +1,11 @@ -use tokio::io::{AsyncRead, AsyncWrite, ReadBuf}; -use tracing::info; - use std::future::poll_fn; use std::io; use std::pin::Pin; use std::task::{ready, Context, Poll}; +use tokio::io::{AsyncRead, AsyncWrite, ReadBuf}; +use tracing::info; + #[derive(Debug)] enum TransferState { Running(CopyBuffer), @@ -256,9 +256,10 @@ impl CopyBuffer { #[cfg(test)] mod tests { - use super::*; use tokio::io::AsyncWriteExt; + use super::*; + #[tokio::test] async fn test_client_to_compute() { let (mut client_client, mut client_proxy) = tokio::io::duplex(8); // Create a mock duplex stream diff --git a/proxy/src/proxy/handshake.rs b/proxy/src/proxy/handshake.rs index 5996b11c11..a67f1b8112 100644 --- a/proxy/src/proxy/handshake.rs +++ b/proxy/src/proxy/handshake.rs @@ -1,21 +1,19 @@ use bytes::Buf; +use pq_proto::framed::Framed; use pq_proto::{ - framed::Framed, BeMessage as Be, CancelKeyData, FeStartupPacket, ProtocolVersion, - StartupMessageParams, + BeMessage as Be, CancelKeyData, FeStartupPacket, ProtocolVersion, StartupMessageParams, }; use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::{info, warn}; -use crate::{ - auth::endpoint_sni, - config::{TlsConfig, PG_ALPN_PROTOCOL}, - context::RequestMonitoring, - error::ReportableError, - metrics::Metrics, - proxy::ERR_INSECURE_CONNECTION, - stream::{PqStream, Stream, StreamUpgradeError}, -}; +use crate::auth::endpoint_sni; +use crate::config::{TlsConfig, PG_ALPN_PROTOCOL}; +use crate::context::RequestMonitoring; +use crate::error::ReportableError; +use crate::metrics::Metrics; +use crate::proxy::ERR_INSECURE_CONNECTION; +use crate::stream::{PqStream, Stream, StreamUpgradeError}; #[derive(Error, Debug)] pub(crate) enum HandshakeError { diff --git a/proxy/src/proxy/mod.rs b/proxy/src/proxy/mod.rs index b2b5a7f43d..f646862caa 100644 --- a/proxy/src/proxy/mod.rs +++ b/proxy/src/proxy/mod.rs @@ -7,40 +7,32 @@ 
pub(crate) mod handshake; pub(crate) mod passthrough; pub(crate) mod retry; pub(crate) mod wake_compute; -pub use copy_bidirectional::copy_bidirectional_client_compute; -pub use copy_bidirectional::ErrorSource; +use std::sync::Arc; -use crate::config::ProxyProtocolV2; -use crate::{ - auth, - cancellation::{self, CancellationHandlerMain, CancellationHandlerMainInternal}, - compute, - config::{ProxyConfig, TlsConfig}, - context::RequestMonitoring, - error::ReportableError, - metrics::{Metrics, NumClientConnectionsGuard}, - protocol2::read_proxy_protocol, - proxy::handshake::{handshake, HandshakeData}, - rate_limiter::EndpointRateLimiter, - stream::{PqStream, Stream}, - EndpointCacheKey, -}; +pub use copy_bidirectional::{copy_bidirectional_client_compute, ErrorSource}; use futures::TryFutureExt; use itertools::Itertools; use once_cell::sync::OnceCell; use pq_proto::{BeMessage as Be, StartupMessageParams}; use regex::Regex; use smol_str::{format_smolstr, SmolStr}; -use std::sync::Arc; use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt}; use tokio_util::sync::CancellationToken; use tracing::{error, info, warn, Instrument}; -use self::{ - connect_compute::{connect_to_compute, TcpMechanism}, - passthrough::ProxyPassthrough, -}; +use self::connect_compute::{connect_to_compute, TcpMechanism}; +use self::passthrough::ProxyPassthrough; +use crate::cancellation::{self, CancellationHandlerMain, CancellationHandlerMainInternal}; +use crate::config::{ProxyConfig, ProxyProtocolV2, TlsConfig}; +use crate::context::RequestMonitoring; +use crate::error::ReportableError; +use crate::metrics::{Metrics, NumClientConnectionsGuard}; +use crate::protocol2::read_proxy_protocol; +use crate::proxy::handshake::{handshake, HandshakeData}; +use crate::rate_limiter::EndpointRateLimiter; +use crate::stream::{PqStream, Stream}; +use crate::{auth, compute, EndpointCacheKey}; const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)"; diff 
--git a/proxy/src/proxy/passthrough.rs b/proxy/src/proxy/passthrough.rs index 497cf4bfd5..e3b4730982 100644 --- a/proxy/src/proxy/passthrough.rs +++ b/proxy/src/proxy/passthrough.rs @@ -1,16 +1,14 @@ -use crate::{ - cancellation, - compute::PostgresConnection, - control_plane::messages::MetricsAuxInfo, - metrics::{Direction, Metrics, NumClientConnectionsGuard, NumConnectionRequestsGuard}, - stream::Stream, - usage_metrics::{Ids, MetricCounterRecorder, USAGE_METRICS}, -}; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::info; use utils::measured_stream::MeasuredStream; use super::copy_bidirectional::ErrorSource; +use crate::cancellation; +use crate::compute::PostgresConnection; +use crate::control_plane::messages::MetricsAuxInfo; +use crate::metrics::{Direction, Metrics, NumClientConnectionsGuard, NumConnectionRequestsGuard}; +use crate::stream::Stream; +use crate::usage_metrics::{Ids, MetricCounterRecorder, USAGE_METRICS}; /// Forward bytes in both directions (client <-> compute). #[tracing::instrument(skip_all)] diff --git a/proxy/src/proxy/retry.rs b/proxy/src/proxy/retry.rs index 15895d37e6..d3f0c3e7d4 100644 --- a/proxy/src/proxy/retry.rs +++ b/proxy/src/proxy/retry.rs @@ -1,7 +1,11 @@ -use crate::{compute, config::RetryConfig}; -use std::{error::Error, io}; +use std::error::Error; +use std::io; + use tokio::time; +use crate::compute; +use crate::config::RetryConfig; + pub(crate) trait CouldRetry { /// Returns true if the error could be retried fn could_retry(&self) -> bool; diff --git a/proxy/src/proxy/tests/mitm.rs b/proxy/src/proxy/tests/mitm.rs index 33a2162bc7..df9f79a7e3 100644 --- a/proxy/src/proxy/tests/mitm.rs +++ b/proxy/src/proxy/tests/mitm.rs @@ -6,7 +6,6 @@ use std::fmt::Debug; -use super::*; use bytes::{Bytes, BytesMut}; use futures::{SinkExt, StreamExt}; use postgres_protocol::message::frontend; @@ -14,6 +13,8 @@ use tokio::io::{AsyncReadExt, DuplexStream}; use tokio_postgres::tls::TlsConnect; use tokio_util::codec::{Decoder, Encoder}; +use 
super::*; + enum Intercept { None, Methods, diff --git a/proxy/src/proxy/tests/mod.rs b/proxy/src/proxy/tests/mod.rs index deb4d4a63f..e50ae4bc93 100644 --- a/proxy/src/proxy/tests/mod.rs +++ b/proxy/src/proxy/tests/mod.rs @@ -4,6 +4,16 @@ mod mitm; use std::time::Duration; +use anyhow::{bail, Context}; +use async_trait::async_trait; +use http::StatusCode; +use retry::{retry_after, ShouldRetryWakeCompute}; +use rstest::rstest; +use rustls::pki_types; +use tokio_postgres::config::SslMode; +use tokio_postgres::tls::{MakeTlsConnect, NoTls}; +use tokio_postgres_rustls::{MakeRustlsConnect, RustlsStream}; + use super::connect_compute::ConnectMechanism; use super::retry::CouldRetry; use super::*; @@ -18,15 +28,6 @@ use crate::control_plane::provider::{ use crate::control_plane::{self, CachedNodeInfo, NodeInfo}; use crate::error::ErrorKind; use crate::{sasl, scram, BranchId, EndpointId, ProjectId}; -use anyhow::{bail, Context}; -use async_trait::async_trait; -use http::StatusCode; -use retry::{retry_after, ShouldRetryWakeCompute}; -use rstest::rstest; -use rustls::pki_types; -use tokio_postgres::config::SslMode; -use tokio_postgres::tls::{MakeTlsConnect, NoTls}; -use tokio_postgres_rustls::{MakeRustlsConnect, RustlsStream}; /// Generate a set of TLS certificates: CA + server. 
fn generate_certs( @@ -336,7 +337,8 @@ async fn scram_auth_mock() -> anyhow::Result<()> { generate_tls_config("generic-project-name.localhost", "localhost")?; let proxy = tokio::spawn(dummy_proxy(client, Some(server_config), Scram::mock())); - use rand::{distributions::Alphanumeric, Rng}; + use rand::distributions::Alphanumeric; + use rand::Rng; let password: String = rand::thread_rng() .sample_iter(&Alphanumeric) .take(rand::random::() as usize) diff --git a/proxy/src/proxy/wake_compute.rs b/proxy/src/proxy/wake_compute.rs index 0d1527a2c1..9dfa485fa4 100644 --- a/proxy/src/proxy/wake_compute.rs +++ b/proxy/src/proxy/wake_compute.rs @@ -1,16 +1,17 @@ +use hyper::StatusCode; +use tracing::{error, info, warn}; + +use super::connect_compute::ComputeConnectBackend; use crate::config::RetryConfig; use crate::context::RequestMonitoring; +use crate::control_plane::errors::WakeComputeError; use crate::control_plane::messages::{ControlPlaneError, Reason}; -use crate::control_plane::{errors::WakeComputeError, provider::CachedNodeInfo}; +use crate::control_plane::provider::CachedNodeInfo; use crate::metrics::{ ConnectOutcome, ConnectionFailuresBreakdownGroup, Metrics, RetriesMetricGroup, RetryType, WakeupFailureKind, }; use crate::proxy::retry::{retry_after, should_retry}; -use hyper::StatusCode; -use tracing::{error, info, warn}; - -use super::connect_compute::ComputeConnectBackend; pub(crate) async fn wake_compute( num_retries: &mut u32, diff --git a/proxy/src/rate_limiter/leaky_bucket.rs b/proxy/src/rate_limiter/leaky_bucket.rs index bf4d85f2e4..45f9630dde 100644 --- a/proxy/src/rate_limiter/leaky_bucket.rs +++ b/proxy/src/rate_limiter/leaky_bucket.rs @@ -1,7 +1,5 @@ -use std::{ - hash::Hash, - sync::atomic::{AtomicUsize, Ordering}, -}; +use std::hash::Hash; +use std::sync::atomic::{AtomicUsize, Ordering}; use ahash::RandomState; use dashmap::DashMap; diff --git a/proxy/src/rate_limiter/limit_algorithm.rs b/proxy/src/rate_limiter/limit_algorithm.rs index 
25607b7e10..16c398f303 100644 --- a/proxy/src/rate_limiter/limit_algorithm.rs +++ b/proxy/src/rate_limiter/limit_algorithm.rs @@ -1,10 +1,12 @@ //! Algorithms for controlling concurrency limits. +use std::pin::pin; +use std::sync::Arc; +use std::time::Duration; + use parking_lot::Mutex; -use std::{pin::pin, sync::Arc, time::Duration}; -use tokio::{ - sync::Notify, - time::{error::Elapsed, Instant}, -}; +use tokio::sync::Notify; +use tokio::time::error::Elapsed; +use tokio::time::Instant; use self::aimd::Aimd; diff --git a/proxy/src/rate_limiter/limit_algorithm/aimd.rs b/proxy/src/rate_limiter/limit_algorithm/aimd.rs index 86b56e38fb..5332a5184f 100644 --- a/proxy/src/rate_limiter/limit_algorithm/aimd.rs +++ b/proxy/src/rate_limiter/limit_algorithm/aimd.rs @@ -60,12 +60,11 @@ impl LimitAlgorithm for Aimd { mod tests { use std::time::Duration; + use super::*; use crate::rate_limiter::limit_algorithm::{ DynamicLimiter, RateLimitAlgorithm, RateLimiterConfig, }; - use super::*; - #[tokio::test(start_paused = true)] async fn increase_decrease() { let config = RateLimiterConfig { diff --git a/proxy/src/rate_limiter/limiter.rs b/proxy/src/rate_limiter/limiter.rs index be529f174d..5de64c2254 100644 --- a/proxy/src/rate_limiter/limiter.rs +++ b/proxy/src/rate_limiter/limiter.rs @@ -1,17 +1,14 @@ -use std::{ - borrow::Cow, - collections::hash_map::RandomState, - hash::{BuildHasher, Hash}, - sync::{ - atomic::{AtomicUsize, Ordering}, - Mutex, - }, -}; +use std::borrow::Cow; +use std::collections::hash_map::RandomState; +use std::hash::{BuildHasher, Hash}; +use std::sync::atomic::{AtomicUsize, Ordering}; +use std::sync::Mutex; use anyhow::bail; use dashmap::DashMap; use itertools::Itertools; -use rand::{rngs::StdRng, Rng, SeedableRng}; +use rand::rngs::StdRng; +use rand::{Rng, SeedableRng}; use tokio::time::{Duration, Instant}; use tracing::info; @@ -243,14 +240,17 @@ impl BucketRateLimiter { #[cfg(test)] mod tests { - use std::{hash::BuildHasherDefault, time::Duration}; + use 
std::hash::BuildHasherDefault; + use std::time::Duration; use rand::SeedableRng; use rustc_hash::FxHasher; use tokio::time; use super::{BucketRateLimiter, WakeComputeRateLimiter}; - use crate::{intern::EndpointIdInt, rate_limiter::RateBucketInfo, EndpointId}; + use crate::intern::EndpointIdInt; + use crate::rate_limiter::RateBucketInfo; + use crate::EndpointId; #[test] fn rate_bucket_rpi() { diff --git a/proxy/src/rate_limiter/mod.rs b/proxy/src/rate_limiter/mod.rs index 6e38f89458..3ae2ecaf8f 100644 --- a/proxy/src/rate_limiter/mod.rs +++ b/proxy/src/rate_limiter/mod.rs @@ -2,13 +2,11 @@ mod leaky_bucket; mod limit_algorithm; mod limiter; +pub use leaky_bucket::{EndpointRateLimiter, LeakyBucketConfig, LeakyBucketRateLimiter}; #[cfg(test)] pub(crate) use limit_algorithm::aimd::Aimd; - pub(crate) use limit_algorithm::{ DynamicLimiter, Outcome, RateLimitAlgorithm, RateLimiterConfig, Token, }; pub(crate) use limiter::GlobalRateLimiter; - -pub use leaky_bucket::{EndpointRateLimiter, LeakyBucketConfig, LeakyBucketRateLimiter}; pub use limiter::{BucketRateLimiter, RateBucketInfo, WakeComputeRateLimiter}; diff --git a/proxy/src/redis/cancellation_publisher.rs b/proxy/src/redis/cancellation_publisher.rs index 95bdfc0965..0000246971 100644 --- a/proxy/src/redis/cancellation_publisher.rs +++ b/proxy/src/redis/cancellation_publisher.rs @@ -5,13 +5,10 @@ use redis::AsyncCommands; use tokio::sync::Mutex; use uuid::Uuid; +use super::connection_with_credentials_provider::ConnectionWithCredentialsProvider; +use super::notifications::{CancelSession, Notification, PROXY_CHANNEL_NAME}; use crate::rate_limiter::{GlobalRateLimiter, RateBucketInfo}; -use super::{ - connection_with_credentials_provider::ConnectionWithCredentialsProvider, - notifications::{CancelSession, Notification, PROXY_CHANNEL_NAME}, -}; - pub trait CancellationPublisherMut: Send + Sync + 'static { #[allow(async_fn_in_trait)] async fn try_publish( diff --git a/proxy/src/redis/connection_with_credentials_provider.rs 
b/proxy/src/redis/connection_with_credentials_provider.rs index ccd48f1481..82139ea1d5 100644 --- a/proxy/src/redis/connection_with_credentials_provider.rs +++ b/proxy/src/redis/connection_with_credentials_provider.rs @@ -1,10 +1,9 @@ -use std::{sync::Arc, time::Duration}; +use std::sync::Arc; +use std::time::Duration; use futures::FutureExt; -use redis::{ - aio::{ConnectionLike, MultiplexedConnection}, - ConnectionInfo, IntoConnectionInfo, RedisConnectionInfo, RedisResult, -}; +use redis::aio::{ConnectionLike, MultiplexedConnection}; +use redis::{ConnectionInfo, IntoConnectionInfo, RedisConnectionInfo, RedisResult}; use tokio::task::JoinHandle; use tracing::{debug, error, info, warn}; diff --git a/proxy/src/redis/notifications.rs b/proxy/src/redis/notifications.rs index c3af6740cb..e56c5a3414 100644 --- a/proxy/src/redis/notifications.rs +++ b/proxy/src/redis/notifications.rs @@ -1,4 +1,5 @@ -use std::{convert::Infallible, sync::Arc}; +use std::convert::Infallible; +use std::sync::Arc; use futures::StreamExt; use pq_proto::CancelKeyData; @@ -8,12 +9,10 @@ use tokio_util::sync::CancellationToken; use uuid::Uuid; use super::connection_with_credentials_provider::ConnectionWithCredentialsProvider; -use crate::{ - cache::project_info::ProjectInfoCache, - cancellation::{CancelMap, CancellationHandler}, - intern::{ProjectIdInt, RoleNameInt}, - metrics::{Metrics, RedisErrors, RedisEventsCount}, -}; +use crate::cache::project_info::ProjectInfoCache; +use crate::cancellation::{CancelMap, CancellationHandler}; +use crate::intern::{ProjectIdInt, RoleNameInt}; +use crate::metrics::{Metrics, RedisErrors, RedisEventsCount}; const CPLANE_CHANNEL_NAME: &str = "neondb-proxy-ws-updates"; pub(crate) const PROXY_CHANNEL_NAME: &str = "neondb-proxy-to-proxy-updates"; @@ -269,10 +268,10 @@ where #[cfg(test)] mod tests { - use crate::{ProjectId, RoleName}; + use serde_json::json; use super::*; - use serde_json::json; + use crate::{ProjectId, RoleName}; #[test] fn parse_allowed_ips() -> 
anyhow::Result<()> { diff --git a/proxy/src/sasl/messages.rs b/proxy/src/sasl/messages.rs index 6c9a42b2db..1373dfba3d 100644 --- a/proxy/src/sasl/messages.rs +++ b/proxy/src/sasl/messages.rs @@ -1,8 +1,9 @@ //! Definitions for SASL messages. -use crate::parse::{split_at_const, split_cstr}; use pq_proto::{BeAuthenticationSaslMessage, BeMessage}; +use crate::parse::{split_at_const, split_cstr}; + /// SASL-specific payload of [`PasswordMessage`](pq_proto::FeMessage::PasswordMessage). #[derive(Debug)] pub(crate) struct FirstMessage<'a> { diff --git a/proxy/src/sasl/mod.rs b/proxy/src/sasl/mod.rs index 0a36694359..f0181b404f 100644 --- a/proxy/src/sasl/mod.rs +++ b/proxy/src/sasl/mod.rs @@ -10,13 +10,14 @@ mod channel_binding; mod messages; mod stream; -use crate::error::{ReportableError, UserFacingError}; use std::io; -use thiserror::Error; pub(crate) use channel_binding::ChannelBinding; pub(crate) use messages::FirstMessage; pub(crate) use stream::{Outcome, SaslStream}; +use thiserror::Error; + +use crate::error::{ReportableError, UserFacingError}; /// Fine-grained auth errors help in writing tests. #[derive(Error, Debug)] diff --git a/proxy/src/sasl/stream.rs b/proxy/src/sasl/stream.rs index b6becd28e1..f1c916daa2 100644 --- a/proxy/src/sasl/stream.rs +++ b/proxy/src/sasl/stream.rs @@ -1,11 +1,14 @@ //! Abstraction for the string-oriented SASL protocols. -use super::{messages::ServerMessage, Mechanism}; -use crate::stream::PqStream; use std::io; + use tokio::io::{AsyncRead, AsyncWrite}; use tracing::info; +use super::messages::ServerMessage; +use super::Mechanism; +use crate::stream::PqStream; + /// Abstracts away all peculiarities of the libpq's protocol. pub(crate) struct SaslStream<'a, S> { /// The underlying stream. 
diff --git a/proxy/src/scram/countmin.rs b/proxy/src/scram/countmin.rs index 64ee0135e1..87ab6e0d5f 100644 --- a/proxy/src/scram/countmin.rs +++ b/proxy/src/scram/countmin.rs @@ -69,7 +69,9 @@ impl CountMinSketch { #[cfg(test)] mod tests { - use rand::{rngs::StdRng, seq::SliceRandom, Rng, SeedableRng}; + use rand::rngs::StdRng; + use rand::seq::SliceRandom; + use rand::{Rng, SeedableRng}; use super::CountMinSketch; diff --git a/proxy/src/scram/exchange.rs b/proxy/src/scram/exchange.rs index afb5604666..493295c938 100644 --- a/proxy/src/scram/exchange.rs +++ b/proxy/src/scram/exchange.rs @@ -209,7 +209,8 @@ impl sasl::Mechanism for Exchange<'_> { type Output = super::ScramKey; fn exchange(mut self, input: &str) -> sasl::Result> { - use {sasl::Step, ExchangeState}; + use sasl::Step; + use ExchangeState; match &self.state { ExchangeState::Initial(init) => { match init.transition(self.secret, &self.tls_server_end_point, input)? { diff --git a/proxy/src/scram/messages.rs b/proxy/src/scram/messages.rs index fd9e77764c..5ee3a51352 100644 --- a/proxy/src/scram/messages.rs +++ b/proxy/src/scram/messages.rs @@ -1,11 +1,12 @@ //! Definitions for SCRAM messages. +use std::fmt; +use std::ops::Range; + use super::base64_decode_array; use super::key::{ScramKey, SCRAM_KEY_LEN}; use super::signature::SignatureBuilder; use crate::sasl::ChannelBinding; -use std::fmt; -use std::ops::Range; /// Faithfully taken from PostgreSQL. 
pub(crate) const SCRAM_RAW_NONCE_LEN: usize = 18; diff --git a/proxy/src/scram/mod.rs b/proxy/src/scram/mod.rs index d058f1c3f8..97644b6282 100644 --- a/proxy/src/scram/mod.rs +++ b/proxy/src/scram/mod.rs @@ -16,10 +16,9 @@ mod signature; pub mod threadpool; pub(crate) use exchange::{exchange, Exchange}; +use hmac::{Hmac, Mac}; pub(crate) use key::ScramKey; pub(crate) use secret::ServerSecret; - -use hmac::{Hmac, Mac}; use sha2::{Digest, Sha256}; const SCRAM_SHA_256: &str = "SCRAM-SHA-256"; @@ -59,13 +58,11 @@ fn sha256<'a>(parts: impl IntoIterator) -> [u8; 32] { #[cfg(test)] mod tests { - use crate::{ - intern::EndpointIdInt, - sasl::{Mechanism, Step}, - EndpointId, - }; - - use super::{threadpool::ThreadPool, Exchange, ServerSecret}; + use super::threadpool::ThreadPool; + use super::{Exchange, ServerSecret}; + use crate::intern::EndpointIdInt; + use crate::sasl::{Mechanism, Step}; + use crate::EndpointId; #[test] fn snapshot() { diff --git a/proxy/src/scram/pbkdf2.rs b/proxy/src/scram/pbkdf2.rs index 4cf76c8452..9c559e9082 100644 --- a/proxy/src/scram/pbkdf2.rs +++ b/proxy/src/scram/pbkdf2.rs @@ -1,7 +1,6 @@ -use hmac::{ - digest::{consts::U32, generic_array::GenericArray}, - Hmac, Mac, -}; +use hmac::digest::consts::U32; +use hmac::digest::generic_array::GenericArray; +use hmac::{Hmac, Mac}; use sha2::Sha256; pub(crate) struct Pbkdf2 { @@ -66,10 +65,11 @@ impl Pbkdf2 { #[cfg(test)] mod tests { - use super::Pbkdf2; use pbkdf2::pbkdf2_hmac_array; use sha2::Sha256; + use super::Pbkdf2; + #[test] fn works() { let salt = b"sodium chloride"; diff --git a/proxy/src/scram/threadpool.rs b/proxy/src/scram/threadpool.rs index c027a0cd20..cc1b69fcf9 100644 --- a/proxy/src/scram/threadpool.rs +++ b/proxy/src/scram/threadpool.rs @@ -4,28 +4,21 @@ //! 1. Fairness per endpoint. //! 2. Yield support for high iteration counts. 
-use std::{ - cell::RefCell, - future::Future, - pin::Pin, - sync::{ - atomic::{AtomicUsize, Ordering}, - Arc, Weak, - }, - task::{Context, Poll}, -}; +use std::cell::RefCell; +use std::future::Future; +use std::pin::Pin; +use std::sync::atomic::{AtomicUsize, Ordering}; +use std::sync::{Arc, Weak}; +use std::task::{Context, Poll}; use futures::FutureExt; -use rand::Rng; -use rand::{rngs::SmallRng, SeedableRng}; - -use crate::{ - intern::EndpointIdInt, - metrics::{ThreadPoolMetrics, ThreadPoolWorkerId}, - scram::countmin::CountMinSketch, -}; +use rand::rngs::SmallRng; +use rand::{Rng, SeedableRng}; use super::pbkdf2::Pbkdf2; +use crate::intern::EndpointIdInt; +use crate::metrics::{ThreadPoolMetrics, ThreadPoolWorkerId}; +use crate::scram::countmin::CountMinSketch; pub struct ThreadPool { runtime: Option, @@ -195,9 +188,8 @@ impl Drop for JobHandle { #[cfg(test)] mod tests { - use crate::EndpointId; - use super::*; + use crate::EndpointId; #[tokio::test] async fn hash_is_correct() { diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 927854897f..a180c4c2ed 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -1,42 +1,34 @@ -use std::{io, sync::Arc, time::Duration}; +use std::io; +use std::sync::Arc; +use std::time::Duration; use async_trait::async_trait; use hyper_util::rt::{TokioExecutor, TokioIo, TokioTimer}; -use p256::{ecdsa::SigningKey, elliptic_curve::JwkEcKey}; +use p256::ecdsa::SigningKey; +use p256::elliptic_curve::JwkEcKey; use rand::rngs::OsRng; use tokio::net::{lookup_host, TcpStream}; -use tracing::{debug, field::display, info}; +use tracing::field::display; +use tracing::{debug, info}; -use crate::{ - auth::{ - self, - backend::{local::StaticAuthRules, ComputeCredentials, ComputeUserInfo}, - check_peer_addr_is_in_list, AuthError, - }, - compute, - config::ProxyConfig, - context::RequestMonitoring, - control_plane::{ - errors::{GetAuthInfoError, WakeComputeError}, - locks::ApiLocks, - 
provider::ApiLockError, - CachedNodeInfo, - }, - error::{ErrorKind, ReportableError, UserFacingError}, - intern::EndpointIdInt, - proxy::{ - connect_compute::ConnectMechanism, - retry::{CouldRetry, ShouldRetryWakeCompute}, - }, - rate_limiter::EndpointRateLimiter, - EndpointId, Host, -}; - -use super::{ - conn_pool::{poll_client, Client, ConnInfo, GlobalConnPool}, - http_conn_pool::{self, poll_http2_client}, - local_conn_pool::{self, LocalClient, LocalConnPool}, -}; +use super::conn_pool::{poll_client, Client, ConnInfo, GlobalConnPool}; +use super::http_conn_pool::{self, poll_http2_client}; +use super::local_conn_pool::{self, LocalClient, LocalConnPool}; +use crate::auth::backend::local::StaticAuthRules; +use crate::auth::backend::{ComputeCredentials, ComputeUserInfo}; +use crate::auth::{self, check_peer_addr_is_in_list, AuthError}; +use crate::config::ProxyConfig; +use crate::context::RequestMonitoring; +use crate::control_plane::errors::{GetAuthInfoError, WakeComputeError}; +use crate::control_plane::locks::ApiLocks; +use crate::control_plane::provider::ApiLockError; +use crate::control_plane::CachedNodeInfo; +use crate::error::{ErrorKind, ReportableError, UserFacingError}; +use crate::intern::EndpointIdInt; +use crate::proxy::connect_compute::ConnectMechanism; +use crate::proxy::retry::{CouldRetry, ShouldRetryWakeCompute}; +use crate::rate_limiter::EndpointRateLimiter; +use crate::{compute, EndpointId, Host}; pub(crate) struct PoolingBackend { pub(crate) http_conn_pool: Arc, diff --git a/proxy/src/serverless/cancel_set.rs b/proxy/src/serverless/cancel_set.rs index 7659745473..6db986f1f7 100644 --- a/proxy/src/serverless/cancel_set.rs +++ b/proxy/src/serverless/cancel_set.rs @@ -1,10 +1,8 @@ //! 
A set for cancelling random http connections -use std::{ - hash::{BuildHasher, BuildHasherDefault}, - num::NonZeroUsize, - time::Duration, -}; +use std::hash::{BuildHasher, BuildHasherDefault}; +use std::num::NonZeroUsize; +use std::time::Duration; use indexmap::IndexMap; use parking_lot::Mutex; diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index 2e576e0ded..aa869ff1c0 100644 --- a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -1,33 +1,31 @@ +use std::collections::HashMap; +use std::fmt; +use std::ops::Deref; +use std::pin::pin; +use std::sync::atomic::{self, AtomicUsize}; +use std::sync::{Arc, Weak}; +use std::task::{ready, Poll}; +use std::time::Duration; + use dashmap::DashMap; -use futures::{future::poll_fn, Future}; +use futures::future::poll_fn; +use futures::Future; use parking_lot::RwLock; use rand::Rng; use smallvec::SmallVec; -use std::{collections::HashMap, pin::pin, sync::Arc, sync::Weak, time::Duration}; -use std::{ - fmt, - task::{ready, Poll}, -}; -use std::{ - ops::Deref, - sync::atomic::{self, AtomicUsize}, -}; use tokio::time::Instant; use tokio_postgres::tls::NoTlsStream; use tokio_postgres::{AsyncMessage, ReadyForQueryStatus, Socket}; use tokio_util::sync::CancellationToken; +use tracing::{debug, error, info, info_span, warn, Instrument, Span}; +use super::backend::HttpConnError; +use crate::auth::backend::ComputeUserInfo; +use crate::context::RequestMonitoring; use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo}; use crate::metrics::{HttpEndpointPoolsGuard, Metrics}; use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS}; -use crate::{ - auth::backend::ComputeUserInfo, context::RequestMonitoring, DbName, EndpointCacheKey, RoleName, -}; - -use tracing::{debug, error, warn, Span}; -use tracing::{info, info_span, Instrument}; - -use super::backend::HttpConnError; +use crate::{DbName, EndpointCacheKey, RoleName}; #[derive(Debug, Clone)] pub(crate) struct 
ConnInfoWithAuth { @@ -724,13 +722,13 @@ impl Drop for Client { #[cfg(test)] mod tests { - use std::{mem, sync::atomic::AtomicBool}; - - use crate::{ - proxy::NeonOptions, serverless::cancel_set::CancelSet, BranchId, EndpointId, ProjectId, - }; + use std::mem; + use std::sync::atomic::AtomicBool; use super::*; + use crate::proxy::NeonOptions; + use crate::serverless::cancel_set::CancelSet; + use crate::{BranchId, EndpointId, ProjectId}; struct MockClient(Arc); impl MockClient { diff --git a/proxy/src/serverless/http_conn_pool.rs b/proxy/src/serverless/http_conn_pool.rs index 6d61536f1a..9b6bc98557 100644 --- a/proxy/src/serverless/http_conn_pool.rs +++ b/proxy/src/serverless/http_conn_pool.rs @@ -1,22 +1,21 @@ +use std::collections::VecDeque; +use std::sync::atomic::{self, AtomicUsize}; +use std::sync::{Arc, Weak}; + use dashmap::DashMap; use hyper::client::conn::http2; use hyper_util::rt::{TokioExecutor, TokioIo}; use parking_lot::RwLock; use rand::Rng; -use std::collections::VecDeque; -use std::sync::atomic::{self, AtomicUsize}; -use std::{sync::Arc, sync::Weak}; use tokio::net::TcpStream; +use tracing::{debug, error, info, info_span, Instrument}; +use super::conn_pool::ConnInfo; +use crate::context::RequestMonitoring; use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo}; use crate::metrics::{HttpEndpointPoolsGuard, Metrics}; use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS}; -use crate::{context::RequestMonitoring, EndpointCacheKey}; - -use tracing::{debug, error}; -use tracing::{info, info_span, Instrument}; - -use super::conn_pool::ConnInfo; +use crate::EndpointCacheKey; pub(crate) type Send = http2::SendRequest; pub(crate) type Connect = diff --git a/proxy/src/serverless/http_util.rs b/proxy/src/serverless/http_util.rs index c1c5764d17..c0208d4f68 100644 --- a/proxy/src/serverless/http_util.rs +++ b/proxy/src/serverless/http_util.rs @@ -1,12 +1,11 @@ //! Things stolen from `libs/utils/src/http` to add hyper 1.0 compatibility //! 
Will merge back in at some point in the future. -use bytes::Bytes; - use anyhow::Context; +use bytes::Bytes; use http::{Response, StatusCode}; -use http_body_util::{combinators::BoxBody, BodyExt, Full}; - +use http_body_util::combinators::BoxBody; +use http_body_util::{BodyExt, Full}; use serde::Serialize; use utils::http::error::ApiError; diff --git a/proxy/src/serverless/json.rs b/proxy/src/serverless/json.rs index 9f328a0e1d..8c56d317cc 100644 --- a/proxy/src/serverless/json.rs +++ b/proxy/src/serverless/json.rs @@ -1,7 +1,5 @@ -use serde_json::Map; -use serde_json::Value; -use tokio_postgres::types::Kind; -use tokio_postgres::types::Type; +use serde_json::{Map, Value}; +use tokio_postgres::types::{Kind, Type}; use tokio_postgres::Row; // @@ -256,9 +254,10 @@ fn _pg_array_parse( #[cfg(test)] mod tests { - use super::*; use serde_json::json; + use super::*; + #[test] fn test_atomic_types_to_pg_params() { let json = vec![Value::Bool(true), Value::Bool(false)]; diff --git a/proxy/src/serverless/local_conn_pool.rs b/proxy/src/serverless/local_conn_pool.rs index 4ab14ad35f..5df37a8762 100644 --- a/proxy/src/serverless/local_conn_pool.rs +++ b/proxy/src/serverless/local_conn_pool.rs @@ -1,28 +1,31 @@ -use futures::{future::poll_fn, Future}; +use std::collections::HashMap; +use std::pin::pin; +use std::sync::{Arc, Weak}; +use std::task::{ready, Poll}; +use std::time::Duration; + +use futures::future::poll_fn; +use futures::Future; use indexmap::IndexMap; use jose_jwk::jose_b64::base64ct::{Base64UrlUnpadded, Encoding}; use p256::ecdsa::{Signature, SigningKey}; use parking_lot::RwLock; use serde_json::value::RawValue; use signature::Signer; -use std::task::{ready, Poll}; -use std::{collections::HashMap, pin::pin, sync::Arc, sync::Weak, time::Duration}; use tokio::time::Instant; use tokio_postgres::tls::NoTlsStream; use tokio_postgres::types::ToSql; use tokio_postgres::{AsyncMessage, ReadyForQueryStatus, Socket}; use tokio_util::sync::CancellationToken; - -use 
crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo}; -use crate::metrics::Metrics; -use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS}; -use crate::{context::RequestMonitoring, DbName, RoleName}; - -use tracing::{error, warn, Span}; -use tracing::{info, info_span, Instrument}; +use tracing::{error, info, info_span, warn, Instrument, Span}; use super::backend::HttpConnError; use super::conn_pool::{ClientInnerExt, ConnInfo}; +use crate::context::RequestMonitoring; +use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo}; +use crate::metrics::Metrics; +use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS}; +use crate::{DbName, RoleName}; struct ConnPoolEntry { conn: ClientInner, diff --git a/proxy/src/serverless/mod.rs b/proxy/src/serverless/mod.rs index 3131adada4..3ed3b6c845 100644 --- a/proxy/src/serverless/mod.rs +++ b/proxy/src/serverless/mod.rs @@ -12,12 +12,15 @@ mod local_conn_pool; mod sql_over_http; mod websocket; +use std::net::{IpAddr, SocketAddr}; +use std::pin::{pin, Pin}; +use std::sync::Arc; + +use anyhow::Context; use async_trait::async_trait; use atomic_take::AtomicTake; use bytes::Bytes; pub use conn_pool::GlobalConnPoolOptions; - -use anyhow::Context; use futures::future::{select, Either}; use futures::TryFutureExt; use http::{Method, Response, StatusCode}; @@ -29,9 +32,13 @@ use hyper_util::server::conn::auto::Builder; use rand::rngs::StdRng; use rand::SeedableRng; use tokio::io::{AsyncRead, AsyncWrite}; +use tokio::net::{TcpListener, TcpStream}; use tokio::time::timeout; use tokio_rustls::TlsAcceptor; +use tokio_util::sync::CancellationToken; use tokio_util::task::TaskTracker; +use tracing::{info, warn, Instrument}; +use utils::http::error::ApiError; use crate::cancellation::CancellationHandlerMain; use crate::config::ProxyConfig; @@ -43,14 +50,6 @@ use crate::rate_limiter::EndpointRateLimiter; use crate::serverless::backend::PoolingBackend; use crate::serverless::http_util::{api_error_into_response, 
json_response}; -use std::net::{IpAddr, SocketAddr}; -use std::pin::{pin, Pin}; -use std::sync::Arc; -use tokio::net::{TcpListener, TcpStream}; -use tokio_util::sync::CancellationToken; -use tracing::{info, warn, Instrument}; -use utils::http::error::ApiError; - pub(crate) const SERVERLESS_DRIVER_SNI: &str = "api"; pub async fn task_main( diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index cf3324926c..3d8a2adef1 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -2,77 +2,43 @@ use std::pin::pin; use std::sync::Arc; use bytes::Bytes; -use futures::future::select; -use futures::future::try_join; -use futures::future::Either; -use futures::StreamExt; -use futures::TryFutureExt; +use futures::future::{select, try_join, Either}; +use futures::{StreamExt, TryFutureExt}; use http::header::AUTHORIZATION; use http::Method; use http_body_util::combinators::BoxBody; -use http_body_util::BodyExt; -use http_body_util::Full; -use hyper::body::Body; -use hyper::body::Incoming; -use hyper::header; -use hyper::http::HeaderName; -use hyper::http::HeaderValue; -use hyper::Response; -use hyper::StatusCode; -use hyper::{HeaderMap, Request}; +use http_body_util::{BodyExt, Full}; +use hyper::body::{Body, Incoming}; +use hyper::http::{HeaderName, HeaderValue}; +use hyper::{header, HeaderMap, Request, Response, StatusCode}; use pq_proto::StartupMessageParamsBuilder; use serde::Serialize; use serde_json::Value; use tokio::time; -use tokio_postgres::error::DbError; -use tokio_postgres::error::ErrorPosition; -use tokio_postgres::error::SqlState; -use tokio_postgres::GenericClient; -use tokio_postgres::IsolationLevel; -use tokio_postgres::NoTls; -use tokio_postgres::ReadyForQueryStatus; -use tokio_postgres::Transaction; +use tokio_postgres::error::{DbError, ErrorPosition, SqlState}; +use tokio_postgres::{GenericClient, IsolationLevel, NoTls, ReadyForQueryStatus, Transaction}; use 
tokio_util::sync::CancellationToken; -use tracing::error; -use tracing::info; +use tracing::{error, info}; use typed_json::json; use url::Url; use urlencoding; use utils::http::error::ApiError; -use crate::auth::backend::ComputeCredentialKeys; -use crate::auth::backend::ComputeUserInfo; -use crate::auth::endpoint_sni; -use crate::auth::ComputeUserInfoParseError; -use crate::config::AuthenticationConfig; -use crate::config::HttpConfig; -use crate::config::ProxyConfig; -use crate::config::TlsConfig; -use crate::context::RequestMonitoring; -use crate::error::ErrorKind; -use crate::error::ReportableError; -use crate::error::UserFacingError; -use crate::metrics::HttpDirection; -use crate::metrics::Metrics; -use crate::proxy::run_until_cancelled; -use crate::proxy::NeonOptions; -use crate::serverless::backend::HttpConnError; -use crate::usage_metrics::MetricCounter; -use crate::usage_metrics::MetricCounterRecorder; -use crate::DbName; -use crate::RoleName; - -use super::backend::LocalProxyConnError; -use super::backend::PoolingBackend; -use super::conn_pool; -use super::conn_pool::AuthData; -use super::conn_pool::ConnInfo; -use super::conn_pool::ConnInfoWithAuth; +use super::backend::{LocalProxyConnError, PoolingBackend}; +use super::conn_pool::{AuthData, ConnInfo, ConnInfoWithAuth}; use super::http_util::json_response; -use super::json::json_to_pg_text; -use super::json::pg_text_row_to_json; -use super::json::JsonConversionError; -use super::local_conn_pool; +use super::json::{json_to_pg_text, pg_text_row_to_json, JsonConversionError}; +use super::{conn_pool, local_conn_pool}; +use crate::auth::backend::{ComputeCredentialKeys, ComputeUserInfo}; +use crate::auth::{endpoint_sni, ComputeUserInfoParseError}; +use crate::config::{AuthenticationConfig, HttpConfig, ProxyConfig, TlsConfig}; +use crate::context::RequestMonitoring; +use crate::error::{ErrorKind, ReportableError, UserFacingError}; +use crate::metrics::{HttpDirection, Metrics}; +use 
crate::proxy::{run_until_cancelled, NeonOptions}; +use crate::serverless::backend::HttpConnError; +use crate::usage_metrics::{MetricCounter, MetricCounterRecorder}; +use crate::{DbName, RoleName}; #[derive(serde::Deserialize)] #[serde(rename_all = "camelCase")] diff --git a/proxy/src/serverless/websocket.rs b/proxy/src/serverless/websocket.rs index f5a692cf40..ba36116c2c 100644 --- a/proxy/src/serverless/websocket.rs +++ b/proxy/src/serverless/websocket.rs @@ -1,13 +1,7 @@ -use crate::proxy::ErrorSource; -use crate::{ - cancellation::CancellationHandlerMain, - config::ProxyConfig, - context::RequestMonitoring, - error::{io_error, ReportableError}, - metrics::Metrics, - proxy::{handle_client, ClientMode}, - rate_limiter::EndpointRateLimiter, -}; +use std::pin::Pin; +use std::sync::Arc; +use std::task::{ready, Context, Poll}; + use anyhow::Context as _; use bytes::{Buf, BufMut, Bytes, BytesMut}; use framed_websockets::{Frame, OpCode, WebSocketServer}; @@ -15,15 +9,17 @@ use futures::{Sink, Stream}; use hyper::upgrade::OnUpgrade; use hyper_util::rt::TokioIo; use pin_project_lite::pin_project; - -use std::{ - pin::Pin, - sync::Arc, - task::{ready, Context, Poll}, -}; use tokio::io::{self, AsyncBufRead, AsyncRead, AsyncWrite, ReadBuf}; use tracing::warn; +use crate::cancellation::CancellationHandlerMain; +use crate::config::ProxyConfig; +use crate::context::RequestMonitoring; +use crate::error::{io_error, ReportableError}; +use crate::metrics::Metrics; +use crate::proxy::{handle_client, ClientMode, ErrorSource}; +use crate::rate_limiter::EndpointRateLimiter; + pin_project! { /// This is a wrapper around a [`WebSocketStream`] that /// implements [`AsyncRead`] and [`AsyncWrite`]. 
@@ -184,14 +180,11 @@ mod tests { use framed_websockets::WebSocketServer; use futures::{SinkExt, StreamExt}; - use tokio::{ - io::{duplex, AsyncReadExt, AsyncWriteExt}, - task::JoinSet, - }; - use tokio_tungstenite::{ - tungstenite::{protocol::Role, Message}, - WebSocketStream, - }; + use tokio::io::{duplex, AsyncReadExt, AsyncWriteExt}; + use tokio::task::JoinSet; + use tokio_tungstenite::tungstenite::protocol::Role; + use tokio_tungstenite::tungstenite::Message; + use tokio_tungstenite::WebSocketStream; use super::WebSocketRw; diff --git a/proxy/src/stream.rs b/proxy/src/stream.rs index e2fc73235e..89df48c5d3 100644 --- a/proxy/src/stream.rs +++ b/proxy/src/stream.rs @@ -1,19 +1,20 @@ -use crate::config::TlsServerEndPoint; -use crate::error::{ErrorKind, ReportableError, UserFacingError}; -use crate::metrics::Metrics; -use bytes::BytesMut; - -use pq_proto::framed::{ConnectionError, Framed}; -use pq_proto::{BeMessage, FeMessage, FeStartupPacket, ProtocolError}; -use rustls::ServerConfig; use std::pin::Pin; use std::sync::Arc; use std::{io, task}; + +use bytes::BytesMut; +use pq_proto::framed::{ConnectionError, Framed}; +use pq_proto::{BeMessage, FeMessage, FeStartupPacket, ProtocolError}; +use rustls::ServerConfig; use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite, ReadBuf}; use tokio_rustls::server::TlsStream; use tracing::debug; +use crate::config::TlsServerEndPoint; +use crate::error::{ErrorKind, ReportableError, UserFacingError}; +use crate::metrics::Metrics; + /// Stream wrapper which implements libpq's protocol. /// /// NOTE: This object deliberately doesn't implement [`AsyncRead`] diff --git a/proxy/src/usage_metrics.rs b/proxy/src/usage_metrics.rs index ee36ed462d..c5384c0b0e 100644 --- a/proxy/src/usage_metrics.rs +++ b/proxy/src/usage_metrics.rs @@ -1,36 +1,33 @@ //! Periodically collect proxy consumption metrics //! and push them to a HTTP endpoint. 
-use crate::{ - config::{MetricBackupCollectionConfig, MetricCollectionConfig}, - context::parquet::{FAILED_UPLOAD_MAX_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD}, - http, - intern::{BranchIdInt, EndpointIdInt}, -}; +use std::convert::Infallible; +use std::pin::pin; +use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering}; +use std::sync::Arc; +use std::time::Duration; + use anyhow::Context; use async_compression::tokio::write::GzipEncoder; use bytes::Bytes; use chrono::{DateTime, Datelike, Timelike, Utc}; use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_SIZE}; -use dashmap::{mapref::entry::Entry, DashMap}; +use dashmap::mapref::entry::Entry; +use dashmap::DashMap; use futures::future::select; use once_cell::sync::Lazy; use remote_storage::{GenericRemoteStorage, RemotePath, TimeoutOrCancel}; use serde::{Deserialize, Serialize}; -use std::{ - convert::Infallible, - pin::pin, - sync::{ - atomic::{AtomicU64, AtomicUsize, Ordering}, - Arc, - }, - time::Duration, -}; use tokio::io::AsyncWriteExt; use tokio_util::sync::CancellationToken; use tracing::{error, info, instrument, trace, warn}; use utils::backoff; use uuid::{NoContext, Timestamp}; +use crate::config::{MetricBackupCollectionConfig, MetricCollectionConfig}; +use crate::context::parquet::{FAILED_UPLOAD_MAX_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD}; +use crate::http; +use crate::intern::{BranchIdInt, EndpointIdInt}; + const PROXY_IO_BYTES_PER_CLIENT: &str = "proxy_io_bytes_per_client"; const HTTP_REPORTING_REQUEST_TIMEOUT: Duration = Duration::from_secs(10); @@ -485,19 +482,23 @@ async fn upload_events_chunk( #[cfg(test)] mod tests { - use super::*; + use std::sync::{Arc, Mutex}; - use crate::{http, BranchId, EndpointId}; use anyhow::Error; use chrono::Utc; use consumption_metrics::{Event, EventChunk}; use http_body_util::BodyExt; - use hyper::{body::Incoming, server::conn::http1, service::service_fn, Request, Response}; + use hyper::body::Incoming; + use hyper::server::conn::http1; + use 
hyper::service::service_fn; + use hyper::{Request, Response}; use hyper_util::rt::TokioIo; - use std::sync::{Arc, Mutex}; use tokio::net::TcpListener; use url::Url; + use super::*; + use crate::{http, BranchId, EndpointId}; + #[tokio::test] async fn metrics() { type Report = EventChunk<'static, Event>; diff --git a/proxy/src/waiters.rs b/proxy/src/waiters.rs index 86d0f9e8b2..7e07f6a2af 100644 --- a/proxy/src/waiters.rs +++ b/proxy/src/waiters.rs @@ -1,8 +1,9 @@ +use std::pin::Pin; +use std::task; + use hashbrown::HashMap; use parking_lot::Mutex; use pin_project_lite::pin_project; -use std::pin::Pin; -use std::task; use thiserror::Error; use tokio::sync::oneshot; @@ -99,9 +100,10 @@ impl std::future::Future for Waiter<'_, T> { #[cfg(test)] mod tests { - use super::*; use std::sync::Arc; + use super::*; + #[tokio::test] async fn test_waiter() -> anyhow::Result<()> { let waiters = Arc::new(Waiters::default()); From d490ad23e0948b7c49098638ffc669774c61049e Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Wed, 16 Oct 2024 14:04:17 +0100 Subject: [PATCH 37/48] storcon: use the same trace fields for reconciler and results (#9410) ## Problem The reconciler uses `seq`, but processing of results uses `sequence`. Order is different too. It makes it annoying to read logs. ## Summary of Changes Use the same tracing fields in both --- storage_controller/src/service.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index cedee54534..25e1fb5e1f 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -1074,8 +1074,9 @@ impl Service { /// the observed state of the tenant such that subsequent calls to [`TenantShard::get_reconcile_needed`] /// will indicate that reconciliation is not needed.
#[instrument(skip_all, fields( - tenant_id=%result.tenant_shard_id.tenant_id, shard_id=%result.tenant_shard_id.shard_slug(), - sequence=%result.sequence + seq=%result.sequence, + tenant_id=%result.tenant_shard_id.tenant_id, + shard_id=%result.tenant_shard_id.shard_slug(), ))] fn process_result(&self, result: ReconcileResult) { let mut locked = self.inner.write().unwrap(); From d6281cbe65db6959e83c6d8abb44c0a3184e8b97 Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 16 Oct 2024 15:27:46 +0100 Subject: [PATCH 38/48] tests: stabilize test_timelines_parallel_endpoints (#9413) ## Problem This test would get failures like `command failed: Found no timeline id for branch name 'branch_8'` It's because neon_local is being invoked concurrently for branch creation, which is unsafe (they'll step on each others' JSON writes) Example failure: https://neon-github-public-dev.s3.amazonaws.com/reports/pr-9410/11363051979/index.html#testresult/5ddc56c640f5422b/retries ## Summary of changes - Don't do branch creation concurrently with endpoint creation via neon_local --- test_runner/regress/test_tenants.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index 4a16535941..03cb79fc1d 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -19,6 +19,7 @@ from fixtures.metrics import ( parse_metrics, ) from fixtures.neon_fixtures import ( + Endpoint, NeonEnv, NeonEnvBuilder, wait_for_last_flush_lsn, @@ -490,8 +491,8 @@ def test_timelines_parallel_endpoints(neon_simple_env: NeonEnv): n_threads = 16 barrier = threading.Barrier(n_threads) - def test_timeline(branch_name: str, timeline_id: TimelineId): - endpoint = env.endpoints.create_start(branch_name) + def test_timeline(branch_name: str, timeline_id: TimelineId, endpoint: Endpoint): + endpoint.start() endpoint.stop() # Use a barrier to make sure we restart endpoints at the same time 
barrier.wait() @@ -502,8 +503,12 @@ def test_timelines_parallel_endpoints(neon_simple_env: NeonEnv): for i in range(0, n_threads): branch_name = f"branch_{i}" timeline_id = env.create_branch(branch_name) - w = threading.Thread(target=test_timeline, args=[branch_name, timeline_id]) + endpoint = env.endpoints.create(branch_name) + w = threading.Thread(target=test_timeline, args=[branch_name, timeline_id, endpoint]) workers.append(w) + + # Only start the restarts once we're done creating all timelines & endpoints + for w in workers: w.start() for w in workers: From 3140c14d608e79d792518d9d9144460b6ff01b0f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Wed, 16 Oct 2024 16:28:55 +0200 Subject: [PATCH 39/48] Remove allow(clippy::unknown_lints) (#9416) the lint stabilized in 1.80. --- pageserver/src/tenant/timeline.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 8f098d0e82..1992dee930 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -3092,7 +3092,6 @@ impl Timeline { } impl Timeline { - #[allow(unknown_lints)] // doc_lazy_continuation is still a new lint #[allow(clippy::doc_lazy_continuation)] /// Get the data needed to reconstruct all keys in the provided keyspace /// From 9668601f4666bd82cee653800433ce66a4d9fb21 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Wed, 16 Oct 2024 15:29:23 +0100 Subject: [PATCH 40/48] Add support of extensions for v17 (part 2) (#9389) - plv8 3.2.3 - HypoPG 1.4.1 - pgtap 1.3.3 - timescaledb 2.17.0 - pg_hint_plan 17_1_7_0 - rdkit Release_2024_09_1 - pg_uuidv7 1.6.0 - wal2json 2.6 - pg_ivm 1.9 - pg_partman 5.1.0 update support of extensions for v14-v16: - HypoPG 1.4.0 -> 1.4.1 - pgtap 1.2.0 -> 1.3.3 - plpgsql_check 2.5.3 -> 2.7.11 - pg_uuidv7 1.0.1 -> 1.6.0 - wal2json 2.5 -> 2.6 - pg_ivm 1.7 -> 1.9 - pg_partman 5.0.1 -> 5.1.0 --- compute/Dockerfile.compute-node | 182 
++++++++++++++++++++------------ 1 file changed, 114 insertions(+), 68 deletions(-) diff --git a/compute/Dockerfile.compute-node b/compute/Dockerfile.compute-node index 13381b2901..f05039f8b7 100644 --- a/compute/Dockerfile.compute-node +++ b/compute/Dockerfile.compute-node @@ -18,13 +18,14 @@ RUN case $DEBIAN_VERSION in \ # Version-specific installs for Bullseye (PG14-PG16): # The h3_pg extension needs a cmake 3.20+, but Debian bullseye has 3.18. # Install newer version (3.25) from backports. + # libstdc++-10-dev is required for plv8 bullseye) \ echo "deb http://deb.debian.org/debian bullseye-backports main" > /etc/apt/sources.list.d/bullseye-backports.list; \ - VERSION_INSTALLS="cmake/bullseye-backports cmake-data/bullseye-backports"; \ + VERSION_INSTALLS="cmake/bullseye-backports cmake-data/bullseye-backports libstdc++-10-dev"; \ ;; \ # Version-specific installs for Bookworm (PG17): bookworm) \ - VERSION_INSTALLS="cmake"; \ + VERSION_INSTALLS="cmake libstdc++-12-dev"; \ ;; \ *) \ echo "Unknown Debian version ${DEBIAN_VERSION}" && exit 1 \ @@ -227,18 +228,33 @@ FROM build-deps AS plv8-build ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ - esac && \ - apt update && \ +RUN apt update && \ apt install --no-install-recommends -y ninja-build python3-dev libncurses5 binutils clang -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. 
Quit" && exit 0;; \ +# plv8 3.2.3 supports v17 +# last release v3.2.3 - Sep 7, 2024 +# +# clone the repo instead of downloading the release tarball because plv8 has submodule dependencies +# and the release tarball doesn't include them +# +# Use new version only for v17 +# because since v3.2, plv8 doesn't include plcoffee and plls extensions +ENV PLV8_TAG=v3.2.3 + +RUN case "${PG_VERSION}" in \ + "v17") \ + export PLV8_TAG=v3.2.3 \ + ;; \ + "v14" | "v15" | "v16") \ + export PLV8_TAG=v3.1.10 \ + ;; \ + *) \ + echo "unexpected PostgreSQL version" && exit 1 \ + ;; \ esac && \ - wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.10.tar.gz -O plv8.tar.gz && \ - echo "7096c3290928561f0d4901b7a52794295dc47f6303102fae3f8e42dd575ad97d plv8.tar.gz" | sha256sum --check && \ - mkdir plv8-src && cd plv8-src && tar xzf ../plv8.tar.gz --strip-components=1 -C . && \ + git clone --recurse-submodules --depth 1 --branch ${PLV8_TAG} https://github.com/plv8/plv8.git plv8-src && \ + tar -czf plv8.tar.gz --exclude .git plv8-src && \ + cd plv8-src && \ # generate and copy upgrade scripts mkdir -p upgrade && ./generate_upgrade.sh 3.1.10 && \ cp upgrade/* /usr/local/pgsql/share/extension/ && \ @@ -248,8 +264,17 @@ RUN case "${PG_VERSION}" in "v17") \ find /usr/local/pgsql/ -name "plv8-*.so" | xargs strip && \ # don't break computes with installed old version of plv8 cd /usr/local/pgsql/lib/ && \ - ln -s plv8-3.1.10.so plv8-3.1.5.so && \ - ln -s plv8-3.1.10.so plv8-3.1.8.so && \ + case "${PG_VERSION}" in \ + "v17") \ + ln -s plv8-3.2.3.so plv8-3.1.8.so && \ + ln -s plv8-3.2.3.so plv8-3.1.5.so && \ + ln -s plv8-3.2.3.so plv8-3.1.10.so \ + ;; \ + "v14" | "v15" | "v16") \ + ln -s plv8-3.1.10.so plv8-3.1.5.so && \ + ln -s plv8-3.1.10.so plv8-3.1.8.so \ + ;; \ + esac && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/plv8.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/plcoffee.control && \ echo 'trusted = true' >> 
/usr/local/pgsql/share/extension/plls.control @@ -327,6 +352,9 @@ COPY compute/patches/pgvector.patch /pgvector.patch # By default, pgvector Makefile uses `-march=native`. We don't want that, # because we build the images on different machines than where we run them. # Pass OPTFLAGS="" to remove it. +# +# v17 is not supported yet because of upstream issue +# https://github.com/pgvector/pgvector/issues/669 RUN case "${PG_VERSION}" in "v17") \ echo "v17 extensions are not supported yet. Quit" && exit 0;; \ esac && \ @@ -366,11 +394,10 @@ FROM build-deps AS hypopg-pg-build ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ - esac && \ - wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.4.0.tar.gz -O hypopg.tar.gz && \ - echo "0821011743083226fc9b813c1f2ef5897a91901b57b6bea85a78e466187c6819 hypopg.tar.gz" | sha256sum --check && \ +# HypoPG 1.4.1 supports v17 +# last release 1.4.1 - Apr 28, 2024 +RUN wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.4.1.tar.gz -O hypopg.tar.gz && \ + echo "9afe6357fd389d8d33fad81703038ce520b09275ec00153c6c89282bcdedd6bc hypopg.tar.gz" | sha256sum --check && \ mkdir hypopg-src && cd hypopg-src && tar xzf ../hypopg.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ @@ -407,6 +434,9 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY compute/patches/rum.patch /rum.patch +# maybe version-specific +# support for v17 is unknown +# last release 1.3.13 - Sep 19, 2022 RUN case "${PG_VERSION}" in "v17") \ echo "v17 extensions are not supported yet. 
Quit" && exit 0;; \ esac && \ @@ -428,11 +458,10 @@ FROM build-deps AS pgtap-pg-build ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ - esac && \ - wget https://github.com/theory/pgtap/archive/refs/tags/v1.2.0.tar.gz -O pgtap.tar.gz && \ - echo "9c7c3de67ea41638e14f06da5da57bac6f5bd03fea05c165a0ec862205a5c052 pgtap.tar.gz" | sha256sum --check && \ +# pgtap 1.3.3 supports v17 +# last release v1.3.3 - Apr 8, 2024 +RUN wget https://github.com/theory/pgtap/archive/refs/tags/v1.3.3.tar.gz -O pgtap.tar.gz && \ + echo "325ea79d0d2515bce96bce43f6823dcd3effbd6c54cb2a4d6c2384fffa3a14c7 pgtap.tar.gz" | sha256sum --check && \ mkdir pgtap-src && cd pgtap-src && tar xzf ../pgtap.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ @@ -505,11 +534,10 @@ FROM build-deps AS plpgsql-check-pg-build ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ - esac && \ - wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.5.3.tar.gz -O plpgsql_check.tar.gz && \ - echo "6631ec3e7fb3769eaaf56e3dfedb829aa761abf163d13dba354b4c218508e1c0 plpgsql_check.tar.gz" | sha256sum --check && \ +# plpgsql_check v2.7.11 supports v17 +# last release v2.7.11 - Sep 16, 2024 +RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.7.11.tar.gz -O plpgsql_check.tar.gz && \ + echo "208933f8dbe8e0d2628eb3851e9f52e6892b8e280c63700c0f1ce7883625d172 plpgsql_check.tar.gz" | sha256sum --check && \ mkdir plpgsql_check-src && cd plpgsql_check-src && tar xzf ../plpgsql_check.tar.gz --strip-components=1 -C . 
&& \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ @@ -527,18 +555,19 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ARG PG_VERSION ENV PATH="/usr/local/pgsql/bin:$PATH" -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ - esac && \ - case "${PG_VERSION}" in \ +RUN case "${PG_VERSION}" in \ "v14" | "v15") \ export TIMESCALEDB_VERSION=2.10.1 \ export TIMESCALEDB_CHECKSUM=6fca72a6ed0f6d32d2b3523951ede73dc5f9b0077b38450a029a5f411fdb8c73 \ ;; \ - *) \ + "v16") \ export TIMESCALEDB_VERSION=2.13.0 \ export TIMESCALEDB_CHECKSUM=584a351c7775f0e067eaa0e7277ea88cab9077cc4c455cbbf09a5d9723dce95d \ ;; \ + "v17") \ + export TIMESCALEDB_VERSION=2.17.0 \ + export TIMESCALEDB_CHECKSUM=155bf64391d3558c42f31ca0e523cfc6252921974f75298c9039ccad1c89811a \ + ;; \ esac && \ wget https://github.com/timescale/timescaledb/archive/refs/tags/${TIMESCALEDB_VERSION}.tar.gz -O timescaledb.tar.gz && \ echo "${TIMESCALEDB_CHECKSUM} timescaledb.tar.gz" | sha256sum --check && \ @@ -561,10 +590,8 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ARG PG_VERSION ENV PATH="/usr/local/pgsql/bin:$PATH" -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. 
Quit" && exit 0;; \ - esac && \ - case "${PG_VERSION}" in \ +# version-specific, has separate releases for each version +RUN case "${PG_VERSION}" in \ "v14") \ export PG_HINT_PLAN_VERSION=14_1_4_1 \ export PG_HINT_PLAN_CHECKSUM=c3501becf70ead27f70626bce80ea401ceac6a77e2083ee5f3ff1f1444ec1ad1 \ @@ -578,7 +605,8 @@ RUN case "${PG_VERSION}" in "v17") \ export PG_HINT_PLAN_CHECKSUM=fc85a9212e7d2819d4ae4ac75817481101833c3cfa9f0fe1f980984e12347d00 \ ;; \ "v17") \ - echo "TODO: PG17 pg_hint_plan support" && exit 0 \ + export PG_HINT_PLAN_VERSION=17_1_7_0 \ + export PG_HINT_PLAN_CHECKSUM=06dd306328c67a4248f48403c50444f30959fb61ebe963248dbc2afb396fe600 \ ;; \ *) \ echo "Export the valid PG_HINT_PLAN_VERSION variable" && exit 1 \ @@ -602,6 +630,10 @@ FROM build-deps AS pg-cron-pg-build ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ +# 1.6.4 available, supports v17 +# This is an experimental extension that we do not support on prod yet. +# !Do not remove! +# We set it in shared_preload_libraries and computes will fail to start if library is not found. ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN case "${PG_VERSION}" in "v17") \ echo "v17 extensions are not supported yet. Quit" && exit 0;; \ @@ -623,23 +655,37 @@ FROM build-deps AS rdkit-pg-build ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. 
Quit" && exit 0;; \ - esac && \ - apt-get update && \ +RUN apt-get update && \ apt-get install --no-install-recommends -y \ libboost-iostreams1.74-dev \ libboost-regex1.74-dev \ libboost-serialization1.74-dev \ libboost-system1.74-dev \ - libeigen3-dev + libeigen3-dev \ + libboost-all-dev +# rdkit Release_2024_09_1 supports v17 +# last release Release_2024_09_1 - Sep 27, 2024 +# +# Use new version only for v17 +# because Release_2024_09_1 has some backward incompatible changes +# https://github.com/rdkit/rdkit/releases/tag/Release_2024_09_1 ENV PATH="/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH" -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ +RUN case "${PG_VERSION}" in \ + "v17") \ + export RDKIT_VERSION=Release_2024_09_1 \ + export RDKIT_CHECKSUM=034c00d6e9de323506834da03400761ed8c3721095114369d06805409747a60f \ + ;; \ + "v14" | "v15" | "v16") \ + export RDKIT_VERSION=Release_2023_03_3 \ + export RDKIT_CHECKSUM=bdbf9a2e6988526bfeb8c56ce3cdfe2998d60ac289078e2215374288185e8c8d \ + ;; \ + *) \ + echo "unexpected PostgreSQL version" && exit 1 \ + ;; \ esac && \ - wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar.gz -O rdkit.tar.gz && \ - echo "bdbf9a2e6988526bfeb8c56ce3cdfe2998d60ac289078e2215374288185e8c8d rdkit.tar.gz" | sha256sum --check && \ + wget https://github.com/rdkit/rdkit/archive/refs/tags/${RDKIT_VERSION}.tar.gz -O rdkit.tar.gz && \ + echo "${RDKIT_CHECKSUM} rdkit.tar.gz" | sha256sum --check && \ mkdir rdkit-src && cd rdkit-src && tar xzf ../rdkit.tar.gz --strip-components=1 -C . && \ cmake \ -D RDK_BUILD_CAIRO_SUPPORT=OFF \ @@ -678,12 +724,11 @@ FROM build-deps AS pg-uuidv7-pg-build ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ +# not version-specific +# last release v1.6.0 - Oct 9, 2024 ENV PATH="/usr/local/pgsql/bin/:$PATH" -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. 
Quit" && exit 0;; \ - esac && \ - wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.0.1.tar.gz -O pg_uuidv7.tar.gz && \ - echo "0d0759ab01b7fb23851ecffb0bce27822e1868a4a5819bfd276101c716637a7a pg_uuidv7.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.6.0.tar.gz -O pg_uuidv7.tar.gz && \ + echo "0fa6c710929d003f6ce276a7de7a864e9d1667b2d78be3dc2c07f2409eb55867 pg_uuidv7.tar.gz" | sha256sum --check && \ mkdir pg_uuidv7-src && cd pg_uuidv7-src && tar xzf ../pg_uuidv7.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ @@ -754,6 +799,8 @@ RUN case "${PG_VERSION}" in \ FROM build-deps AS pg-embedding-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ +# This is our extension, support stopped in favor of pgvector +# TODO: deprecate it ARG PG_VERSION ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN case "${PG_VERSION}" in \ @@ -780,6 +827,8 @@ FROM build-deps AS pg-anon-pg-build ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ +# This is an experimental extension, never got to real production. +# !Do not remove! It can be present in shared_preload_libraries and compute will fail to start if library is not found. 
ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN case "${PG_VERSION}" in "v17") \ echo "postgresql_anonymizer does not yet support PG17" && exit 0;; \ @@ -946,13 +995,12 @@ FROM build-deps AS wal2json-pg-build ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ +# wal2json wal2json_2_6 supports v17 +# last release wal2json_2_6 - Apr 25, 2024 ENV PATH="/usr/local/pgsql/bin/:$PATH" -RUN case "${PG_VERSION}" in "v17") \ - echo "We'll need to update wal2json to 2.6+ for pg17 support" && exit 0;; \ - esac && \ - wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar.gz && \ - echo "b516653575541cf221b99cf3f8be9b6821f6dbcfc125675c85f35090f824f00e wal2json_2_5.tar.gz" | sha256sum --check && \ - mkdir wal2json-src && cd wal2json-src && tar xzf ../wal2json_2_5.tar.gz --strip-components=1 -C . && \ +RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_6.tar.gz -O wal2json.tar.gz && \ + echo "18b4bdec28c74a8fc98a11c72de38378a760327ef8e5e42e975b0029eb96ba0d wal2json.tar.gz" | sha256sum --check && \ + mkdir wal2json-src && cd wal2json-src && tar xzf ../wal2json.tar.gz --strip-components=1 -C . 
&& \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install @@ -966,12 +1014,11 @@ FROM build-deps AS pg-ivm-build ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ +# pg_ivm v1.9 supports v17 +# last release v1.9 - Jul 31 ENV PATH="/usr/local/pgsql/bin/:$PATH" -RUN case "${PG_VERSION}" in "v17") \ - echo "We'll need to update pg_ivm to 1.9+ for pg17 support" && exit 0;; \ - esac && \ - wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.7.tar.gz -O pg_ivm.tar.gz && \ - echo "ebfde04f99203c7be4b0e873f91104090e2e83e5429c32ac242d00f334224d5e pg_ivm.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.9.tar.gz -O pg_ivm.tar.gz && \ + echo "59e15722939f274650abf637f315dd723c87073496ca77236b044cb205270d8b pg_ivm.tar.gz" | sha256sum --check && \ mkdir pg_ivm-src && cd pg_ivm-src && tar xzf ../pg_ivm.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ @@ -987,12 +1034,11 @@ FROM build-deps AS pg-partman-build ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ +# should support v17 https://github.com/pgpartman/pg_partman/discussions/693 +# last release 5.1.0 Apr 2, 2024 ENV PATH="/usr/local/pgsql/bin/:$PATH" -RUN case "${PG_VERSION}" in "v17") \ - echo "pg_partman doesn't support PG17 yet" && exit 0;; \ - esac && \ - wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.0.1.tar.gz -O pg_partman.tar.gz && \ - echo "75b541733a9659a6c90dbd40fccb904a630a32880a6e3044d0c4c5f4c8a65525 pg_partman.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.1.0.tar.gz -O pg_partman.tar.gz && \ + echo "3e3a27d7ff827295d5c55ef72f07a49062d6204b3cb0b9a048645d6db9f3cb9f pg_partman.tar.gz" | sha256sum --check && \ mkdir pg_partman-src && cd pg_partman-src && tar xzf ../pg_partman.tar.gz --strip-components=1 -C . 
&& \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ From 55b246085ea30341f2479ecfadff374a5487e74d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Wed, 16 Oct 2024 16:47:17 +0200 Subject: [PATCH 41/48] Activate timelines during unoffload (#9399) The current code has forgotten to activate timelines during unoffload, leading to inability to receive the basebackup, due to the timeline still being in loading state. ``` stderr: command failed: compute startup failed: failed to get basebackup@0/0 from pageserver postgresql://no_user@localhost:15014 Caused by: 0: db error: ERROR: Not found: Timeline 508546c79b2b16a84ab609fdf966e0d3/bfc18c24c4b837ecae5dbb5216c80fce is not active, state: Loading 1: ERROR: Not found: Timeline 508546c79b2b16a84ab609fdf966e0d3/bfc18c24c4b837ecae5dbb5216c80fce is not active, state: Loading ``` Therefore, also activate the timeline during unoffloading. Part of #8088 --- pageserver/src/http/routes.rs | 7 +++- pageserver/src/tenant.rs | 40 +++++++++++++------- test_runner/regress/test_timeline_archive.py | 17 +++++++++ 3 files changed, 50 insertions(+), 14 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index dd403c1cef..36a6ed427b 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -720,7 +720,12 @@ async fn timeline_archival_config_handler( tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; tenant - .apply_timeline_archival_config(timeline_id, request_data.state, ctx) + .apply_timeline_archival_config( + timeline_id, + request_data.state, + state.broker_client.clone(), + ctx, + ) .await?; Ok::<_, ApiError>(()) } diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 20925c7fd6..689982ddd4 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -1554,6 +1554,7 @@ impl Tenant { async fn unoffload_timeline( self: &Arc, timeline_id: TimelineId, + broker_client: 
storage_broker::BrokerClientChannel, ctx: RequestContext, ) -> Result, TimelineArchivalError> { info!("unoffloading timeline"); @@ -1605,25 +1606,37 @@ impl Tenant { }) .map_err(TimelineArchivalError::Other)?; let timelines = self.timelines.lock().unwrap(); - if let Some(timeline) = timelines.get(&timeline_id) { - let mut offloaded_timelines = self.timelines_offloaded.lock().unwrap(); - if offloaded_timelines.remove(&timeline_id).is_none() { - warn!("timeline already removed from offloaded timelines"); - } - info!("timeline unoffloading complete"); - Ok(Arc::clone(timeline)) - } else { + let Some(timeline) = timelines.get(&timeline_id) else { warn!("timeline not available directly after attach"); - Err(TimelineArchivalError::Other(anyhow::anyhow!( + return Err(TimelineArchivalError::Other(anyhow::anyhow!( "timeline not available directly after attach" - ))) + ))); + }; + let mut offloaded_timelines = self.timelines_offloaded.lock().unwrap(); + if offloaded_timelines.remove(&timeline_id).is_none() { + warn!("timeline already removed from offloaded timelines"); } + + // Activate the timeline (if it makes sense) + if !(timeline.is_broken() || timeline.is_stopping()) { + let background_jobs_can_start = None; + timeline.activate( + self.clone(), + broker_client.clone(), + background_jobs_can_start, + &ctx, + ); + } + + info!("timeline unoffloading complete"); + Ok(Arc::clone(timeline)) } pub(crate) async fn apply_timeline_archival_config( self: &Arc, timeline_id: TimelineId, new_state: TimelineArchivalState, + broker_client: storage_broker::BrokerClientChannel, ctx: RequestContext, ) -> Result<(), TimelineArchivalError> { info!("setting timeline archival config"); @@ -1664,12 +1677,13 @@ impl Tenant { Some(Arc::clone(timeline)) }; - // Second part: unarchive timeline (if needed) + // Second part: unoffload timeline (if needed) let timeline = if let Some(timeline) = timeline_or_unarchive_offloaded { timeline } else { // Turn offloaded timeline into a non-offloaded one - 
self.unoffload_timeline(timeline_id, ctx).await? + self.unoffload_timeline(timeline_id, broker_client, ctx) + .await? }; // Third part: upload new timeline archival state and block until it is present in S3 @@ -3354,7 +3368,7 @@ impl Tenant { /// Populate all Timelines' `GcInfo` with information about their children. We do not set the /// PITR cutoffs here, because that requires I/O: this is done later, before GC, by [`Self::refresh_gc_info_internal`] /// - /// Subsequently, parent-child relationships are updated incrementally during timeline creation/deletion. + /// Subsequently, parent-child relationships are updated incrementally inside [`Timeline::new`] and [`Timeline::drop`]. fn initialize_gc_info( &self, timelines: &std::sync::MutexGuard>>, diff --git a/test_runner/regress/test_timeline_archive.py b/test_runner/regress/test_timeline_archive.py index 971cc57a1c..ffaed5e130 100644 --- a/test_runner/regress/test_timeline_archive.py +++ b/test_runner/regress/test_timeline_archive.py @@ -136,6 +136,17 @@ def test_timeline_offloading(neon_env_builder: NeonEnvBuilder, manual_offload: b "test_ancestor_branch_archive_branch1", tenant_id, "test_ancestor_branch_archive_parent" ) + with env.endpoints.create_start( + "test_ancestor_branch_archive_branch1", tenant_id=tenant_id + ) as endpoint: + endpoint.safe_psql_many( + [ + "CREATE TABLE foo(key serial primary key, t text default 'data_content')", + "INSERT INTO foo SELECT FROM generate_series(1,1000)", + ] + ) + sum = endpoint.safe_psql("SELECT sum(key) from foo where key > 50") + ps_http.timeline_archival_config( tenant_id, leaf_timeline_id, @@ -197,4 +208,10 @@ def test_timeline_offloading(neon_env_builder: NeonEnvBuilder, manual_offload: b ) assert leaf_detail["is_archived"] is False + with env.endpoints.create_start( + "test_ancestor_branch_archive_branch1", tenant_id=tenant_id + ) as endpoint: + sum_again = endpoint.safe_psql("SELECT sum(key) from foo where key > 50") + assert sum == sum_again + assert not 
timeline_offloaded(initial_timeline_id) From 8a114e3aeda7a2e321fa4524335c1748448cae07 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Wed, 16 Oct 2024 11:19:45 -0400 Subject: [PATCH 42/48] refactor(pageserver): upgrade remote_storage to use hyper1 (#9405) part of https://github.com/neondatabase/neon/issues/9255 ## Summary of changes Upgrade remote_storage crate to use hyper1. Hyper0 is used when providing the streaming HTTP body to the s3 SDK, and it is refactored to use hyper1. Signed-off-by: Alex Chi Z --- Cargo.lock | 3 ++- libs/remote_storage/Cargo.toml | 3 ++- libs/remote_storage/src/s3_bucket.rs | 8 +++++--- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 7e772814ec..6b212bac2e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4648,9 +4648,10 @@ dependencies = [ "camino-tempfile", "futures", "futures-util", + "http-body-util", "http-types", "humantime-serde", - "hyper 0.14.30", + "hyper 1.4.1", "itertools 0.10.5", "metrics", "once_cell", diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml index be4d61f009..1816825bda 100644 --- a/libs/remote_storage/Cargo.toml +++ b/libs/remote_storage/Cargo.toml @@ -16,7 +16,7 @@ aws-sdk-s3.workspace = true bytes.workspace = true camino = { workspace = true, features = ["serde1"] } humantime-serde.workspace = true -hyper0 = { workspace = true, features = ["stream"] } +hyper = { workspace = true, features = ["client"] } futures.workspace = true serde.workspace = true serde_json.workspace = true @@ -36,6 +36,7 @@ azure_storage.workspace = true azure_storage_blobs.workspace = true futures-util.workspace = true http-types.workspace = true +http-body-util.workspace = true itertools.workspace = true sync_wrapper = { workspace = true, features = ["futures"] } diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index f950f2886c..cde32df402 100644 --- a/libs/remote_storage/src/s3_bucket.rs 
+++ b/libs/remote_storage/src/s3_bucket.rs @@ -28,13 +28,15 @@ use aws_sdk_s3::{ Client, }; use aws_smithy_async::rt::sleep::TokioSleep; +use http_body_util::StreamBody; use http_types::StatusCode; use aws_smithy_types::{body::SdkBody, DateTime}; use aws_smithy_types::{byte_stream::ByteStream, date_time::ConversionError}; use bytes::Bytes; use futures::stream::Stream; -use hyper0::Body; +use futures_util::StreamExt; +use hyper::body::Frame; use scopeguard::ScopeGuard; use tokio_util::sync::CancellationToken; use utils::backoff; @@ -710,8 +712,8 @@ impl RemoteStorage for S3Bucket { let started_at = start_measuring_requests(kind); - let body = Body::wrap_stream(from); - let bytes_stream = ByteStream::new(SdkBody::from_body_0_4(body)); + let body = StreamBody::new(from.map(|x| x.map(Frame::data))); + let bytes_stream = ByteStream::new(SdkBody::from_body_1_x(body)); let upload = self .client From ed694732e707b15592991902c89f5078935ec177 Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Wed, 16 Oct 2024 19:10:49 +0200 Subject: [PATCH 43/48] proxy: merge AuthError and AuthErrorImpl (#9418) Since GetAuthInfoError now boxes the ControlPlaneError message the variant is not big anymore and AuthError is 32 bytes. 
--- proxy/src/auth/flow.rs | 10 +++--- proxy/src/auth/mod.rs | 78 ++++++++++++++++++------------------------ 2 files changed, 39 insertions(+), 49 deletions(-) diff --git a/proxy/src/auth/flow.rs b/proxy/src/auth/flow.rs index ccb17b66b9..6294549ff6 100644 --- a/proxy/src/auth/flow.rs +++ b/proxy/src/auth/flow.rs @@ -9,7 +9,7 @@ use tokio::io::{AsyncRead, AsyncWrite}; use tracing::info; use super::backend::ComputeCredentialKeys; -use super::{AuthErrorImpl, PasswordHackPayload}; +use super::{AuthError, PasswordHackPayload}; use crate::config::TlsServerEndPoint; use crate::context::RequestMonitoring; use crate::control_plane::AuthSecret; @@ -117,14 +117,14 @@ impl AuthFlow<'_, S, PasswordHack> { let msg = self.stream.read_password_message().await?; let password = msg .strip_suffix(&[0]) - .ok_or(AuthErrorImpl::MalformedPassword("missing terminator"))?; + .ok_or(AuthError::MalformedPassword("missing terminator"))?; let payload = PasswordHackPayload::parse(password) // If we ended up here and the payload is malformed, it means that // the user neither enabled SNI nor resorted to any other method // for passing the project name we rely on. We should show them // the most helpful error message and point to the documentation. - .ok_or(AuthErrorImpl::MissingEndpointName)?; + .ok_or(AuthError::MissingEndpointName)?; Ok(payload) } @@ -136,7 +136,7 @@ impl AuthFlow<'_, S, CleartextPassword> { let msg = self.stream.read_password_message().await?; let password = msg .strip_suffix(&[0]) - .ok_or(AuthErrorImpl::MalformedPassword("missing terminator"))?; + .ok_or(AuthError::MalformedPassword("missing terminator"))?; let outcome = validate_password_and_exchange( &self.state.pool, @@ -166,7 +166,7 @@ impl AuthFlow<'_, S, Scram<'_>> { // Initial client message contains the chosen auth method's name. 
let msg = self.stream.read_password_message().await?; let sasl = sasl::FirstMessage::parse(&msg) - .ok_or(AuthErrorImpl::MalformedPassword("bad sasl message"))?; + .ok_or(AuthError::MalformedPassword("bad sasl message"))?; // Currently, the only supported SASL method is SCRAM. if !scram::METHODS.contains(&sasl.method) { diff --git a/proxy/src/auth/mod.rs b/proxy/src/auth/mod.rs index ff97e6c35d..7a373dd825 100644 --- a/proxy/src/auth/mod.rs +++ b/proxy/src/auth/mod.rs @@ -29,7 +29,7 @@ pub(crate) type Result = std::result::Result; /// Common authentication error. #[derive(Debug, Error)] -pub(crate) enum AuthErrorImpl { +pub(crate) enum AuthError { #[error(transparent)] Web(#[from] backend::WebAuthError), @@ -78,80 +78,70 @@ pub(crate) enum AuthErrorImpl { ConfirmationTimeout(humantime::Duration), } -#[derive(Debug, Error)] -#[error(transparent)] -pub(crate) struct AuthError(Box); - impl AuthError { pub(crate) fn bad_auth_method(name: impl Into>) -> Self { - AuthErrorImpl::BadAuthMethod(name.into()).into() + AuthError::BadAuthMethod(name.into()) } pub(crate) fn auth_failed(user: impl Into>) -> Self { - AuthErrorImpl::AuthFailed(user.into()).into() + AuthError::AuthFailed(user.into()) } pub(crate) fn ip_address_not_allowed(ip: IpAddr) -> Self { - AuthErrorImpl::IpAddressNotAllowed(ip).into() + AuthError::IpAddressNotAllowed(ip) } pub(crate) fn too_many_connections() -> Self { - AuthErrorImpl::TooManyConnections.into() + AuthError::TooManyConnections } pub(crate) fn is_auth_failed(&self) -> bool { - matches!(self.0.as_ref(), AuthErrorImpl::AuthFailed(_)) + matches!(self, AuthError::AuthFailed(_)) } pub(crate) fn user_timeout(elapsed: Elapsed) -> Self { - AuthErrorImpl::UserTimeout(elapsed).into() + AuthError::UserTimeout(elapsed) } pub(crate) fn confirmation_timeout(timeout: humantime::Duration) -> Self { - AuthErrorImpl::ConfirmationTimeout(timeout).into() - } -} - -impl> From for AuthError { - fn from(e: E) -> Self { - Self(Box::new(e.into())) + 
AuthError::ConfirmationTimeout(timeout) } } impl UserFacingError for AuthError { fn to_string_client(&self) -> String { - match self.0.as_ref() { - AuthErrorImpl::Web(e) => e.to_string_client(), - AuthErrorImpl::GetAuthInfo(e) => e.to_string_client(), - AuthErrorImpl::Sasl(e) => e.to_string_client(), - AuthErrorImpl::AuthFailed(_) => self.to_string(), - AuthErrorImpl::BadAuthMethod(_) => self.to_string(), - AuthErrorImpl::MalformedPassword(_) => self.to_string(), - AuthErrorImpl::MissingEndpointName => self.to_string(), - AuthErrorImpl::Io(_) => "Internal error".to_string(), - AuthErrorImpl::IpAddressNotAllowed(_) => self.to_string(), - AuthErrorImpl::TooManyConnections => self.to_string(), - AuthErrorImpl::UserTimeout(_) => self.to_string(), - AuthErrorImpl::ConfirmationTimeout(_) => self.to_string(), + match self { + Self::Web(e) => e.to_string_client(), + Self::GetAuthInfo(e) => e.to_string_client(), + Self::Sasl(e) => e.to_string_client(), + Self::AuthFailed(_) => self.to_string(), + Self::BadAuthMethod(_) => self.to_string(), + Self::MalformedPassword(_) => self.to_string(), + Self::MissingEndpointName => self.to_string(), + Self::Io(_) => "Internal error".to_string(), + Self::IpAddressNotAllowed(_) => self.to_string(), + Self::TooManyConnections => self.to_string(), + Self::UserTimeout(_) => self.to_string(), + Self::ConfirmationTimeout(_) => self.to_string(), } } } impl ReportableError for AuthError { fn get_error_kind(&self) -> crate::error::ErrorKind { - match self.0.as_ref() { - AuthErrorImpl::Web(e) => e.get_error_kind(), - AuthErrorImpl::GetAuthInfo(e) => e.get_error_kind(), - AuthErrorImpl::Sasl(e) => e.get_error_kind(), - AuthErrorImpl::AuthFailed(_) => crate::error::ErrorKind::User, - AuthErrorImpl::BadAuthMethod(_) => crate::error::ErrorKind::User, - AuthErrorImpl::MalformedPassword(_) => crate::error::ErrorKind::User, - AuthErrorImpl::MissingEndpointName => crate::error::ErrorKind::User, - AuthErrorImpl::Io(_) => 
crate::error::ErrorKind::ClientDisconnect, - AuthErrorImpl::IpAddressNotAllowed(_) => crate::error::ErrorKind::User, - AuthErrorImpl::TooManyConnections => crate::error::ErrorKind::RateLimit, - AuthErrorImpl::UserTimeout(_) => crate::error::ErrorKind::User, - AuthErrorImpl::ConfirmationTimeout(_) => crate::error::ErrorKind::User, + match self { + Self::Web(e) => e.get_error_kind(), + Self::GetAuthInfo(e) => e.get_error_kind(), + Self::Sasl(e) => e.get_error_kind(), + Self::AuthFailed(_) => crate::error::ErrorKind::User, + Self::BadAuthMethod(_) => crate::error::ErrorKind::User, + Self::MalformedPassword(_) => crate::error::ErrorKind::User, + Self::MissingEndpointName => crate::error::ErrorKind::User, + Self::Io(_) => crate::error::ErrorKind::ClientDisconnect, + Self::IpAddressNotAllowed(_) => crate::error::ErrorKind::User, + Self::TooManyConnections => crate::error::ErrorKind::RateLimit, + Self::UserTimeout(_) => crate::error::ErrorKind::User, + Self::ConfirmationTimeout(_) => crate::error::ErrorKind::User, } } } From 0551cfb6a74258537255af18428b0345f24f2702 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Wed, 16 Oct 2024 20:04:56 +0200 Subject: [PATCH 44/48] Fix beta clippy warnings (#9419) ``` warning: first doc comment paragraph is too long --> compute_tools/src/installed_extensions.rs:35:1 | 35 | / /// Connect to every database (see list_dbs above) and get the list of installed extensions. 36 | | /// Same extension can be installed in multiple databases with different versions, 37 | | /// we only keep the highest and lowest version across all databases. | |_ | = help: for further information visit https://rust-lang.github.io/rust-clippy/master/index.html#too_long_first_doc_paragraph = note: `#[warn(clippy::too_long_first_doc_paragraph)]` on by default help: add an empty line | 35 ~ /// Connect to every database (see list_dbs above) and get the list of installed extensions. 
36 + /// | ``` --- compute_tools/src/installed_extensions.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/compute_tools/src/installed_extensions.rs b/compute_tools/src/installed_extensions.rs index 3d8b22a8a3..72578b1f34 100644 --- a/compute_tools/src/installed_extensions.rs +++ b/compute_tools/src/installed_extensions.rs @@ -33,6 +33,7 @@ fn list_dbs(client: &mut Client) -> Result> { } /// Connect to every database (see list_dbs above) and get the list of installed extensions. +/// /// Same extension can be installed in multiple databases with different versions, /// we only keep the highest and lowest version across all databases. pub async fn get_installed_extensions(connstr: Url) -> Result { From 409a286eaa6f030494c8914fcaa36dcc7d6496d1 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Wed, 16 Oct 2024 13:08:40 -0500 Subject: [PATCH 45/48] Fix typo in sql_exporter generator Bad copy-paste seemingly. This manifested itself as a failure to start for the sql_exporter, and was just dying on loop in staging. A future PR will have E2E testing of sql_exporter. Signed-off-by: Tristan Partin --- compute/etc/sql_exporter.jsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compute/etc/sql_exporter.jsonnet b/compute/etc/sql_exporter.jsonnet index 1e3665ac47..640e2ac38d 100644 --- a/compute/etc/sql_exporter.jsonnet +++ b/compute/etc/sql_exporter.jsonnet @@ -28,7 +28,7 @@ function(collector_file, application_name='sql_exporter') { // Collectors (referenced by name) to execute on the target. // Glob patterns are supported (see for syntax). collectors: [ - 'neon_collector_autoscaling', + 'neon_collector', ], }, From e0fa6bcf1a9a33929cfcfd0cefada739a8fe6fea Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Wed, 16 Oct 2024 14:46:33 -0500 Subject: [PATCH 46/48] Fix some sql_exporter metrics for PG 17 Checkpointer related statistics moved from pg_stat_bgwriter to pg_stat_checkpointer, so we need to adjust our queries accordingly. 
Signed-off-by: Tristan Partin --- compute/Dockerfile.compute-node | 3 ++- compute/Makefile | 6 ++++-- compute/etc/sql_exporter/checkpoints_req.17.sql | 1 + .../etc/sql_exporter/checkpoints_req.libsonnet | 7 ++++++- .../etc/sql_exporter/checkpoints_timed.17.sql | 1 + .../etc/sql_exporter/checkpoints_timed.libsonnet | 7 ++++++- compute/jsonnet/neon.libsonnet | 16 ++++++++++++++++ 7 files changed, 36 insertions(+), 5 deletions(-) create mode 100644 compute/etc/sql_exporter/checkpoints_req.17.sql create mode 100644 compute/etc/sql_exporter/checkpoints_timed.17.sql create mode 100644 compute/jsonnet/neon.libsonnet diff --git a/compute/Dockerfile.compute-node b/compute/Dockerfile.compute-node index f05039f8b7..b0ce7c1718 100644 --- a/compute/Dockerfile.compute-node +++ b/compute/Dockerfile.compute-node @@ -1221,12 +1221,13 @@ RUN rm /usr/local/pgsql/lib/lib*.a # ######################################################################################### FROM $REPOSITORY/$IMAGE:$TAG AS sql_exporter_preprocessor +ARG PG_VERSION USER nonroot COPY --chown=nonroot compute compute -RUN make -C compute +RUN make PG_VERSION="${PG_VERSION}" -C compute ######################################################################################### # diff --git a/compute/Makefile b/compute/Makefile index f8faa882ee..e4f08a223c 100644 --- a/compute/Makefile +++ b/compute/Makefile @@ -6,13 +6,15 @@ jsonnet_files = $(wildcard \ all: neon_collector.yml neon_collector_autoscaling.yml sql_exporter.yml sql_exporter_autoscaling.yml neon_collector.yml: $(jsonnet_files) - JSONNET_PATH=etc jsonnet \ + JSONNET_PATH=jsonnet:etc jsonnet \ --output-file etc/$@ \ + --ext-str pg_version=$(PG_VERSION) \ etc/neon_collector.jsonnet neon_collector_autoscaling.yml: $(jsonnet_files) - JSONNET_PATH=etc jsonnet \ + JSONNET_PATH=jsonnet:etc jsonnet \ --output-file etc/$@ \ + --ext-str pg_version=$(PG_VERSION) \ etc/neon_collector_autoscaling.jsonnet sql_exporter.yml: $(jsonnet_files) diff --git 
a/compute/etc/sql_exporter/checkpoints_req.17.sql b/compute/etc/sql_exporter/checkpoints_req.17.sql new file mode 100644 index 0000000000..a4b946e8e2 --- /dev/null +++ b/compute/etc/sql_exporter/checkpoints_req.17.sql @@ -0,0 +1 @@ +SELECT num_requested AS checkpoints_req FROM pg_stat_checkpointer; diff --git a/compute/etc/sql_exporter/checkpoints_req.libsonnet b/compute/etc/sql_exporter/checkpoints_req.libsonnet index 8697f8af3b..e5d9753507 100644 --- a/compute/etc/sql_exporter/checkpoints_req.libsonnet +++ b/compute/etc/sql_exporter/checkpoints_req.libsonnet @@ -1,3 +1,8 @@ +local neon = import 'neon.libsonnet'; + +local pg_stat_bgwriter = importstr 'sql_exporter/checkpoints_req.sql'; +local pg_stat_checkpointer = importstr 'sql_exporter/checkpoints_req.17.sql'; + { metric_name: 'checkpoints_req', type: 'gauge', @@ -6,5 +11,5 @@ values: [ 'checkpoints_req', ], - query: importstr 'sql_exporter/checkpoints_req.sql', + query: if neon.PG_MAJORVERSION_NUM < 17 then pg_stat_bgwriter else pg_stat_checkpointer, } diff --git a/compute/etc/sql_exporter/checkpoints_timed.17.sql b/compute/etc/sql_exporter/checkpoints_timed.17.sql new file mode 100644 index 0000000000..0d86ddb3ea --- /dev/null +++ b/compute/etc/sql_exporter/checkpoints_timed.17.sql @@ -0,0 +1 @@ +SELECT num_timed AS checkpoints_timed FROM pg_stat_checkpointer; diff --git a/compute/etc/sql_exporter/checkpoints_timed.libsonnet b/compute/etc/sql_exporter/checkpoints_timed.libsonnet index 9f0b742400..0ba0080188 100644 --- a/compute/etc/sql_exporter/checkpoints_timed.libsonnet +++ b/compute/etc/sql_exporter/checkpoints_timed.libsonnet @@ -1,3 +1,8 @@ +local neon = import 'neon.libsonnet'; + +local pg_stat_bgwriter = importstr 'sql_exporter/checkpoints_timed.sql'; +local pg_stat_checkpointer = importstr 'sql_exporter/checkpoints_timed.17.sql'; + { metric_name: 'checkpoints_timed', type: 'gauge', @@ -6,5 +11,5 @@ values: [ 'checkpoints_timed', ], - query: importstr 'sql_exporter/checkpoints_timed.sql', + query: if
neon.PG_MAJORVERSION_NUM < 17 then pg_stat_bgwriter else pg_stat_checkpointer, } diff --git a/compute/jsonnet/neon.libsonnet b/compute/jsonnet/neon.libsonnet new file mode 100644 index 0000000000..583b631c58 --- /dev/null +++ b/compute/jsonnet/neon.libsonnet @@ -0,0 +1,16 @@ +local MIN_SUPPORTED_VERSION = 14; +local MAX_SUPPORTED_VERSION = 17; +local SUPPORTED_VERSIONS = std.range(MIN_SUPPORTED_VERSION, MAX_SUPPORTED_VERSION); + +# If we receive the pg_version with a leading "v", ditch it. +local pg_version = std.strReplace(std.extVar('pg_version'), 'v', ''); +local pg_version_num = std.parseInt(pg_version); + +assert std.setMember(pg_version_num, SUPPORTED_VERSIONS) : + std.format('%s is an unsupported Postgres version: %s', + [pg_version, std.toString(SUPPORTED_VERSIONS)]); + +{ + PG_MAJORVERSION: pg_version, + PG_MAJORVERSION_NUM: pg_version_num, +} From 67d5d98b1960c7f7b88d1f9860cd9672411cb815 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Wed, 16 Oct 2024 21:47:53 +0200 Subject: [PATCH 47/48] readme: fix build instructions for debian 12 (#9371) We need libprotobuf-dev for some of the `/usr/include/google/protobuf/...*.proto` referenced by our protobuf decls. 
--- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index cfc63b4708..e68ef70bdf 100644 --- a/README.md +++ b/README.md @@ -31,7 +31,7 @@ See developer documentation in [SUMMARY.md](/docs/SUMMARY.md) for more informati ```bash apt install build-essential libtool libreadline-dev zlib1g-dev flex bison libseccomp-dev \ libssl-dev clang pkg-config libpq-dev cmake postgresql-client protobuf-compiler \ -libcurl4-openssl-dev openssl python3-poetry lsof libicu-dev +libprotobuf-dev libcurl4-openssl-dev openssl python3-poetry lsof libicu-dev ``` * On Fedora, these packages are needed: ```bash From 934dbb61f557477512b3cf5c98e9930e5745d87e Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 17 Oct 2024 08:04:57 +0300 Subject: [PATCH 48/48] Check access_count in lfc_evict (#9407) ## Problem See https://neondb.slack.com/archives/C033A2WE6BZ/p1729007738526309?thread_ts=1722942856.987979&cid=C033A2WE6BZ When a replica receives a WAL record whose target page is not present in shared buffers, we evict this page from the LFC. If all pages from the LFC chunk are evicted, then the chunk is moved to the beginning of the LRU list to force its reuse. Unfortunately, access_count is not checked, and if the entry is being accessed at this moment then this operation can cause LRU list corruption. ## Summary of changes Check `access_count` in `lfc_evict` ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section.
## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist Co-authored-by: Konstantin Knizhnik --- pgxn/neon/file_cache.c | 45 ++++++++++++++++++++++-------------------- 1 file changed, 24 insertions(+), 21 deletions(-) diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index bbea5a8b0d..70b250d394 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -617,31 +617,34 @@ lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno) /* remove the page from the cache */ entry->bitmap[chunk_offs >> 5] &= ~(1 << (chunk_offs & (32 - 1))); - /* - * If the chunk has no live entries, we can position the chunk to be - * recycled first. - */ - if (entry->bitmap[chunk_offs >> 5] == 0) + if (entry->access_count == 0) { - bool has_remaining_pages = false; - - for (int i = 0; i < CHUNK_BITMAP_SIZE; i++) - { - if (entry->bitmap[i] != 0) - { - has_remaining_pages = true; - break; - } - } - /* - * Put the entry at the position that is first to be reclaimed when we - * have no cached pages remaining in the chunk + * If the chunk has no live entries, we can position the chunk to be + * recycled first. */ - if (!has_remaining_pages) + if (entry->bitmap[chunk_offs >> 5] == 0) { - dlist_delete(&entry->list_node); - dlist_push_head(&lfc_ctl->lru, &entry->list_node); + bool has_remaining_pages = false; + + for (int i = 0; i < CHUNK_BITMAP_SIZE; i++) + { + if (entry->bitmap[i] != 0) + { + has_remaining_pages = true; + break; + } + } + + /* + * Put the entry at the position that is first to be reclaimed when we + * have no cached pages remaining in the chunk + */ + if (!has_remaining_pages) + { + dlist_delete(&entry->list_node); + dlist_push_head(&lfc_ctl->lru, &entry->list_node); + } } }